From ea38e55fff639545394e32208a7dabc7e6258166 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Mar 2014 16:06:03 +0100 Subject: [PATCH] [instagram] Add support for user profiles (Fixes #2606) --- test/helper.py | 18 ++++++++ test/test_download.py | 19 --------- test/test_playlists.py | 30 +++++++++++++- youtube_dl/YoutubeDL.py | 16 ++++---- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/instagram.py | 68 +++++++++++++++++++++++++++++++ 6 files changed, 124 insertions(+), 29 deletions(-) diff --git a/test/helper.py b/test/helper.py index 9e255878f..8739f816c 100644 --- a/test/helper.py +++ b/test/helper.py @@ -110,3 +110,21 @@ def expect_info_dict(self, expected_dict, got_dict): self.assertEqual(expected, got, u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + # Check for the presence of mandatory fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) + # Check for mandatory fields that are automatically set by YoutubeDL + for key in ['webpage_url', 'extractor', 'extractor_key']: + self.assertTrue(got_dict.get(key), u'Missing field: %s' % key) + + # Are checkable fields missing from the test case definition? 
+ test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) + for key, value in got_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) + missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) + if missing_keys: + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n') + self.assertFalse( + missing_keys, + 'Missing keys in test definition: %s' % ( + ', '.join(sorted(missing_keys)))) diff --git a/test/test_download.py b/test/test_download.py index f4e5d120e..f171c10ba 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -137,25 +137,6 @@ def generator(test_case): info_dict = json.load(infof) expect_info_dict(self, tc.get('info_dict', {}), info_dict) - - # Check for the presence of mandatory fields - for key in ('id', 'url', 'title', 'ext'): - self.assertTrue(key in info_dict.keys() and info_dict[key]) - # Check for mandatory fields that are automatically set by YoutubeDL - for key in ['webpage_url', 'extractor', 'extractor_key']: - self.assertTrue(info_dict.get(key), u'Missing field: %s' % key) - - # Are checkable fields missing from the test case definition? 
- test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) - for key, value in info_dict.items() - if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) - missing_keys = set(test_info_dict.keys()) - set(tc.get('info_dict', {}).keys()) - if missing_keys: - sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n') - self.assertFalse( - missing_keys, - 'Missing keys in test definition: %s' % ( - ','.join(sorted(missing_keys)))) finally: try_rm_tcs_files() diff --git a/test/test_playlists.py b/test/test_playlists.py index 2b1a7e849..b1e38e7e9 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -9,8 +9,10 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL - +from test.helper import ( + expect_info_dict, + FakeYDL, +) from youtube_dl.extractor import ( AcademicEarthCourseIE, @@ -39,6 +41,7 @@ from youtube_dl.extractor import ( TEDIE, ToypicsUserIE, XTubeUserIE, + InstagramUserIE, ) @@ -287,5 +290,28 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], 'greenshowers') self.assertTrue(len(result['entries']) >= 155) + def test_InstagramUser(self): + dl = FakeYDL() + ie = InstagramUserIE(dl) + result = ie.extract('http://instagram.com/porsche') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'porsche') + self.assertTrue(len(result['entries']) >= 2) + test_video = next( + e for e in result['entries'] + if e['id'] == '614605558512799803_462752227') + dl.add_default_extra_info(test_video, ie, '(irrelevant URL)') + dl.process_video_result(test_video, download=False) + EXPECTED = { + 'id': '614605558512799803_462752227', + 'ext': 'mp4', + 'title': '#Porsche Intelligent Performance.', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Porsche', + 'uploader_id': 'porsche', 
+ } + expect_info_dict(self, EXPECTED, test_video) + + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c5d08b0bb..d18d6dd00 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -512,13 +512,7 @@ class YoutubeDL(object): '_type': 'compat_list', 'entries': ie_result, } - self.add_extra_info(ie_result, - { - 'extractor': ie.IE_NAME, - 'webpage_url': url, - 'webpage_url_basename': url_basename(url), - 'extractor_key': ie.ie_key(), - }) + self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result(ie_result, download, extra_info) else: @@ -537,6 +531,14 @@ class YoutubeDL(object): else: self.report_error('no suitable InfoExtractor for URL %s' % url) + def add_default_extra_info(self, ie_result, ie, url): + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'webpage_url_basename': url_basename(url), + 'extractor_key': ie.ie_key(), + }) + def process_ie_result(self, ie_result, download=True, extra_info={}): """ Take the result of the ie(may be modified) and resolve all unresolved diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b5c8ef682..3e728e876 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,7 +112,7 @@ from .imdb import ( ) from .ina import InaIE from .infoq import InfoQIE -from .instagram import InstagramIE +from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .ivi import ( diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 63141af27..994f0e4ae 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,6 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, +) class InstagramIE(InfoExtractor): @@ -37,3 +40,68 
@@ class InstagramIE(InfoExtractor): 'uploader_id': uploader_id, 'description': desc, } + + +class InstagramUserIE(InfoExtractor): + _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + uploader_id = mobj.group('username') + + entries = [] + page_count = 0 + media_url = 'http://instagram.com/%s/media' % uploader_id + while True: + page = self._download_json( + media_url, uploader_id, + note='Downloading page %d ' % (page_count + 1), + ) + page_count += 1 + + for it in page['items']: + if it.get('type') != 'video': + continue + like_count = int_or_none(it.get('likes', {}).get('count')) + user = it.get('user', {}) + + formats = [{ + 'format_id': k, + 'height': v.get('height'), + 'width': v.get('width'), + 'url': v['url'], + } for k, v in it['videos'].items()] + self._sort_formats(formats) + + thumbnails_el = it.get('images', {}) + thumbnail = thumbnails_el.get('thumbnail', {}).get('url') + + title = it.get('caption', {}).get('text', it['id']) + + entries.append({ + 'id': it['id'], + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'webpage_url': it.get('link'), + 'uploader': user.get('full_name'), + 'uploader_id': user.get('username'), + 'like_count': like_count, + 'upload_timestamp': int_or_none(it.get('created_time')), + }) + + if not page['items']: + break + max_id = page['items'][-1]['id'] + media_url = ( + 'http://instagram.com/%s/media?max_id=%s' % ( + uploader_id, max_id)) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': uploader_id, + 'title': uploader_id, + }