From 29ac31afaf627363fbc1f757aa50078d343acf1f Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 12:25:13 +0800 Subject: [PATCH 01/13] simply get the correct webpage, but not parsed to extract information --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/weibo.py | 97 ++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dl/extractor/weibo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2cc3bc463..12dc2e7e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,6 +1286,7 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) +from .weibo import WeiboIE from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py new file mode 100644 index 000000000..195508e99 --- /dev/null +++ b/youtube_dl/extractor/weibo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from urllib.request import Request +from urllib.parse import urlencode +import json +import random as rnd + +class WeiboIE(InfoExtractor): + _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' + _TEST = { + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + + visitor_url = urlh.geturl() + + data = urlencode({ + "cb": "gen_callback", + "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + }).encode() + headers = { + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } + + r_genvisitor = Request( + 'https://passport.weibo.com/visitor/genvisitor', + data = data, + headers = headers, + method = 'POST' + ) + webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + print("webpage", webpage) + + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + i1 = p.find('{') + i2 = p.rfind('}') + j = p[i1:i2+1] # get JSON object + d = json.loads(j) + tid = d["data"]["tid"] + cnfd = "%03d" % d["data"]["confidence"] + + param = urlencode({ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': rnd.random() + }) + gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param + webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + print("webpage", webpage) + + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + print("webpage", webpage) + + # TODO more code goes here, for example ... + title = self._html_search_regex(r'(.+?)', webpage, 'title') + + video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') + print("video_sources:", video_sources) + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } From 3281af3464a910cb88f22ef0ece4a8323c2a4d38 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 15:56:54 +0800 Subject: [PATCH 02/13] a working version --- youtube_dl/extractor/weibo.py | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 195508e99..9b398e931 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,24 +5,19 @@ from .common import InfoExtractor from urllib.request import Request from urllib.parse import urlencode +from urllib import parse import json import random as rnd +from os import path class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?from=page_1005056275294458_profile&wvr=6&mod=weibotime&type=comment', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', 'info_dict': { - 'id': '42', + 'id': 'Fp6RGfbff', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', } } @@ -78,20 +73,34 @@ class WeiboIE(InfoExtractor): }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - print("webpage", webpage) webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") - print("webpage", webpage) # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources = self._search_regex(r'video-sources=(.+?)', webpage, 'video_sources') - print("video_sources:", video_sources) + video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + + video_formats = parse.parse_qs(video_sources_text) + + formats = [] + supported_resolutions = ['720', '480'] + for res in supported_resolutions: + f = video_formats.get(res) + if isinstance(f, list): + if len(f) > 0: + vid_url = f[0] + print("%s:%s" % (res, vid_url)) + formats.append({ + 'url': vid_url + }) + self._sort_formats(formats) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + print(title, uploader) return { 'id': video_id, 'title': title, - 'description': self._og_search_description(webpage), - 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + 'uploader': uploader, + 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } From 0c69958844a446bc3373f45f8f750cbc3202d14e Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 11 Dec 2017 16:02:14 +0800 Subject: [PATCH 03/13] add other properties; remove print verbose --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 9b398e931..b835f8975 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -52,7 +52,6 @@ class WeiboIE(InfoExtractor): method = 'POST' ) webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - print("webpage", webpage) p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ -90,13 +89,13 @@ class WeiboIE(InfoExtractor): if isinstance(f, list): if len(f) > 0: vid_url = f[0] - print("%s:%s" % (res, vid_url)) formats.append({ - 'url': vid_url + 'url': vid_url, + 'format': 'mp4', + 'height': int(res), }) self._sort_formats(formats) uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) - print(title, uploader) return { 'id': video_id, 'title': title, From 447a5a710dcd05741ea8cefa2fe98b333534e07d Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:18:35 +0800 Subject: [PATCH 04/13] added weibo mobile site support --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/weibo.py | 46 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 12dc2e7e8..f1ea735b5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1286,7 +1286,10 @@ from .webofstories import ( WebOfStoriesIE, WebOfStoriesPlaylistIE, ) -from .weibo import WeiboIE +from .weibo import ( + WeiboIE, + WeiboMobileIE +) from .weiqitv import WeiqiTVIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b835f8975..eda0fa63d 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,6 +9,11 @@ from urllib import parse import json import random as rnd from os import path +import re + +from ..utils import ( + js_to_json, +) class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' @@ -103,3 +108,44 @@ class WeiboIE(InfoExtractor): 'formats': formats # TODO more properties (see youtube_dl/extractor/common.py) } + +class WeiboMobileIE(InfoExtractor): + _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' + _TEST = { + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + headers = { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', + 'Upgrade-Insecure-Requests': '1', + } + # to get Referer url for genvisitor + webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) + page_info = weibo_info['status']['page_info'] + title = weibo_info['status']['status_title'] + format = { + 'url': page_info['media_info']['stream_url'], + 'format': 'mp4', + } + formats = [format] + uploader = weibo_info['status']['user']['screen_name'] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From d2be5bb5af7a1d7108b272315265e103a4358b28 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:28:47 +0800 Subject: [PATCH 05/13] change to use compat urllib --- youtube_dl/extractor/weibo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index eda0fa63d..6a4e0a4cb 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -3,14 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from urllib.request import Request -from urllib.parse import urlencode -from urllib import parse import json import random as rnd from os import path import re +from ..compat import ( + compat_urllib_parse_urlencode as urlencode, + compat_urllib_request as Request, + compat_urlparse as parse, +) from ..utils import ( js_to_json, ) From 951043724f91b3cfce60cf62cc3228a91a04ae81 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:38:51 +0800 Subject: [PATCH 06/13] re-format code to pass flake8 --- youtube_dl/extractor/weibo.py | 103 +++++++++++++++++----------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 6a4e0a4cb..b4ac7b9fa 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -5,7 +5,6 @@ from .common import InfoExtractor import json import random as rnd -from os import path import re from ..compat import ( @@ -17,16 +16,17 @@ from ..utils import ( js_to_json, ) + class WeiboIE(InfoExtractor): _VALID_URL = r'https?://weibo\.com/[0-9]+/(?P[a-zA-Z0-9]+)' _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', - 'info_dict': { - 'id': 'Fp6RGfbff', - 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', - } - } + 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', + 'info_dict': { + 'id': 'Fp6RGfbff', + 'ext': 'mp4', + 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -38,32 +38,32 @@ class WeiboIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") visitor_url = urlh.geturl() data = urlencode({ "cb": "gen_callback", "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() + }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, - } + 'Accept-Encoding': 'gzip, deflate, br', + 'Accept': '*/*', + 'Referer': visitor_url, + } r_genvisitor = Request( 'https://passport.weibo.com/visitor/genvisitor', - data = data, - headers = headers, - method = 'POST' - ) - webpage,urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + data=data, + headers=headers, + method='POST' + ) + webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') i2 = p.rfind('}') - j = p[i1:i2+1] # get JSON object + j = p[i1:i2 + 1] # get JSON object d = json.loads(j) tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] @@ -76,17 +76,17 @@ class WeiboIE(InfoExtractor): 'cb': 'cross_domain', 'from': 'weibo', '_rand': rnd.random() - }) + }) gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param - webpage,urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') - + video_formats = parse.parse_qs(video_sources_text) formats = [] @@ -100,28 +100,29 @@ class WeiboIE(InfoExtractor): 'url': vid_url, 'format': 'mp4', 'height': int(res), - }) + }) self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default = None) + uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } + class WeiboMobileIE(InfoExtractor): _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' _TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', - 'info_dict': { - 'id': '4189191225395228', - 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' - } - } + 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', + 'info_dict': { + 'id': '4189191225395228', + 'ext': 'mp4', + 'title': '午睡当然是要甜甜蜜蜜的啦', + 'uploader': '柴犬柴犬' + } + } def _real_extract(self, url): video_id = self._match_id(url) @@ -132,22 +133,22 @@ class WeiboMobileIE(InfoExtractor): 'Upgrade-Insecure-Requests': '1', } # to get Referer url for genvisitor - webpage,urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags = re.DOTALL) + webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] title = weibo_info['status']['status_title'] format = { 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } + 'format': 'mp4', + } formats = [format] uploader = weibo_info['status']['user']['screen_name'] return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) - } + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'formats': formats + # TODO more properties (see youtube_dl/extractor/common.py) + } From 25936512245fc571ab716d59e2d73c50d8cad6ce Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 16:46:01 +0800 Subject: [PATCH 07/13] fix compat_urllib_request for python2.7 --- youtube_dl/extractor/weibo.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index b4ac7b9fa..f8a5ee71c 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -9,7 +9,7 @@ import re from ..compat import ( compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as Request, + compat_urllib_request as request, compat_urlparse as parse, ) from ..utils import ( @@ -52,11 +52,10 @@ class WeiboIE(InfoExtractor): 'Referer': visitor_url, } - r_genvisitor = Request( + r_genvisitor = request.Request( 'https://passport.weibo.com/visitor/genvisitor', data=data, headers=headers, - method='POST' ) webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") @@ -85,7 +84,7 @@ class WeiboIE(InfoExtractor): # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_sources_text = self._search_regex("video-sources=\\\\\"(.+?)\"", webpage, 'video_sources') + video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') video_formats = parse.parse_qs(video_sources_text) From 42a1012c7767306626c5358a18ad3e86417bd7b7 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Tue, 26 Dec 2017 22:20:43 +0800 Subject: [PATCH 08/13] fix according to "https://github.com/rg3/youtube-dl/pull/15079#discussion_r158688607" --- youtube_dl/extractor/weibo.py | 85 +++++++++++++---------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index f8a5ee71c..2be31fe77 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -4,13 +4,12 @@ from __future__ import unicode_literals from .common import InfoExtractor import json -import random as rnd +import random import re from ..compat import ( - compat_urllib_parse_urlencode as urlencode, - compat_urllib_request as request, - compat_urlparse as parse, + compat_urllib_parse_urlencode, + compat_urlparse, ) from ..utils import ( js_to_json, @@ -30,34 +29,28 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") visitor_url = urlh.geturl() - - data = urlencode({ - "cb": "gen_callback", - "fp": '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', - }).encode() headers = { - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept': '*/*', - 'Referer': visitor_url, + 'Referer': visitor_url } - r_genvisitor = request.Request( - 'https://passport.weibo.com/visitor/genvisitor', - data=data, - headers=headers, - ) - webpage, urlh = self._download_webpage_handle(r_genvisitor, video_id, note="gen visitor") + fp = { + "os": "2", + "browser": "Gecko57,0,0,0", + "fonts": "undefined", + "screenInfo": "1440*900*24", + "plugins": "" + } + data = compat_urllib_parse_urlencode({ + "cb": "gen_callback", + "fp": json.dumps(fp), + }).encode() + + genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' + webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" i1 = p.find('{') @@ -67,29 +60,28 @@ class WeiboIE(InfoExtractor): tid = d["data"]["tid"] cnfd = "%03d" % d["data"]["confidence"] - param = urlencode({ + query = { 'a': 'incarnate', 't': tid, 'w': 2, 'c': cnfd, 'cb': 'cross_domain', 'from': 'weibo', - '_rand': rnd.random() - }) - gencallback_url = "https://passport.weibo.com/visitor/visitor?" + param - webpage, urlh = self._download_webpage_handle(gencallback_url, video_id, note="gen callback") + '_rand': random.random() + } + gencallback_url = "https://passport.weibo.com/visitor/visitor" + self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="retry to visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") - # TODO more code goes here, for example ... title = self._html_search_regex(r'(.+?)', webpage, 'title') video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - video_formats = parse.parse_qs(video_sources_text) + video_formats = compat_urlparse.parse_qs(video_sources_text) formats = [] - supported_resolutions = ['720', '480'] + supported_resolutions = ('720', '480') for res in supported_resolutions: f = video_formats.get(res) if isinstance(f, list): @@ -107,12 +99,11 @@ class WeiboIE(InfoExtractor): 'title': title, 'uploader': uploader, 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) } class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m.weibo.cn/status/(?P[0-9]+)(\?.+)?' + _VALID_URL = r'https?://m\.weibo\.cn/status/(?P[0-9]+)(\?.+)?' _TEST = { 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', 'info_dict': { @@ -125,29 +116,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - headers = { - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - 'Accept-Language': 'en,zh-CN;q=0.9,zh;q=0.8', - 'Upgrade-Insecure-Requests': '1', - } # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, headers=headers, note="visit the page") + webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] - title = weibo_info['status']['status_title'] - format = { - 'url': page_info['media_info']['stream_url'], - 'format': 'mp4', - } - formats = [format] - uploader = weibo_info['status']['user']['screen_name'] + title = weibo_info.get('status').get('status_title') + uploader = weibo_info.get('status').get('user').get('screen_name') return { 'id': video_id, 'title': title, 'uploader': uploader, - 'formats': formats - # TODO more properties (see youtube_dl/extractor/common.py) + 'url': page_info['media_info']['stream_url'] } From 5c97ec5ff5fd77a7975e1e946d53a76ccd5ef0de Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:08:56 +0800 Subject: [PATCH 09/13] replace urlencode.encode with urlencode_postdata --- youtube_dl/extractor/weibo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 2be31fe77..0b28952c9 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,11 +8,11 @@ import random import re from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( js_to_json, + urlencode_postdata, ) @@ -44,10 +44,10 @@ class WeiboIE(InfoExtractor): "screenInfo": "1440*900*24", "plugins": "" } - data = compat_urllib_parse_urlencode({ + data = urlencode_postdata({ "cb": "gen_callback", "fp": json.dumps(fp), - }).encode() + }) genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") From 6a41a12d2960efb7b32d3b6ef74cf6237766b569 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:11:30 +0800 Subject: [PATCH 10/13] replace split with strip_jsonp --- youtube_dl/extractor/weibo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 0b28952c9..71e7123e4 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -12,6 +12,7 @@ from ..compat import ( ) from ..utils import ( js_to_json, + strip_jsonp, urlencode_postdata, ) @@ -52,7 +53,7 @@ class WeiboIE(InfoExtractor): genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") - p = webpage.split("&&")[1] # split "gen_callback && gen_callback(...)" + p = strip_jsonp(webpage) i1 = p.find('{') i2 = p.rfind('}') j = p[i1:i2 + 1] # get JSON object From 48058d82dc3b448a72fd5ac1e7fa5492cd11f640 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Sat, 30 Dec 2017 01:14:21 +0800 Subject: [PATCH 11/13] replace unused _download_webpage_handle with _download_webpage --- youtube_dl/extractor/weibo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 71e7123e4..34809bdb2 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -51,7 +51,7 @@ class WeiboIE(InfoExtractor): }) genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage, _ = self._download_webpage_handle(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") p = strip_jsonp(webpage) i1 = p.find('{') @@ -71,9 +71,9 @@ class WeiboIE(InfoExtractor): '_rand': random.random() } gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage_handle(gencallback_url, video_id, note="gen callback", query=query) + self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) - webpage, _ = self._download_webpage_handle(url, video_id, note="retry to visit the page") + webpage = self._download_webpage(url, video_id, note="retry to visit the page") title = self._html_search_regex(r'(.+?)', webpage, 'title') @@ -118,7 +118,7 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, _ = self._download_webpage_handle(url, video_id, note="visit the page") + webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) page_info = weibo_info['status']['page_info'] From 6648fd8ad6e581354f46c840465cff4c92d2c6f3 Mon Sep 17 00:00:00 2001 From: sprhawk <465558+sprhawk@users.noreply.github.com> Date: Mon, 1 Jan 2018 18:33:14 +0800 Subject: [PATCH 12/13] changed to use .get to get field from json object --- youtube_dl/extractor/weibo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index 34809bdb2..cbe0c3228 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -121,7 +121,7 @@ class WeiboMobileIE(InfoExtractor): webpage = self._download_webpage(url, video_id, note="visit the page") js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info['status']['page_info'] + page_info = weibo_info.get('status').get('page_info') title = weibo_info.get('status').get('status_title') uploader = weibo_info.get('status').get('user').get('screen_name') From 5eca00a2e33a6ca26a7f52589e5d77bab7e5edf4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 9 Jan 2018 18:12:55 +0800 Subject: [PATCH 13/13] [weibo] Misc improvements --- youtube_dl/extractor/weibo.py | 125 ++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py index cbe0c3228..3cb4d71a6 100644 --- a/youtube_dl/extractor/weibo.py +++ b/youtube_dl/extractor/weibo.py @@ -8,7 +8,8 @@ import random import re from ..compat import ( - compat_urlparse, + compat_parse_qs, + compat_str, ) from ..utils import ( js_to_json, @@ -31,70 +32,71 @@ class WeiboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id, note="first visit the page") + webpage, urlh = self._download_webpage_handle(url, video_id) visitor_url = urlh.geturl() - headers = { - 'Referer': visitor_url - } - fp = { - "os": "2", - "browser": "Gecko57,0,0,0", - "fonts": "undefined", - "screenInfo": "1440*900*24", - "plugins": "" - } - data = urlencode_postdata({ - "cb": "gen_callback", - "fp": json.dumps(fp), - }) + if 'passport.weibo.com' in visitor_url: + # first visit + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit data', + transform_source=strip_jsonp, + headers={'Referer': visitor_url}, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': json.dumps({ + 'os': '2', + 'browser': 'Gecko57,0,0,0', + 'fonts': 'undefined', + 'screenInfo': '1440*900*24', + 'plugins': '', + }), + })) - genvisitor_url = 'https://passport.weibo.com/visitor/genvisitor' - webpage = self._download_webpage(genvisitor_url, video_id, data=data, headers=headers, note="gen visitor") + tid = visitor_data['data']['tid'] + cnfd = '%03d' % visitor_data['data']['confidence'] - p = strip_jsonp(webpage) - i1 = p.find('{') - i2 = p.rfind('}') - j = p[i1:i2 + 1] # get JSON object - d = json.loads(j) - tid = d["data"]["tid"] - cnfd = "%03d" % d["data"]["confidence"] + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback', + query={ + 'a': 'incarnate', + 't': tid, + 'w': 2, + 'c': cnfd, + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), + }) - query = { - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random() - } - gencallback_url = "https://passport.weibo.com/visitor/visitor" - self._download_webpage(gencallback_url, video_id, note="gen callback", query=query) + webpage = self._download_webpage( + url, video_id, note='Revisiting webpage') - webpage = self._download_webpage(url, video_id, note="retry to visit the page") + title = self._html_search_regex( + r'(.+?)', webpage, 'title') - title = self._html_search_regex(r'(.+?)', webpage, 'title') - - video_sources_text = self._search_regex(r'video-sources=\\\"(.+?)\"', webpage, 'video_sources') - - video_formats = compat_urlparse.parse_qs(video_sources_text) + video_formats = compat_parse_qs(self._search_regex( + r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) formats = [] - supported_resolutions = ('720', '480') + supported_resolutions = (480, 720) for res in supported_resolutions: - f = video_formats.get(res) - if isinstance(f, list): - if len(f) > 0: - vid_url = f[0] - formats.append({ - 'url': vid_url, - 'format': 'mp4', - 'height': int(res), - }) + vid_urls = video_formats.get(compat_str(res)) + if not vid_urls or not isinstance(vid_urls, list): + continue + + vid_url = vid_urls[0] + formats.append({ + 'url': vid_url, + 'height': res, + }) + self._sort_formats(formats) - uploader = self._og_search_property('nick-name', webpage, 'uploader', default=None) + + uploader = self._og_search_property( + 'nick-name', webpage, 'uploader', default=None) + return { 'id': video_id, 'title': title, @@ -118,12 +120,17 @@ class WeiboMobileIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note="visit the page") - js_code = self._search_regex(r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\] \|\| {};', webpage, 'js_code', flags=re.DOTALL) - weibo_info = self._parse_json(js_code, video_id, transform_source=js_to_json) - page_info = weibo_info.get('status').get('page_info') - title = weibo_info.get('status').get('status_title') - uploader = weibo_info.get('status').get('user').get('screen_name') + webpage = self._download_webpage(url, video_id, note='visit the page') + + weibo_info = self._parse_json(self._search_regex( + r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', + webpage, 'js_code', flags=re.DOTALL), + video_id, transform_source=js_to_json) + + status_data = weibo_info.get('status', {}) + page_info = status_data.get('page_info') + title = status_data['status_title'] + uploader = status_data.get('user', {}).get('screen_name') return { 'id': video_id,