0
0
Fork 0

[mtvservices:embedded] Use another endpoint to get feed URL

Closes #10363

In the original mtvservices:embedded test case, config.xml is still used
to get the feed URL. Other examples, including test_Generic_40
(http://www.vulture.com/2016/06/new-key-peele-sketches-released.html)
and the video mentioned in #10363, use a different endpoint ('index.html')
to get the feed URL. The 'index.html' approach also works for the original
test case, so the old config.xml approach has been dropped.
main
Yen Chi Hsuan 2016-08-24 23:58:22 +08:00
parent 97653f81b2
commit 0c75abbb7b
4 changed files with 23 additions and 20 deletions

View File

@ -1,3 +1,9 @@
version <unreleased>
Extractors
* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363)
version 2016.08.24.1 version 2016.08.24.1
Extractors Extractors

View File

@ -2,7 +2,6 @@ from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor from .mtv import MTVServicesInfoExtractor
from ..utils import unified_strdate from ..utils import unified_strdate
from ..compat import compat_urllib_parse_urlencode
class BetIE(MTVServicesInfoExtractor): class BetIE(MTVServicesInfoExtractor):
@ -53,9 +52,9 @@ class BetIE(MTVServicesInfoExtractor):
_FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player"
def _get_feed_query(self, uri): def _get_feed_query(self, uri):
return compat_urllib_parse_urlencode({ return {
'uuid': uri, 'uuid': uri,
}) }
def _extract_mgid(self, webpage): def _extract_mgid(self, webpage):
return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid')

View File

@ -4,7 +4,6 @@ import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_urllib_parse_urlencode,
compat_str, compat_str,
compat_xpath, compat_xpath,
) )
@ -14,12 +13,13 @@ from ..utils import (
fix_xml_ampersands, fix_xml_ampersands,
float_or_none, float_or_none,
HEADRequest, HEADRequest,
RegexNotFoundError,
sanitized_Request, sanitized_Request,
strip_or_none, strip_or_none,
timeconvert, timeconvert,
unescapeHTML, unescapeHTML,
update_url_query,
url_basename, url_basename,
RegexNotFoundError,
xpath_text, xpath_text,
) )
@ -36,6 +36,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _id_from_uri(uri): def _id_from_uri(uri):
return uri.split(':')[-1] return uri.split(':')[-1]
@staticmethod
def _remove_template_parameter(url):
# Remove the templates, like &device={device}
return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url)
# This was originally implemented for ComedyCentral, but it also works here # This was originally implemented for ComedyCentral, but it also works here
@classmethod @classmethod
def _transform_rtmp_url(cls, rtmp_video_url): def _transform_rtmp_url(cls, rtmp_video_url):
@ -117,9 +122,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_id = self._id_from_uri(uri) video_id = self._id_from_uri(uri)
self.report_extraction(video_id) self.report_extraction(video_id)
content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content')))
mediagen_url = content_el.attrib['url'] mediagen_url = self._remove_template_parameter(content_el.attrib['url'])
# Remove the templates, like &device={device}
mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url)
if 'acceptMethods' not in mediagen_url: if 'acceptMethods' not in mediagen_url:
mediagen_url += '&' if '?' in mediagen_url else '?' mediagen_url += '&' if '?' in mediagen_url else '?'
mediagen_url += 'acceptMethods=fms' mediagen_url += 'acceptMethods=fms'
@ -178,12 +181,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
data = {'uri': uri} data = {'uri': uri}
if self._LANG: if self._LANG:
data['lang'] = self._LANG data['lang'] = self._LANG
return compat_urllib_parse_urlencode(data) return data
def _get_videos_info(self, uri): def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri) video_id = self._id_from_uri(uri)
feed_url = self._get_feed_url(uri) feed_url = self._get_feed_url(uri)
info_url = feed_url + '?' + self._get_feed_query(uri) info_url = update_url_query(feed_url, self._get_feed_query(uri))
return self._get_videos_info_from_url(info_url, video_id) return self._get_videos_info_from_url(info_url, video_id)
def _get_videos_info_from_url(self, url, video_id): def _get_videos_info_from_url(self, url, video_id):
@ -256,13 +259,9 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
def _get_feed_url(self, uri): def _get_feed_url(self, uri):
video_id = self._id_from_uri(uri) video_id = self._id_from_uri(uri)
site_id = uri.replace(video_id, '') config = self._download_json(
config_url = ('http://media.mtvnservices.com/pmt-arc/e1/players/{0}/' 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id)
'context52/config.xml'.format(site_id)) return self._remove_template_parameter(config['feedWithQueryParams'])
config_doc = self._download_xml(config_url, video_id)
feed_node = config_doc.find('.//feed')
feed_url = feed_node.text.strip().split('?')[0]
return feed_url
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .mtv import MTVServicesInfoExtractor from .mtv import MTVServicesInfoExtractor
from ..compat import compat_urllib_parse_urlencode
from ..utils import update_url_query from ..utils import update_url_query
@ -59,10 +58,10 @@ class NickIE(MTVServicesInfoExtractor):
}] }]
def _get_feed_query(self, uri): def _get_feed_query(self, uri):
return compat_urllib_parse_urlencode({ return {
'feed': 'nick_arc_player_prime', 'feed': 'nick_arc_player_prime',
'mgid': uri, 'mgid': uri,
}) }
def _extract_mgid(self, webpage): def _extract_mgid(self, webpage):
return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid')