[vrv] Add new extractor

2017-03-31 23:28:24 +01:00 · 2017-03-31 23:28:24 +01:00 · 77c8ebe631
parent 7453999580
commit 77c8ebe631
2 changed files with 152 additions and 0 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -1182,6 +1182,7 @@ from .voxmedia import VoxMediaIE
 from .vporn import VpornIE
 from .vrt import VRTIE
 from .vrak import VrakIE
 from .vrv import VRVIE
 from .medialaan import MedialaanIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
--- a/youtube_dl/extractor/vrv.py
+++ b/youtube_dl/extractor/vrv.py
@ -0,0 +1,151 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import base64
 import json
 import hashlib
 import hmac
 import random
 import string
 import time
 from .common import InfoExtractor
 from ..compat import (
    compat_urllib_parse_urlencode,
    compat_urlparse,
 )
 from ..utils import (
    float_or_none,
    int_or_none,
 )
 class VRVIE(InfoExtractor):
    """Information extractor for ``vrv.co/watch/<id>`` pages.

    Metadata and stream information are taken from the JSON state embedded
    in the watch page when it is present there; otherwise they are fetched
    through the OAuth-signed core API and the CMS endpoints it points to.
    """
    _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
    _TEST = {
        'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
        'info_dict': {
            'id': 'GR9PNZ396',
            'ext': 'mp4',
            'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
            'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
            'uploader_id': 'seeso',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }
    # Filled in lazily by _set_api_params() from the page's app config.
    _API_DOMAIN = None
    _API_PARAMS = {}
    # Filled in lazily by _set_cms_signing(); sent as the query string of
    # every CMS request.
    _CMS_SIGNING = {}

    def _call_api(self, path, video_id, note, data=None):
        """Perform an OAuth 1.0 (HMAC-SHA1) signed request to the core API.

        ``path`` is appended to ``<_API_DOMAIN>/core/``.  When ``data`` is
        truthy it is JSON-encoded, sent as the request body and the request
        is signed as a POST; otherwise it is signed as a GET.  ``note`` is
        interpolated into the progress message.  Returns the result of
        ``_download_json`` for the signed URL.

        Reads ``_API_DOMAIN`` and ``_API_PARAMS``, so ``_set_api_params``
        has to be called beforehand.
        """
        base_url = self._API_DOMAIN + '/core/' + path
        # A sequence of pairs (instead of a dict) keeps the parameters in
        # this exact, name-sorted order on every Python version.  The
        # signature base string below is built from the encoded query, and
        # OAuth 1.0 requires its parameters to be sorted by name
        # (RFC 5849, section 3.4.1.3.2).
        encoded_query = compat_urllib_parse_urlencode([
            ('oauth_consumer_key', self._API_PARAMS['oAuthKey']),
            ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])),
            ('oauth_signature_method', 'HMAC-SHA1'),
            ('oauth_timestamp', int(time.time())),
            ('oauth_version', '1.0'),
        ])
        headers = self.geo_verification_headers()
        if data:
            data = json.dumps(data).encode()
            headers['Content-Type'] = 'application/json'
        method = 'POST' if data else 'GET'
        # Signature base string: METHOD & quoted(base URL) & quoted(query).
        base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')])
        # The HMAC key is "<consumer secret>&"; nothing follows the '&'
        # because no token secret is used here.
        oauth_signature = base64.b64encode(hmac.new(
            (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'),
            base_string.encode(), hashlib.sha1).digest()).decode()
        encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '')
        return self._download_json(
            '?'.join([base_url, encoded_query]), video_id,
            note='Downloading %s JSON metadata' % note, headers=headers, data=data)

    def _call_cms(self, path, video_id, note):
        """Download JSON from ``<_API_DOMAIN><path>`` using the CMS signing query.

        Reads ``_API_DOMAIN`` and ``_CMS_SIGNING``, so ``_set_api_params``
        and ``_set_cms_signing`` have to be called beforehand.
        """
        return self._download_json(
            self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
            note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())

    def _set_api_params(self, webpage, video_id):
        """Populate ``_API_PARAMS`` and ``_API_DOMAIN`` from ``webpage`` once.

        The values come from the ``cxApiParams`` entry of the
        ``window.__APP_CONFIG__`` JSON object embedded in the page.  Does
        nothing when ``_API_PARAMS`` is already non-empty.
        """
        if not self._API_PARAMS:
            self._API_PARAMS = self._parse_json(self._search_regex(
                r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>',
                webpage, 'api config'), video_id)['cxApiParams']
            self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')

    def _set_cms_signing(self, video_id):
        """Populate ``_CMS_SIGNING`` from the core API ``index`` resource once.

        Does nothing when ``_CMS_SIGNING`` is already non-empty.
        """
        if not self._CMS_SIGNING:
            self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing']

    def _real_extract(self, url):
        """Build the info dict (metadata, HLS formats, thumbnails) for ``url``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            url, video_id,
            headers=self.geo_verification_headers())
        initial_state = self._parse_json(self._search_regex(
            r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>',
            webpage, 'initial state'), video_id)
        # ``or {}`` (rather than a .get() default) also covers keys that are
        # present but hold a JSON null.
        media_resource = (initial_state.get('watch') or {}).get('mediaResource') or {}

        # Episode metadata: use the copy embedded in the page if there is
        # one, otherwise resolve the episode's CMS path through the core API.
        video_data = media_resource.get('json')
        if not video_data:
            self._set_api_params(webpage, video_id)
            episode_path = self._call_api('cms_resource', video_id, 'episode resource path', data={
                'resource_key': 'cms:/episodes/' + video_id,
            })['__links__']['cms_resource']['href']
            self._set_cms_signing(video_id)
            video_data = self._call_cms(episode_path, video_id, 'video')
        title = video_data['title']

        # Stream listing: same strategy, with the CMS path taken from the
        # episode metadata's links.
        streams_json = (media_resource.get('streams') or {}).get('json')
        if not streams_json:
            self._set_api_params(webpage, video_id)
            streams_path = video_data['__links__']['streams']['href']
            self._set_cms_signing(video_id)
            streams_json = self._call_cms(streams_path, video_id, 'streams')
        audio_locale = streams_json.get('audio_locale')

        formats = []
        adaptive_hls = (streams_json.get('streams') or {}).get('adaptive_hls') or {}
        for stream_id, stream in adaptive_hls.items():
            stream_url = stream.get('url')
            if not stream_url:
                continue
            # An empty stream key falls back to the audio locale as format id.
            stream_id = stream_id or audio_locale
            m3u8_formats = self._extract_m3u8_formats(
                stream_url, video_id, 'mp4', m3u8_id=stream_id,
                note='Downloading %s m3u8 information' % stream_id,
                fatal=False)
            if audio_locale:
                for f in m3u8_formats:
                    f['language'] = audio_locale
            formats.extend(m3u8_formats)
        self._sort_formats(formats)

        thumbnails = []
        images = video_data.get('images') or {}
        for thumbnail in images.get('thumbnails') or []:
            thumbnail_url = thumbnail.get('source')
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(thumbnail.get('width')),
                'height': int_or_none(thumbnail.get('height')),
            })

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnails': thumbnails,
            'description': video_data.get('description'),
            # duration_ms is divided by 1000 to give seconds.
            'duration': float_or_none(video_data.get('duration_ms'), 1000),
            'uploader_id': video_data.get('channel_id'),
            'series': video_data.get('series_title'),
            'season': video_data.get('season_title'),
            'season_number': int_or_none(video_data.get('season_number')),
            'season_id': video_data.get('season_id'),
            'episode': title,
            'episode_number': int_or_none(video_data.get('episode_number')),
            'episode_id': video_data.get('production_episode_id'),
        }