From cb23192bc4c56d80229a7a5f70cb61d0879db6c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jun 2016 00:35:29 +0700 Subject: [PATCH] [closertotruth] Update and improve (Closes #8680) --- youtube_dl/extractor/closertotruth.py | 117 +++++++++++++++----------- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 71 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py index d04ff5e4f..26243d52d 100644 --- a/youtube_dl/extractor/closertotruth.py +++ b/youtube_dl/extractor/closertotruth.py @@ -1,69 +1,92 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class CloserToTruthIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(episodes/|(series|interviews)/(?:[^#]+#video-)?(?P\d+))' - _TESTS = [ - { - 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', - 'md5': '5c548bde260a9247ddfdc07c7458ed29', - 'info_dict': { - 'id': '0_zof1ktre', - 'ext': 'mov', - 'title': 'Solutions to the Mind-Body Problem?', - 'upload_date': '20140221', - 'timestamp': 1392956007, - 'uploader_id': 'CTTXML' - } + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': '0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' }, - { - 'url': 'http://closertotruth.com/interviews/1725', - 'md5': 'b00598fd6a38372edb976408f72c5792', - 'info_dict': { - 'id': '0_19qv5rn1', - 'ext': 'mov', - 'title': 'AyaFr-002 - Francisco J. Ayala', - 'upload_date': '20140307', - 'timestamp': 1394236431, - 'uploader_id': 'CTTXML' - } + 'params': { + 'skip_download': True, }, - { - 'url': 'http://closertotruth.com/episodes/how-do-brains-work', - 'md5': '4dd96aa0a5c296afa5c0bd24895c2f16', - 'info_dict': { - 'id': '0_iuxai6g6', - 'ext': 'mov', - 'title': 'How do Brains Work?', - 'upload_date': '20140221', - 'timestamp': 1392956024, - 'uploader_id': 'CTTXML' - } + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' }, - ] + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - video_title = self._search_regex(r'(.+) \|.+', webpage, 'video title') + webpage = self._download_webpage(url, display_id) - entry_id = self._search_regex(r']+id="(?:video-%s|embed-kaltura)"[^>]+data-kaltura="([^"]+)' % video_id, webpage, "video entry_id") + partner_id = self._search_regex( + r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') - interviewee_name = self._search_regex(r'
(.*).+', webpage, "video interviewee_name", False) + title = self._search_regex( + r'(.+?)\s*\|\s*.+?', webpage, 'video title') - if interviewee_name: - video_title = video_title + ' - ' + interviewee_name + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) - p_id = self._search_regex(r'<script[^>]+src=["\'].+?partner_id/(\d+)', webpage, "kaltura partner_id") + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') return { '_type': 'url_transparent', - 'id': entry_id, - 'url': 'kaltura:%s:%s' % (p_id, entry_id), + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), 'ie_key': 'Kaltura', - 'title': video_title + 'title': title } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6dc5904b3..2ff867651 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -140,6 +140,7 @@ from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE