2014-02-26 10:06:31 +11:00
from __future__ import unicode_literals
2013-08-28 20:51:22 +10:00
import re
import json
from . common import InfoExtractor
2014-02-26 10:29:45 +11:00
from . youtube import YoutubeIE
2014-12-13 22:24:42 +11:00
from . . utils import (
2013-08-28 20:51:22 +10:00
clean_html ,
2014-02-26 10:41:13 +11:00
ExtractorError ,
2013-08-28 20:51:22 +10:00
get_element_by_id ,
)
class TechTVMITIE(InfoExtractor):
    """Information extractor for videos hosted on techtv.mit.edu."""

    IE_NAME = 'techtv.mit.edu'
    _VALID_URL = r'https?://techtv\.mit\.edu/(?:videos|embeds)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
        'md5': '00a3a27ee20d44bcaa0933ccec4a2cf7',
        'info_dict': {
            'id': '25418',
            'ext': 'mp4',
            'title': 'MIT DNA and Protein Sets',
            'description': 'md5:46f5c69ce434f0a97e7c628cc142802d',
        },
    }

    def _real_extract(self, url):
        """Return the info dict (id, title, formats, description, thumbnail).

        The numeric id is taken from ``url``; the page that is actually
        downloaded is always ``http://techtv.mit.edu/videos/<id>``, so
        ``/embeds/`` URLs are handled through the same code path.
        """
        video_id = self._match_id(url)
        raw_page = self._download_webpage(
            'http://techtv.mit.edu/videos/%s' % video_id, video_id)
        # Strip HTML comments so the element lookups below only see markup
        # outside of comments.
        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)

        # The player configuration embedded in the page carries the base URL
        # ('ipadUrl') and a JSON list of renditions ('bitrates').
        base_url = self._proto_relative_url(self._search_regex(
            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url'), 'http:')
        formats_json = self._search_regex(
            r'bitrates: (\[.+?\])', raw_page, 'video formats')

        formats = []
        for f in json.loads(formats_json):
            # The part of f['url'] before the first ':' is used as the
            # extension; the remainder is appended to base_url.
            ext, _, path = f['url'].partition(':')
            formats.append({
                'format_id': f['label'],
                'url': base_url + path,
                'ext': ext,
                'format': f['label'],
                # Width and bitrate are informational only; tolerate entries
                # that omit them instead of failing with KeyError.
                'width': f.get('width'),
                'vbr': f.get('bitrate'),
            })

        title = get_element_by_id('edit-title', clean_page)
        description = clean_html(
            get_element_by_id('edit-description', clean_page))
        # The thumbnail is optional metadata: a miss must not abort an
        # otherwise successful extraction.
        thumbnail = self._search_regex(
            r'playlist:.*?url: \'(.+?)\'',
            raw_page, 'thumbnail', fatal=False, flags=re.DOTALL)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'thumbnail': thumbnail,
        }
2013-08-28 20:51:22 +10:00
class MITIE(TechTVMITIE):
    """Information extractor for video.mit.edu watch pages.

    The watch page is only scanned for its first ``<iframe>``; the iframe's
    ``src`` is handed over to the ``TechTVMIT`` extractor.
    """

    IE_NAME = 'video.mit.edu'
    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'

    _TEST = {
        'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
        'md5': '7db01d5ccc1895fc5010e9c9e13648da',
        'info_dict': {
            'id': '21783',
            'ext': 'mp4',
            'title': 'The Government is Profiling You',
            'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
        },
    }

    def _real_extract(self, url):
        """Return a url_result pointing at the embedded player's URL."""
        # The URL slug doubles as the display id while downloading the page.
        page_title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, page_title)
        embed_url = self._search_regex(
            r'<iframe .*?src="(.+?)"', webpage, 'embed url')
        return self.url_result(embed_url, ie='TechTVMIT')
2014-02-26 07:44:34 +11:00
2014-02-26 10:29:45 +11:00
2014-02-26 07:44:34 +11:00
class OCWMITIE(InfoExtractor):
    """Information extractor for ocw.mit.edu course pages.

    The page title and description are read from the page's meta tags; the
    video itself is delegated to the Youtube extractor via a
    ``url_transparent`` result.
    """

    IE_NAME = 'ocw.mit.edu'
    # Accept both schemes, like the other MIT extractors in this file.
    _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
    _BASE_URL = 'http://ocw.mit.edu/'

    # JavaScript functions whose call carries the media URL as its second
    # argument, in the order they are tried:
    #   ocw_embed_chapter_media(container_id, media_url, provider, page_url,
    #                           image_url, start, stop, captions_file)
    #   ocw_embed_media(container_id, media_url, provider, page_url,
    #                   image_url, captions_file)
    _EMBED_FUNCTIONS = ('ocw_embed_chapter_media', 'ocw_embed_media')

    _TESTS = [
        {
            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
            'info_dict': {
                'id': 'EObHWIEKGjA',
                'ext': 'mp4',
                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
                'upload_date': '20121109',
                'uploader_id': 'MIT',
                'uploader': 'MIT OpenCourseWare',
            }
        },
        {
            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
            'info_dict': {
                'id': '7K1sB05pE0A',
                'ext': 'mp4',
                'title': 'Session 1: Introduction to Derivatives',
                'upload_date': '20090818',
                'uploader_id': 'MIT',
                'uploader': 'MIT OpenCourseWare',
                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
            }
        }
    ]

    def _find_embedded_media_url(self, webpage):
        """Return the second argument of the first usable embed call.

        Raises ExtractorError when none of the known embed functions is
        called with at least two arguments.
        """
        for func_name in self._EMBED_FUNCTIONS:
            call = re.search(r'%s\((.+?)\)' % func_name, webpage)
            if not call:
                continue
            # Drop the quotes around the JS string arguments, then split the
            # argument list on commas (with an optional following space).
            args = re.split(r', ?', re.sub(r'[\'"]', '', call.group(1)))
            if len(args) > 1:
                return args[1]
        raise ExtractorError('Unable to find embedded YouTube video.')

    def _real_extract(self, url):
        """Return a url_transparent result for the embedded media URL."""
        mobj = re.match(self._VALID_URL, url)
        topic = mobj.group('topic')
        webpage = self._download_webpage(url, topic)
        title = self._html_search_meta('WT.cg_s', webpage)
        description = self._html_search_meta('Description', webpage)

        yt = self._find_embedded_media_url(webpage)
        video_id = YoutubeIE.extract_id(yt)

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'title': title,
            'description': description,
            'url': yt,
            'ie_key': 'Youtube',
        }