2013-07-11 01:49:11 +10:00
# encoding: utf-8
2014-01-06 11:47:52 +11:00
from __future__ import unicode_literals
2013-06-24 04:31:45 +10:00
import os
import re
from . common import InfoExtractor
2014-01-06 11:42:58 +11:00
from . youtube import YoutubeIE
2013-06-24 04:31:45 +10:00
from . . utils import (
compat_urllib_error ,
compat_urllib_parse ,
compat_urllib_request ,
2013-08-28 20:47:27 +10:00
compat_urlparse ,
2013-06-24 04:31:45 +10:00
ExtractorError ,
2013-12-21 03:05:28 +11:00
HEADRequest ,
2013-10-15 21:05:13 +11:00
smuggle_url ,
unescapeHTML ,
2013-12-17 22:33:55 +11:00
unified_strdate ,
url_basename ,
2013-06-24 04:31:45 +10:00
)
2013-07-11 01:49:11 +10:00
from . brightcove import BrightcoveIE
2013-12-20 06:28:52 +11:00
from . ooyala import OoyalaIE
2013-06-24 04:31:45 +10:00
2013-08-25 06:49:52 +10:00
2013-06-24 04:31:45 +10:00
class GenericIE(InfoExtractor):
    """Last-resort extractor whose ``_VALID_URL`` matches any URL.

    The extraction strategy, in order:

    1. URLs without a scheme are turned into an http URL or a search query
       (depending on the ``default_search`` parameter).
    2. A HEAD request detects redirects and direct links to media files.
    3. The page is downloaded and scanned for embeds of known players
       (Brightcove, Vimeo, YouTube, ...), which are delegated to the
       matching extractor.
    4. Finally a series of generic patterns (JW Player, Twitter cards,
       Open Graph, HTML5 ``<video>``) is tried to find a media URL.
    """

    IE_DESC = 'Generic downloader that works on some sites'
    _VALID_URL = r'.*'
    IE_NAME = 'generic'
    _TESTS = [
        {
            'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
            'file': '13601338388002.mp4',
            'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd',
            'info_dict': {
                'uploader': 'www.hodiho.fr',
                'title': 'R\u00e9gis plante sa Jeep',
            }
        },
        # bandcamp page with custom domain
        {
            'add_ie': ['Bandcamp'],
            'url': 'http://bronyrock.com/track/the-pony-mash',
            'file': '3235767654.mp3',
            'info_dict': {
                'title': 'The Pony Mash',
                'uploader': 'M_Pallante',
            },
            'skip': 'There is a limit of 200 free downloads / month for the test song',
        },
        # embedded brightcove video
        # it also tests brightcove videos that need to set the 'Referer' in the
        # http requests
        {
            'add_ie': ['Brightcove'],
            'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
            'info_dict': {
                'id': '2765128793001',
                'ext': 'mp4',
                'title': 'Le cours de bourse : l’analyse technique',
                'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
                'uploader': 'BFM BUSINESS',
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            # https://github.com/rg3/youtube-dl/issues/2253
            'url': 'http://bcove.me/i6nfkrc3',
            'file': '3101154703001.mp4',
            'md5': '0ba9446db037002366bab3b3eb30c88c',
            'info_dict': {
                'title': 'Still no power',
                'uploader': 'thestar.com',
                'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
            },
            'add_ie': ['Brightcove'],
        },
        # Direct link to a video
        {
            'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
            'file': 'trailer.mp4',
            'md5': '67d406c2bcb6af27fa886f31aa934bbe',
            'info_dict': {
                'id': 'trailer',
                'title': 'trailer',
                'upload_date': '20100513',
            }
        },
        # ooyala video
        {
            'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
            'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
            'info_dict': {
                'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
                'ext': 'mp4',
                'title': '2cc213299525360.mov',  # that's what we get
            },
        },
    ]

    def report_download_webpage(self, video_id):
        """Report webpage download, warning first that the generic fallback is in use."""
        # The warning is suppressed when the 'test' parameter is set.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning('Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)

    def _send_head(self, url):
        """Send a HEAD request for url and return the response object.

        Redirects (e.g. from url shorteners) are followed with HEAD requests
        as well; if the server answers 405 to HEAD, the request is retried
        with GET. Callers can compare ``response.geturl()`` with the original
        url to detect a redirect.

        Raises ExtractorError if no handler could open the URL.
        """

        def request_headers_without_entity(req):
            # Drop the headers describing a request body; the follow-up
            # requests built below never carry one.
            return dict(
                (k, v) for k, v in req.headers.items()
                if k.lower() not in ('content-length', 'content-type'))

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HEADRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code not in (301, 302, 303, 307):
                    raise compat_urllib_error.HTTPError(
                        req.get_full_url(), code, msg, headers, fp)
                newurl = newurl.replace(' ', '%20')
                return HEADRequest(
                    newurl,
                    headers=request_headers_without_entity(req),
                    origin_req_host=req.get_origin_req_host(),
                    unverifiable=True)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Consume and close the error response before retrying.
                fp.read()
                fp.close()
                return self.parent.open(compat_urllib_request.Request(
                    req.get_full_url(),
                    headers=request_headers_without_entity(req),
                    origin_req_host=req.get_origin_req_host(),
                    unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler,
                        compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor,
                        compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HEADRequest(url))
        if response is None:
            raise ExtractorError('Invalid URL protocol')
        return response

    def _resolve_schemeless_url(self, url):
        """Build the url result for an input that has no URL scheme.

        With ``default_search`` unset or ``'auto'``, inputs containing a
        slash are retried as http URLs and everything else becomes a
        ``ytsearch:`` query. Any other ``default_search`` value is used as a
        prefix for the input and must therefore contain a colon.

        Raises ExtractorError if ``default_search`` is not a usable prefix.
        """
        default_search = self._downloader.params.get('default_search')
        if default_search is None:
            default_search = 'auto'

        if default_search == 'auto':
            if '/' in url:
                self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
                return self.url_result('http://' + url)
            return self.url_result('ytsearch:' + url)

        # Explicit check instead of an assert: this validates user-supplied
        # configuration and must not vanish when running with -O.
        if ':' not in default_search:
            raise ExtractorError(
                'Invalid default_search value "%s": expected a prefix containing ":"'
                % default_search)
        return self.url_result(default_search + url)

    @staticmethod
    def _search_generic_video_url(webpage):
        """Return the first match of the generic media-URL patterns, or None.

        The patterns are tried from the most specific to the broadest one;
        group 1 of the returned match object holds the candidate URL.
        """
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Look for gorilla-vid style embedding
            mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            # HTML5 video
            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
        return mobj

    def _real_extract(self, url):
        """Extract media information from an arbitrary URL.

        Returns an info dict, a playlist, or a url result that delegates to
        a more specific extractor.

        Raises ExtractorError if the page cannot be downloaded or no media
        URL can be found in it.
        """
        parsed_url = compat_urlparse.urlparse(url)
        if not parsed_url.scheme:
            return self._resolve_schemeless_url(url)

        video_id = os.path.splitext(url.split('/')[-1])[0]

        self.to_screen('%s: Requesting header' % video_id)

        try:
            response = self._send_head(url)

            # Check for redirect
            new_url = response.geturl()
            if url != new_url:
                self.report_following_redirect(new_url)
                return self.url_result(new_url)

            # Check for direct link to a video
            content_type = response.headers.get('Content-Type', '')
            m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
            if m:
                upload_date = response.headers.get('Last-Modified')
                if upload_date:
                    upload_date = unified_strdate(upload_date)
                return {
                    'id': video_id,
                    'title': os.path.splitext(url_basename(url))[0],
                    'formats': [{
                        'format_id': m.group('format_id'),
                        'url': url,
                        'vcodec': 'none' if m.group('type') == 'audio' else None
                    }],
                    'upload_date': upload_date,
                }

        except compat_urllib_error.HTTPError:
            # This may be a stupid server that doesn't like HEAD, our UA, or so
            pass

        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError('Failed to download URL: %s' % url)

        self.report_extraction(video_id)

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(
            r'(?s)<title>(.*?)</title>', webpage, 'video title',
            default='video')

        # video uploader is domain name
        # Taken from the parsed URL rather than a regex: a pattern anchored on
        # the first '/' yields 'http:' for URLs without a path.
        video_uploader = parsed_url.netloc

        # Look for BrightCove:
        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
        if bc_urls:
            self.to_screen('Brightcove video detected.')
            entries = [{
                '_type': 'url',
                'url': smuggle_url(bc_url, {'Referer': url}),
                'ie_key': 'Brightcove'
            } for bc_url in bc_urls]

            return {
                '_type': 'playlist',
                'title': video_title,
                'id': video_id,
                'entries': entries,
            }

        # Look for embedded (iframe) Vimeo player
        mobj = re.search(
            r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
        if mobj:
            player_url = unescapeHTML(mobj.group(1))
            surl = smuggle_url(player_url, {'Referer': url})
            return self.url_result(surl, 'Vimeo')

        # Look for embedded (swf embed) Vimeo player
        mobj = re.search(
            r'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
        if mobj:
            return self.url_result(unescapeHTML(mobj.group(1)), 'Vimeo')

        # Look for embedded YouTube player
        matches = re.findall(r'''(?x)
            (?:<iframe[^>]+?src=|embedSWF\(\s*)
            (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/
                (?:embed|v)/.+?)
            \1''', webpage)
        if matches:
            # findall yields (quote, url) tuples; index 1 is the URL group.
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
        if matches:
            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
                     for tuppl in matches]
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

        # Look for embedded Wistia player
        match = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
        if match:
            return {
                '_type': 'url_transparent',
                'url': unescapeHTML(match.group('url')),
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': video_id,
            }

        # Look for embedded blip.tv player
        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
        if mobj:
            return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)
        if mobj:
            return self.url_result(mobj.group(1), 'BlipTV')

        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
            burl = unescapeHTML(mobj.group(1))
            # Don't set the extractor because it can be a track url or an album
            return self.url_result(burl)

        # Look for embedded Vevo player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group('url')))

        # Look for Ooyala videos
        mobj = re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
        if mobj is not None:
            return OoyalaIE._build_url_result(mobj.group(1))

        # Look for Aparat videos
        mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group(1)), 'Aparat')

        # Look for MPORA videos
        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group(1)), 'Mpora')

        # Look for embedded Novamov player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group('url')), 'Novamov')

        # Look for embedded Facebook player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group('url')), 'Facebook')

        # Look for embedded Huffington Post player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(unescapeHTML(mobj.group('url')), 'HuffPost')

        # No known embed found: fall back to the generic patterns
        mobj = self._search_generic_video_url(webpage)
        if mobj is None:
            raise ExtractorError('Unsupported URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError('Did not find a valid video URL at %s' % url)

        # The page may reference the media with a relative URL
        video_url = compat_urlparse.urljoin(url, mobj.group(1))

        # Sometimes, jwplayer extraction will result in a YouTube URL
        if YoutubeIE.suitable(video_url):
            return self.url_result(video_url, 'Youtube')

        # The id is the unquoted file name of the media URL without extension
        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
        video_id = os.path.splitext(video_id)[0]

        return {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': video_title,
        }