2014-02-10 00:22:56 +11:00
from __future__ import unicode_literals
2013-09-10 19:19:58 +10:00
import re
import json
from . common import InfoExtractor
2014-12-13 22:24:42 +11:00
from . . compat import (
2013-09-10 19:19:58 +10:00
compat_urlparse ,
2014-12-13 22:24:42 +11:00
)
from . . utils import (
2013-09-10 19:19:58 +10:00
ExtractorError ,
2016-07-05 21:01:04 +10:00
get_element_by_id ,
2013-09-10 19:19:58 +10:00
)
class SlideshareIE(InfoExtractor):
    """Extractor for video presentations published on www.slideshare.net.

    The page embeds a JSON object (merged into ``slideshare_object`` via
    ``$.extend``) that describes the slideshow.  Only slideshows whose
    ``type`` is ``video`` are supported; anything else is rejected.
    """

    # ``title`` captures the path component after the uploader name, up to the
    # end of the URL or the start of the query string.
    _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)'
    _TEST = {
        'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity',
        'info_dict': {
            'id': '25665706',
            'ext': 'mp4',
            'title': 'Managing Scale and Complexity',
            'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix.',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the slideshow found at *url*.

        Raises:
            ExtractorError: if the embedded metadata reports a slideshow type
                other than ``video``.
            KeyError: if the embedded metadata lacks one of the mandatory keys
                (``slideshow.type``/``id``/``title``, ``doc``,
                ``jsplayer.video_bucket``/``video_extension``).
        """
        mobj = re.match(self._VALID_URL, url)
        # The URL slug is only used as the identifier handed to the download
        # helper; the real id comes from the embedded metadata below.
        page_title = mobj.group('title')
        webpage = self._download_webpage(url, page_title)

        slideshare_obj = self._search_regex(
            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
            webpage, 'slideshare object')
        info = json.loads(slideshare_obj)
        slideshow = info['slideshow']

        if slideshow['type'] != 'video':
            raise ExtractorError(
                'Webpage type is "%s": only video extraction is supported for Slideshare' % slideshow['type'],
                expected=True)

        # The media URL is "<video_bucket>/<doc>-SD.<video_extension>".
        doc = info['doc']
        bucket = info['jsplayer']['video_bucket']
        ext = info['jsplayer']['video_extension']
        video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)

        # Prefer the dedicated description element; fall back to the
        # itemprop="description" paragraph without failing if it is absent.
        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
            r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
            'description', fatal=False)

        return {
            '_type': 'video',
            'id': slideshow['id'],
            'title': slideshow['title'],
            'ext': ext,
            'url': video_url,
            # The thumbnail is optional metadata: a missing key must not abort
            # an otherwise successful extraction.
            'thumbnail': slideshow.get('pin_image_url'),
            'description': description.strip() if description else None,
        }