[9gag] Fix and improve extraction
parent
2d4c98dbd1
commit
d7666dff82
|
@ -1,8 +1,10 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
|
from ..utils import str_to_int
|
||||||
|
|
||||||
|
|
||||||
class NineGagIE(InfoExtractor):
|
class NineGagIE(InfoExtractor):
|
||||||
|
@ -44,23 +46,14 @@ class NineGagIE(InfoExtractor):
|
||||||
|
|
||||||
webpage = self._download_webpage(url, display_id)
|
webpage = self._download_webpage(url, display_id)
|
||||||
|
|
||||||
youtube_id = self._html_search_regex(
|
post_view = json.loads(self._html_search_regex(
|
||||||
r'(?s)id="jsid-video-post-container".*?data-external-id="([^"]+)"',
|
r'var postView = new app\.PostView\({ post: ({.+?}),', webpage, 'post view'))
|
||||||
webpage, 'video ID')
|
|
||||||
title = self._html_search_regex(
|
youtube_id = post_view['videoExternalId']
|
||||||
r'(?s)id="jsid-video-post-container".*?data-title="([^"]+)"',
|
title = post_view['title']
|
||||||
webpage, 'title', default=None)
|
description = post_view['description']
|
||||||
if not title:
|
view_count = str_to_int(post_view['externalView'])
|
||||||
title = self._og_search_title(webpage)
|
thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
|
||||||
description = self._html_search_regex(
|
|
||||||
r'(?s)<div class="video-caption">.*?<p>(.*?)</p>', webpage,
|
|
||||||
'description', fatal=False)
|
|
||||||
view_count_str = self._html_search_regex(
|
|
||||||
r'<p><b>([0-9][0-9,]*)</b> views</p>', webpage, 'view count',
|
|
||||||
fatal=False)
|
|
||||||
view_count = (
|
|
||||||
None if view_count_str is None
|
|
||||||
else int(view_count_str.replace(',', '')))
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'_type': 'url_transparent',
|
'_type': 'url_transparent',
|
||||||
|
@ -71,5 +64,5 @@ class NineGagIE(InfoExtractor):
|
||||||
'title': title,
|
'title': title,
|
||||||
'description': description,
|
'description': description,
|
||||||
'view_count': view_count,
|
'view_count': view_count,
|
||||||
'thumbnail': self._og_search_thumbnail(webpage),
|
'thumbnail': thumbnail,
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue