[utils] Use a regexp instead of HTMLParser for get_element_by_attribute
parent
11fba1751d
commit
3828505646
|
@ -152,86 +152,6 @@ def xpath_text(node, xpath, name=None, fatal=False):
|
||||||
return n.text
|
return n.text
|
||||||
|
|
||||||
|
|
||||||
# Backport bugfix: interpreters older than 2.7 get this compiled pattern
# installed as compat_html_parser.locatestarttagend.
if sys.version_info < (2, 7):
    compat_html_parser.locatestarttagend = re.compile(
        r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
        re.VERBOSE)
|
|
||||||
|
|
||||||
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it was asked to parse.

    Subclasses can read the full source text from ``self.html`` while
    handling parser callbacks.
    """

    def __init__(self):
        # This must be spelled ``__init__``: a method named ``__init`` is
        # name-mangled and never runs as the constructor, which would leave
        # ``self.html`` undefined until loads() is called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse the complete document *html* in a single pass.

        Stores *html* on the instance, feeds it to the parser and then
        closes the parser so any buffered data is processed.
        """
        self.html = html
        self.feed(html)
        self.close()
|
|
||||||
|
|
||||||
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        # Attribute name/value pair a start tag must carry to be selected.
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once a match is complete.
        self.result = None
        # True while the parser is inside the selected element.
        self.started = False
        # Per-tag-name count of currently open tags inside the selection.
        self.depth = {}
        # True until the position right after the selected start tag is stored.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error when possible, otherwise raise.

        Raises HTMLParseError once more than 10 errors were seen or when the
        error occurs inside the selected element.  Otherwise the raw data is
        rebuilt from the stored document, dropping lines up to the current
        one, and parsing continues.
        """
        if self.started or self.error_count > 10:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]
        self.rawdata = '\n'.join(remaining_lines)  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start the selection on a matching tag and track nesting depth."""
        attr_map = dict(attrs)
        if self.started:
            # A child tag directly follows the selected start tag.
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        """Close the selection once the selected tag's depth returns to zero."""
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())

    # Any other parser event that follows the selected start tag also marks
    # where the element's content begins.
    handle_entityref = find_startpos
    handle_charref = find_startpos
    handle_data = find_startpos
    handle_comment = find_startpos
    handle_decl = find_startpos
    handle_pi = find_startpos
    unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded positions, or None.

        None is returned unless a tag, a start position and an end position
        have all been recorded.  The slice is cut out of the stored document
        and stripped of surrounding whitespace.
        """
        if self.result is None or len(self.result) != 3:
            return None
        start_line, start_col = self.result[1]
        end_line, end_col = self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end offset must be shifted by
            # the prefix that was just cut off.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
|
|
||||||
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # Step over the literal "</scr'+'ipt>" by hand and hand every other end
    # tag to the stock HTMLParser implementation.
    AttrParser.parse_endtag = lambda self, i: (
        compat_html_parser.HTMLParser.parse_endtag(self, i)
        if not self.rawdata[i:].startswith("</scr'+'ipt>")
        else i + len("</scr'+'ipt>"))
|
|
||||||
|
|
||||||
|
|
||||||
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id*.

    Convenience wrapper that delegates to get_element_by_attribute() with the
    attribute name fixed to "id"; *html* is the document to search.
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
|
||||||
|
@ -239,34 +159,25 @@ def get_element_by_id(id, html):
|
||||||
|
|
||||||
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag in *html* carrying ``attribute=value`` (bare, single- or
    double-quoted) is located with a regular expression.  The text between
    that start tag and the first closing tag of the same name is returned
    after HTML-unescaping, or None when no such tag exists.
    """
    # Other attributes on the tag may be quoted, unquoted, empty (class="")
    # or valueless (hidden); all of them must be skipped for the tag to match.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Content that starts with a quote character loses its first and last
    # character before unescaping.
    if res.startswith(('"', "'")):
        res = res[1:-1]

    return unescapeHTML(res)


class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        # Value the ``name`` attribute of a <meta> tag must have.
        self.name = name
        self.content = None
        # ``content`` attribute of the most recent matching <meta> tag.
        self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the ``content`` attribute of a <meta> tag with the wanted name."""
        if tag != 'meta':
            return
        attrs = dict(attrs)
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):
        """Return the recorded ``content`` value, or None if nothing matched."""
        return self.result
|
||||||
|
|
||||||
|
|
||||||
def clean_html(html):
|
def clean_html(html):
|
||||||
|
|
Loading…
Reference in New Issue