mirror of
https://github.com/yt-dlp/yt-dlp.git
synced 2025-02-18 18:30:58 +01:00
[parsing] add new module containing various HTML parser classes as replacement for utils.get_html_... functions
* performance is mostly better for large HTML data and on PyPy
This commit is contained in:
parent
af03fa4542
commit
5e3894df3f
|
@ -21,6 +21,14 @@
|
|||
compat_HTMLParseError,
|
||||
compat_os_name,
|
||||
)
|
||||
from yt_dlp.parsing import (
|
||||
HTMLTagParser,
|
||||
FirstMatchingElementParser,
|
||||
)
|
||||
|
||||
# some testcases don't work with current functions
|
||||
get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
|
||||
|
||||
from yt_dlp.utils import (
|
||||
Config,
|
||||
DateRange,
|
||||
|
@ -60,7 +68,6 @@
|
|||
get_element_by_class,
|
||||
get_element_html_by_attribute,
|
||||
get_element_html_by_class,
|
||||
get_element_text_and_html_by_tag,
|
||||
get_elements_by_attribute,
|
||||
get_elements_by_class,
|
||||
get_elements_html_by_attribute,
|
||||
|
@ -1797,11 +1804,14 @@ def test_get_element_text_and_html_by_tag(self):
|
|||
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
|
||||
|
||||
def test_get_element_text_and_html_by_tag_malformed(self):
|
||||
inner_text = 'inner_text'
|
||||
inner_text = 'inner text'
|
||||
malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
|
||||
html = f'<div>{malnested_elements}</div>'
|
||||
commented_html = '<!--<div>inner comment</div>-->'
|
||||
outerdiv_html = f'<div>{malnested_elements}</div>'
|
||||
html = f'{commented_html}{outerdiv_html}'
|
||||
|
||||
self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html))
|
||||
self.assertEqual(
|
||||
get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
|
||||
self.assertEqual(
|
||||
get_element_text_and_html_by_tag('malnested_a', html),
|
||||
(f'<malnested_b>{inner_text}',
|
||||
|
@ -1815,6 +1825,61 @@ def test_get_element_text_and_html_by_tag_malformed(self):
|
|||
self.assertRaises(
|
||||
compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
|
||||
|
||||
def test_strict_html_parsing(self):
    """A parser with STRICT enabled must reject every kind of malformed markup."""
    class StrictTagParser(HTMLTagParser):
        STRICT = True

    strict_parser = StrictTagParser()

    # (malformed document, regex the raised error message has to match)
    malformed_cases = (
        ('</p>', "stray closing tag 'p'"),
        ('<div><p>', "unclosed tag 'p', 'div'"),
        ('<div><p></div></p>', "malnested closing tag 'div', expected after '</p>'"),
        ('<div><p>/p></div>', "malnested closing tag 'div', expected after '</p>'"),
        ('<div><p></p<< </div>', "malformed closing tag 'p<<'"),
        ('<img>must be empty</img>', "stray closing tag 'img'"),
    )
    for document, expected_message in malformed_cases:
        with self.assertRaisesRegex(compat_HTMLParseError, expected_message):
            strict_parser.taglist(document, reset=True)
|
||||
|
||||
def test_relaxed_html_parsing(self):
    """The default (non-strict) parser recovers from malformed markup without raising."""
    Tag = HTMLTagParser.Tag
    relaxed_parser = HTMLTagParser()

    # stray closing tags and unclosed elements yield no result at all
    for document in ('</p>', '<div><p>'):
        self.assertEqual(relaxed_parser.taglist(document, reset=True), [])

    # malnested elements are reported in the order they are closed
    found = relaxed_parser.taglist('<div><p></div></p>', reset=True)
    self.assertEqual(found, [Tag('div'), Tag('p')])

    # the <p> is never closed here, so only the <div> is reported
    found = relaxed_parser.taglist('<div><p>/p></div>', reset=True)
    self.assertEqual(found, [Tag('div')])

    # a malformed closing tag is cut off at the stray '<'
    found = relaxed_parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
    self.assertEqual(found, [Tag('p'), Tag('div')])
    paragraph = found[0]
    self.assertEqual(paragraph.text_and_html(), ('paragraph', '<p>paragraph</p'))

    # void elements have no content, the stray closing tag is ignored
    found = relaxed_parser.taglist('<img width="300px">must be empty</img>', reset=True)
    self.assertEqual(found, [Tag('img')])
    image = found[0]
    self.assertEqual(image.text_and_html(), ('', '<img width="300px">'))
|
||||
|
||||
def test_compliant_html_parsing(self):
    """Valid HTML constructs must be accepted: void elements and '>' inside attribute values."""
    Tag = HTMLTagParser.Tag
    parser = HTMLTagParser()

    # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
    void_markup = '''
no error without closing tag: <img>
self closing is ok: <img />
'''
    found = parser.taglist(void_markup, reset=True)
    self.assertEqual(found, [Tag('img'), Tag('img')])

    # don't get fooled by '>' in attributes
    tricky_markup = '''<img greater_a='1>0' greater_b="1>0">'''
    found = parser.taglist(tricky_markup, reset=True)
    first_tag = found[0]
    self.assertEqual(first_tag.text_and_html(), ('', tricky_markup))
|
||||
|
||||
def test_iri_to_uri(self):
|
||||
self.assertEqual(
|
||||
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
|
||||
|
|
219
yt_dlp/parsing.py
Normal file
219
yt_dlp/parsing.py
Normal file
|
@ -0,0 +1,219 @@
|
|||
import collections
|
||||
import contextlib
|
||||
import itertools
|
||||
import re
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from .utils import orderedSet
|
||||
|
||||
from .compat import compat_HTMLParseError
|
||||
|
||||
|
||||
class HTMLTagParser(HTMLParser):
    """HTML parser which collects the elements of a document as Tag objects

    Elements are reported once they are closed, so nested elements are
    returned before their parents. Void elements (see VOID_TAGS) and
    self-closing tags ("<tag />") are reported as soon as they are opened.

    Subclasses customize the parser through these hooks:
        predicate(tag, attrs)   decide if an opened element is tracked
        callback(tag_obj)       called for every completed tracked element,
                                returning False drops it from the result
        abort(last_tag=None)    stop parsing early

    Setting the class attribute STRICT = True raises compat_HTMLParseError
    on malformed html (stray, malnested, malformed or missing closing tags)
    instead of silently recovering from it.

    Usage:
        parser = HTMLTagParser()
        for tag_obj in parser.taglist(html):
            text, html = tag_obj.text_and_html()

    NOTE: a Tag stores offsets into the buffer that was being parsed when it
    was opened, so every document should be passed to taglist() in one piece.
    """

    STRICT = False
    # any tag, a '>' inside a quoted attribute value does not end the tag
    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
    # closing tag, the terminating '>' is optional to cope with malformed html
    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
    # elements which never have content nor a closing tag
    VOID_TAGS = {
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
    }

    class Tag:
        """A found element, described by offsets into the parsed string

        name        tag name as reported by the parser
        string      the buffer the element was found in
        start       offset of the opening tag within string
        start_len   length of the opening tag (0 = not yet determined)
        stop        offset behind the closing tag, None for elements
                    without closing tag
        attrs       tuple of (name, value) attribute pairs
        """
        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'

        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
            self.name = name
            self.string = string
            self.start = start
            self.start_len = 0
            self.stop = stop
            self.attrs = tuple(attrs)

        def __str__(self):
            return self.name

        def __repr__(self):
            return f'{self.__class__.__name__}({str(self)!r})'

        def __eq__(self, other):
            # only the name is compared, so a Tag equals both a plain tag
            # name and any other Tag with the same name
            return self.name == other

        def html(self):
            """Return the whole element including opening and closing tag"""
            return self.string[self.start:self.stop]

        def text_and_html(self):
            """Return a tuple of the element content and the whole element"""
            assert isinstance(self.start, int)
            if not self.start_len:
                # match in place instead of copying the rest of the document
                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string, self.start)
                assert match
                self.start_len = match.end() - self.start
            if self.stop is None:
                # no closing tag: the element consists of the opening tag only
                return '', self.string[self.start:self.start + self.start_len]
            html = self.html()
            closing_idx = html.rindex('</')
            return html[self.start_len:closing_idx], html

    class EarlyExitException(Exception):
        """Raised by abort() to leave the parser, suppressed by taglist()"""

    def __init__(self):
        super().__init__()
        # currently open elements, innermost first; tracked elements are
        # stored as Tag objects, all others as plain tag names
        self.tagstack = collections.deque()
        # offset of the construct currently handled within rawdata
        self._offset = 0
        self.found_tags = []

    def predicate(self, tag, attrs):
        """Return True if the opened element should be tracked and reported"""
        return True

    def callback(self, tag_obj):
        """Called for every completed tracked element

        Returning False prevents the element from being added to the result.
        """

    def abort(self, last_tag=None):
        """Stop parsing, optionally adding last_tag to the result before"""
        if last_tag is not None:
            self.found_tags.append(last_tag)
        raise HTMLTagParser.EarlyExitException()

    def taglist(self, data, reset=True):
        """Parse data and return the list of found Tag objects

        @param data     html to be parsed
        @param reset    discard all state left over from a previous run
        @raises         compat_HTMLParseError for malformed html if STRICT
        """
        # use a new list every time so that previously returned results
        # are not modified by subsequent calls
        self.found_tags = []
        if reset:
            self.reset()
            self.tagstack.clear()
            self._offset = 0
        with contextlib.suppress(HTMLTagParser.EarlyExitException):
            self.feed(data)
        if self.STRICT and self.tagstack:
            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
            raise compat_HTMLParseError(f'unclosed tag {orphans}')
        return self.found_tags

    def updatepos(self, i, j):
        # the base class calls this right before a tag is handled,
        # so the returned position is the offset of that tag
        offset = self._offset = super().updatepos(i, j)
        return offset

    def handle_starttag(self, tag, attrs):
        tag_text = self.get_starttag_text()
        if tag_text is None:
            # handler was not invoked by the base class parser
            match = self.ANY_TAG_REGEX.match(self.rawdata, self._offset)
            tag_text = match.group() if match else ''
        # elements without closing tag must never end up on the tag stack
        is_void = tag_text.endswith('/>') or tag in self.VOID_TAGS

        # always ask the predicate, subclasses keep track of state in it
        if not self.predicate(tag, attrs):
            if not is_void:
                self.tagstack.appendleft(tag)
            return

        obj = self.Tag(
            tag, string=self.rawdata, start=self._offset, attrs=attrs)
        obj.start_len = len(tag_text)
        if is_void:
            if self.callback(obj) is not False:
                self.found_tags.append(obj)
            return
        self.tagstack.appendleft(obj)

    handle_startendtag = handle_starttag

    def handle_endtag(self, tag):
        if '<' in tag:
            if self.STRICT:
                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
            tag = tag[:tag.index('<')]

        # only the lookup is guarded, errors raised by callbacks must not
        # be mistaken for a stray closing tag
        try:
            idx = self.tagstack.index(tag)
        except ValueError:
            if self.STRICT:
                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from None
            return

        if self.STRICT and idx:
            open_tags = ''.join(
                f'</{open_tag}>' for open_tag in itertools.islice(self.tagstack, idx))
            raise compat_HTMLParseError(
                f'malnested closing tag {tag!r}, expected after {open_tags!r}')

        tag_obj = self.tagstack[idx]
        del self.tagstack[idx]
        if isinstance(tag_obj, str):
            # element was not selected by the predicate
            return

        # since we landed here we'll always find a closing tag
        match = self.CLOSING_TAG_REGEX.match(self.rawdata, self._offset)
        tag_obj.stop = match.end()
        if self.callback(tag_obj) is not False:
            self.found_tags.append(tag_obj)
|
||||
|
||||
|
||||
class ClassParser(HTMLTagParser):
    """Parser collecting all elements whose attribute value satisfies matchfunc

    @param attribute    name of the attribute to inspect
    @param matchfunc    called with the attribute value (empty string if the
                        attribute is missing or has no value), the element
                        is selected if a truthy value is returned
    @param stop         offset within the parsed data behind which no new
                        element needs to be looked for; None stops parsing
                        after the first completed element
    """

    def __init__(self, attribute, matchfunc, stop):
        super().__init__()
        self.search_attr = attribute
        self.matchfunc = matchfunc
        self.stop = stop
        # number of selected elements which are not yet completed
        self.processing = 0

    def taglist(self, data, reset=True):
        if reset:
            # do not carry over the counter of an aborted previous run
            self.processing = 0
        return super().taglist(data, reset=reset)

    def predicate(self, tag, attrs):
        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
            # behind the last candidate and nothing left open
            self.abort()
        # attributes without value are reported as None by the parser
        string = dict(attrs).get(self.search_attr) or ''
        if self.matchfunc(string):
            self.processing += 1
            return True
        return False

    def callback(self, tag_obj):
        if self.stop is None:
            self.abort(tag_obj)
        self.processing -= 1
        if self.processing <= 0 and self._offset > self.stop:
            # the last candidate is completed, skip the rest of the document
            self.abort(tag_obj)

    @classmethod
    def get_elements_html_by_class(cls, class_name, html):
        """Return the html of all elements with the passed class in the given html"""
        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
        start = stop = None
        # cheap scan to limit parsing to the region containing candidates
        for match in re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html):
            if start is None:
                start = match.start()
            # '.+' is greedy: a single match may span several candidates on
            # the same line, so every match has to move the stop mark
            stop = match.end()
        if start is None:
            return []
        # the parser only sees html[start:], make the stop mark relative to it
        parser = cls('class', regex.match, stop - start)
        return [tag.html() for tag in parser.taglist(html[start:])]
|
||||
|
||||
|
||||
class FirstMatchingElementParser(HTMLTagParser):
    """Parser which stops as soon as the first matching element is completed

    @param matchfunc    called with (tag, attrs) of every opened element
                        until it returns a truthy value for the first time
    """

    def __init__(self, matchfunc):
        super().__init__()
        self.matchfunc = matchfunc
        # set once an element matched, nothing else gets selected afterwards
        self.found = False

    def taglist(self, data, reset=True):
        if reset:
            # allow the instance to be reused for another document
            self.found = False
        return super().taglist(data, reset=reset)

    def predicate(self, tag, attrs):
        if not self.found and self.matchfunc(tag, attrs):
            self.found = True
            return True
        return False

    def callback(self, obj):
        # the selected element is complete, no need to parse any further
        self.abort(obj)

    @classmethod
    def get_element_text_and_html_by_tag(cls, tag, html):
        """
        For the first element with the specified tag in the given HTML document
        return its content (text) and the whole element (html)

        @raises compat_HTMLParseError if there is no such element or
                the first element with that tag is never closed
        """
        parser = cls(lambda _tag, _: _tag == tag)
        for tag_obj in parser.taglist(html):
            return tag_obj.text_and_html()
        raise compat_HTMLParseError(f'tag {tag} not found')
|
Loading…
Reference in New Issue
Block a user