[parsing] rework interface, implemented all get_element(s) functions + extract_attributes() as MatchingElementParser class methods and improve performance

This commit is contained in:
Marcel 2022-11-22 19:58:06 +01:00
parent e092ba9922
commit 176a156c65
No known key found for this signature in database
GPG Key ID: 7813C97693AD6AAE
2 changed files with 399 additions and 142 deletions

View File

@ -1,29 +1,71 @@
import textwrap
import unittest
from parsing import (
FirstMatchingElementParser,
HTMLTagParser,
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
HTMLCommentRanges,
HTMLTagParser,
)
from yt_dlp.compat import compat_HTMLParseError
get_element_by_attribute = FirstMatchingElementParser
get_element_by_class = FirstMatchingElementParser
get_element_html_by_attribute = FirstMatchingElementParser
get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser
get_elements_by_class = MatchingElementParser
get_elements_html_by_attribute = MatchingElementParser
get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser
extract_attributes = MatchingElementParser.extract_attributes
get_element_by_attribute = MatchingElementParser.get_element_by_attribute
get_element_by_class = MatchingElementParser.get_element_by_class
get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
get_element_html_by_class = MatchingElementParser.get_element_html_by_class
get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
get_elements_by_class = MatchingElementParser.get_elements_by_class
get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag
class TestParsing(unittest.TestCase):
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'}) # XML
self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'}) # HTML 3.2
self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'}) # HTML 4.0
self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
self.assertEqual(extract_attributes('<e x >'), {'x': None})
self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'}) # Names lowercased
self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
# "Narrow" Python builds don't support unicode code points outside BMP.
try:
chr(0x10000)
supports_outside_bmp = True
except ValueError:
supports_outside_bmp = False
if supports_outside_bmp:
self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
# Malformed HTML should not break attributes extraction on older Python
self.assertEqual(extract_attributes('<mal"formed/>'), {})
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
<span class="foo bar">nice</span>
<div class="foo bar">also nice</div>
'''
def test_get_element_by_class(self):
@ -35,7 +77,8 @@ def test_get_element_by_class(self):
def test_get_element_html_by_class(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_html_by_class('foo', html), html.strip())
self.assertEqual(get_element_html_by_class('foo', html),
'<span class="foo bar">nice</span>')
self.assertEqual(get_element_by_class('no-such-class', html), None)
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
@ -48,6 +91,7 @@ def test_get_element_by_attribute(self):
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
@ -56,7 +100,8 @@ def test_get_element_by_attribute(self):
def test_get_element_html_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
'<span class="foo bar">nice</span>')
self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
@ -110,7 +155,7 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
self.assertEqual(get_elements_text_and_html_by_attribute(
'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a'),
'class', 'foo', '<a class="foo">nice</a><span class="foo">not nice</span>', tag='a'),
[('nice', '<a class="foo">nice</a>')])
def test_get_element_text_and_html_by_tag(self):
@ -138,7 +183,16 @@ def test_get_element_text_and_html_by_tag(self):
self.assertEqual(
get_element_text_and_html_by_tag('span', html),
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
self.assertIsNone(get_element_text_and_html_by_tag('article', html))
def test_get_elements_text_and_html_by_tag(self):
test_string = '''
<img src="a.png">
<img src="b.png" />
<span>ignore</span>
'''
items = get_elements_text_and_html_by_tag('img', test_string)
self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
def test_get_element_text_and_html_by_tag_malformed(self):
inner_text = 'inner text'
@ -157,10 +211,8 @@ def test_get_element_text_and_html_by_tag_malformed(self):
get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}</malnested_a>',
f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
self.assertRaises(
compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
self.assertRaises(
compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))
def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser):
@ -188,14 +240,14 @@ def test_relaxed_html_parsing(self):
self.assertEqual(parser.taglist('<div><p>', reset=True), [])
tags = parser.taglist('<div><p></div></p>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags, [Tag('p'), Tag('div')])
tags = parser.taglist('<div><p>/p></div>', reset=True)
self.assertEqual(tags, [Tag('div')])
tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
self.assertEqual(tags, [Tag('p'), Tag('div')])
self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))
tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
self.assertEqual(tags, [Tag('img')])
@ -216,3 +268,65 @@ def test_compliant_html_parsing(self):
html = '''<img greater_a='1>0' greater_b="1>0">'''
tags = parser.taglist(html, reset=True)
self.assertEqual(tags[0].text_and_html(), ('', html))
def test_tag_return_order(self):
Tag = HTMLTagParser.Tag
html = '''
<t0>
<t1>
<t2>
<t3 /> <t4 />
</t2>
</t1>
<t5>
<t6 />
</t5>
</t0>
<t7>
<t8 />
</t7>
'''
parser = HTMLTagParser()
tags = parser.taglist(html, reset=True)
self.assertEqual(
str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
tags = parser.taglist(html, reset=True, depth_first=True)
self.assertEqual(
str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
# return tags in nested order
tags = parser.taglist(html, reset=True, depth_first=None)
self.assertEqual(
str(tags), str([
[Tag('t0'),
[Tag('t1'),
[Tag('t2'), Tag('t3'), Tag('t4')]],
[Tag('t5'), Tag('t6')]],
[Tag('t7'), Tag('t8')]]))
def test_within_html_comment(self):
def mark_comments(_string, char='^', nochar='-'):
cmts = HTMLCommentRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
html_string = '''
no comments in this line
---------------------------------------------------------------------
<!-- whole line represents a comment -->
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
before <!-- comment --> after
-------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
here is <!-- a comment --> and <!-- another comment --> end
------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
this <!-- nested <!-- comment --> ends here --> and not here
-----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
stray --> comment closings --> are ignored <!-- but not <!-- openings
-------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
'''
lines = textwrap.dedent(html_string).strip().splitlines()
for line, marker in zip(lines[0::2], lines[1::2]):
self.assertEqual((line, mark_comments(line)), (line, marker))

View File

@ -4,47 +4,89 @@
import re
from html.parser import HTMLParser
from .compat import compat_HTMLParseError
from .utils import orderedSet
from .compat import compat_HTMLParseError
def iter_find(string, sub: str):
    """Yield the start offset of every non-overlapping occurrence of `sub` in `string`.

    Offsets are produced lazily and in increasing order. After a hit the
    search resumes directly behind the matched text, so hits never overlap.

    An empty `sub` matches at every position from 0 to len(string)
    (the positions str.find() reports for it); the scan still moves forward
    one character per hit, so the generator always terminates.
    """
    # Advance by at least one character: with a zero-length needle
    # str.find() would otherwise return the same offset forever.
    step = max(len(sub), 1)
    idx = string.find(sub)
    while idx != -1:
        yield idx
        idx = string.find(sub, idx + step)
class HTMLCommentRanges:
    """Compute the offsets of HTML comments and answer `offset in ranges` queries.

    Comments start with '<!--' and end with the first '-->' encountered;
    a comment that is never closed extends to the end of the string.
    Note: markers within quotes are not ignored.

    Membership tests consume the underlying range generator, therefore
    offsets have to be queried in non-decreasing order.
    """

    def __init__(self, html):
        self._range_iter = self.ranges(html)
        self._range = next(self._range_iter, None)
        self._last_offset = 0

    @staticmethod
    def ranges(string, sopen='<!--', sclose='-->'):
        """Yield one slice per comment found in `string`.

        @param string: text to scan
        @param sopen:  marker opening a comment
        @param sclose: marker closing a comment
        @return: generator of slice(start, stop); `stop` is the offset right
                 behind the closing marker, or None for an unclosed comment
        """
        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
        open_iter = iter_find(string, sopen)
        close_len = len(sclose)
        close_iter = (idx + close_len for idx in iter_find(string, sclose))
        next_open = next(open_iter, None)
        next_close = next(close_iter, None)
        while True:
            if next_open is None:
                return
            # Drop closing markers that end at or before the opening marker.
            # The comparison must include equality: a closer ending exactly
            # where the opener starts ('--><!--') would otherwise produce an
            # empty range and neither cursor would ever advance again.
            while next_close is not None and next_open >= next_close:
                next_close = next(close_iter, None)
            yield slice(next_open, next_close)
            if next_close is None:
                return
            # Opening markers inside the comment just reported do not start
            # a new comment.
            while next_open is not None and next_open < next_close:
                next_open = next(open_iter, None)

    def __contains__(self, offset):
        """Return True if `offset` lies inside a comment."""
        assert isinstance(offset, int)
        assert offset >= self._last_offset, 'offset must be in increasing order'
        self._last_offset = offset
        # Move on to the first range that does not end at or before `offset`.
        while self._range and self._range.stop is not None and offset >= self._range.stop:
            self._range = next(self._range_iter, None)
        return not (self._range is None or offset < self._range.start)
class HTMLTagParser(HTMLParser):
"""HTML parser which acts as iterator
returns found elements as instances of Tag
nested elements will be returned before its parents
"""HTML parser which returns found elements as instances of 'Tag'
when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
strict=True raises compat_HTMLParseError on malformed html
two modes of usage:
# as an lazy iterator:
for tag_obj in HTMLTagParser(html):
usage:
parser = HTMLTagParser()
for tag_obj in parser.taglist(html):
tag_obj.text_and_html()
# or return a list with all found tag objects
# this is faster by factor 2-5 compared to iteration
for tag_obj in HTMLTagParser(html).taglist():
tag_obj.text_and_html()
"""
STRICT = False
ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
VOID_TAGS = {
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
}
class Tag:
__slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
__slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'
def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
def __init__(self, name, *, string='', attrs=()):
self.name = name
self.string = string
self.start = start
self.start_len = 0
self.stop = stop
self.attrs = tuple(attrs)
self._openrange = None
self._closerange = None
def __str__(self):
return self.name
@ -55,52 +97,81 @@ def __repr__(self):
def __eq__(self, other):
return self.name == other
def openrange(self, offset, startlen=0):
if isinstance(offset, slice):
self._openrange = offset
else:
self._openrange = slice(offset, offset + startlen)
def closerange(self, offset, stoplen=0):
if isinstance(offset, slice):
self._closerange = offset
else:
self._closerange = slice(offset, offset + stoplen)
def opentag(self):
return self.string[self._openrange] if self._openrange else ''
def html(self):
return self.string[self.start:self.stop]
if not self._openrange:
return ''
if self._closerange:
return self.string[self._openrange.start:self._closerange.stop]
return self.string[self._openrange]
def text(self):
if self._openrange and self._closerange:
return self.string[self._openrange.stop:self._closerange.start]
return ''
def text_and_html(self):
assert isinstance(self.start, int)
if not self.start_len:
match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:])
assert match
self.start_len = len(match.group())
if self.stop is None:
return '', self.string[self.start: self.start + self.start_len]
html = self.html()
cidx = html.rindex('</')
return html[self.start_len:cidx], html
return self.text(), self.html()
class EarlyExitException(Exception):
class AbortException(Exception):
pass
def __init__(self):
super().__init__()
self.tagstack = collections.deque()
self._nestedtags = [[]]
super().__init__()
self._offset = self.offset
self.found_tags = []
def predicate(self, tag, attrs):
""" return True for every encountered opening tag that should be processed """
return True
def callback(self, tag_obj):
pass
""" this will be called when the requested tag is closed """
def abort(self, last_tag=None):
if last_tag:
self.found_tags.append(last_tag)
raise HTMLTagParser.EarlyExitException()
def reset(self):
super().reset()
self.tagstack.clear()
def taglist(self, data, reset=True, depth_first=False):
""" parse data and return found tag objects
@param data: html string
@param reset: reset state
@param depth_first: return order: as opened (False), as closed (True), nested (None)
@return: list of Tag objects
"""
def flatten(_list, first=True):
rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
for item in rlist:
if isinstance(item, list):
yield from flatten(item, first=False)
else:
yield item
def taglist(self, data, reset=True):
self.found_tags.clear()
if reset:
self.reset()
self.tagstack.clear()
with contextlib.suppress(HTMLTagParser.EarlyExitException):
with contextlib.suppress(HTMLTagParser.AbortException):
self.feed(data)
if self.STRICT and self.tagstack:
orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
raise compat_HTMLParseError(f'unclosed tag {orphans}')
return self.found_tags
taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
self._nestedtags = [[]]
return taglist
def updatepos(self, i, j):
offset = self._offset = super().updatepos(i, j)
@ -108,22 +179,23 @@ def updatepos(self, i, j):
def handle_starttag(self, tag, attrs):
try:
# we use internal variable for performance reason
# we use internal variable for performance reasons
tag_text = getattr(self, '_HTMLParser__starttag_text')
except AttributeError:
tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
if self.predicate(tag, attrs):
obj = self.Tag(
tag, string=self.rawdata, start=self._offset, attrs=attrs)
obj.start_len = len(tag_text)
if tag_text.endswith('/>') or tag in self.VOID_TAGS:
if self.callback(obj) is not False:
self.found_tags.append(obj)
return
else:
obj = None
self.tagstack.appendleft(obj or tag)
tag_obj = tag
if self.predicate(tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
if tag_text.endswith('/>') or tag in self.VOID_TAGS:
self._nestedtags[-1].append(tag_obj)
self.callback(tag_obj)
return
nesting = []
self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting)
self.tagstack.appendleft(tag_obj)
handle_startendtag = handle_starttag
@ -141,79 +213,150 @@ def handle_endtag(self, tag):
f'malnested closing tag {tag!r}, expected after {open_tags!r}')
tag_obj = self.tagstack[idx]
self.tagstack.remove(tag)
if not isinstance(tag_obj, str):
# since we landed here we'll always find a closing tag
match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:])
tag_obj.stop = self._offset + match.end()
if self.callback(tag_obj) is not False:
self.found_tags.append(tag_obj)
if isinstance(tag_obj, self.Tag):
close_idx = self.rawdata.find('>', self._offset) + 1
tag_obj.closerange(self._offset, close_idx - self._offset)
self._nestedtags.pop().insert(0, tag_obj)
self.callback(tag_obj)
except ValueError as exc:
if isinstance(exc, compat_HTMLParseError):
raise
elif self.STRICT:
raise compat_HTMLParseError(f'stray closing tag {tag!r}')
if self.STRICT:
raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
class ClassParser(HTMLTagParser):
def __init__(self, attribute, matchfunc, stop):
super().__init__()
self.search_attr = attribute
self.matchfunc = matchfunc
self.stop = stop
self.processing = 0
def predicate(self, tag, attrs):
if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
self.abort()
string = dict(attrs).get(self.search_attr, '')
if self.matchfunc(string):
self.processing += 1
return True
return False
def callback(self, tag_obj):
if self.stop is None:
self.abort(tag_obj)
self.processing -= 1
@classmethod
def get_elements_html_by_class(cls, class_name, html):
regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html)
start = stop = None
for match in it:
if start is None:
start = match.start()
else:
stop = match.end()
if start is None:
return []
parser = cls('class', lambda x: regex.match(x), stop)
return [tag.html() for tag in parser.taglist(html[start:])]
class FirstMatchingElementParser(HTMLTagParser):
class MatchingElementParser(HTMLTagParser):
""" optimized version of HTMLTagParser
"""
def __init__(self, matchfunc):
super().__init__()
self.matchfunc = matchfunc
self.found = False
self.found_none = True
def reset(self):
super().reset()
self.found_none = True
def callback(self, tag_obj):
raise self.AbortException()
def predicate(self, tag, attrs):
if not self.found and self.matchfunc(tag, attrs):
self.found = True
if self.found_none and self.matchfunc(tag, attrs):
self.found_none = False
return True
return False
def callback(self, obj):
self.abort(obj)
@staticmethod
def class_value_regex(class_name):
    """Return regex source matching a complete `class` value that lists `class_name`.

    The value may hold further word characters, whitespace and dashes around
    the wanted name, but the name itself must not be glued to a neighbouring
    word character or dash.
    """
    wanted = re.escape(class_name)
    filler = r'[\w\s\-]*'
    return rf'{filler}(?<![\w\-]){wanted}(?![\w\-]){filler}'
# Build the source of a verbose-mode regex that matches the beginning of an
# opening tag, up to and including the wanted attribute and its quoted value.
#   tag:         regex fragment for the tag name (inserted verbatim, not escaped)
#   attribute:   literal attribute name (escaped before insertion)
#   value_regex: str or compiled pattern describing the attribute value
#   escape:      escape a str `value_regex` so that it is matched literally
# Returns the regex source as a str; nothing is compiled here.
@staticmethod
def matching_tag_regex(tag, attribute, value_regex, escape=True):
if isinstance(value_regex, re.Pattern):
# of a compiled pattern only the source text is reused
value_regex = value_regex.pattern
elif escape:
value_regex = re.escape(value_regex)
# The value has to be quoted and the closing quote must equal the opening one
# (backreference to group `_q`). `(?-x:...)` switches verbose mode off again so
# that whitespace inside `value_regex` stays significant.
return rf'''(?x)
<(?:{tag})
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
'''
@classmethod
def iter_tags(cls, regex, html, *, matchfunc):
    """Yield Tag objects for the candidates that `regex` locates in `html`.

    Every regex hit that does not start inside an HTML comment is handed to
    a parser built from `matchfunc`; parsing starts at the hit's offset and
    whatever that parser's taglist() returns is yielded.
    """
    commented = HTMLCommentRanges(html)
    tag_parser = cls(matchfunc)
    for candidate in re.finditer(regex, html):
        begin = candidate.start()
        if begin in commented:
            continue
        yield from tag_parser.taglist(html[begin:], reset=True)
@classmethod
def tags_by_name(cls, tag, html):
    """Yield a Tag object for every element named `tag` found in `html`.

    @param tag:  literal tag name to look for
    @param html: document to search
    """
    def matchfunc(tag_str, _attrs):
        return tag_str == tag

    # The name may be followed by whitespace, '>' or '/'. Without '/' the
    # compact self-closing form ('<br/>') would never reach the parser.
    yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s/>]', html, matchfunc=matchfunc)
@classmethod
def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
    """Yield a Tag object for every element whose `attribute` matches `value`.

    @param attribute:    attribute name to look for
    @param value:        wanted attribute value, a str or a compiled pattern;
                         the complete attribute value has to match
    @param html:         document to search
    @param tag:          regex fragment restricting the tag name
    @param escape_value: treat a str `value` literally (True) or as a regex (False)
    """
    # Use the same interpretation of `value` as the pre-filter regex built
    # below, otherwise a literal value containing regex metacharacters passes
    # the pre-filter but is rejected (or mis-matched) by the parser callback.
    if isinstance(value, re.Pattern):
        value_pattern = value
    else:
        value_pattern = re.compile(re.escape(value) if escape_value else value)

    def matchfunc(_tag_str, attrs):
        # attributes given without a value are reported as None and can
        # never match
        return any(
            attr == attribute and value_str is not None and value_pattern.fullmatch(value_str)
            for attr, value_str in attrs)

    tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
    yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod
def extract_attributes(cls, html):
    """Return the attributes of the first opening tag in `html` as a dict.

    Parsing is aborted as soon as the first tag has been seen. When an
    attribute occurs more than once the last occurrence wins; an empty dict
    is returned if no tag is found.
    """
    collected = {}

    def grab_and_stop(_name, attr_pairs):
        collected.update(attr_pairs)
        raise cls.AbortException()

    parser = cls(grab_and_stop)
    try:
        parser.feed(html)
    except cls.AbortException:
        pass
    return collected
@classmethod
def get_elements_text_and_html_by_tag(cls, tag, html):
    """Return a (text, html) tuple for every element named `tag` in `html`."""
    elements = cls.tags_by_name(tag, html)
    return [element.text_and_html() for element in elements]
@classmethod
def get_element_text_and_html_by_tag(cls, tag, html):
"""
For the first element with the specified tag in the given HTML document
return its content (text) and the whole element (html)
"""
parser = cls(lambda _tag, _: _tag == tag)
for tag_obj in parser.taglist(html):
return tag_obj.text_and_html()
raise compat_HTMLParseError(f'tag {tag} not found')
tag = next(cls.tags_by_name(tag, html), None)
return tag and tag.text_and_html()
@classmethod
def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
    """Return a (text, html) tuple for every element found by tags_by_attribute().

    All arguments are forwarded unchanged to tags_by_attribute().
    """
    elements = cls.tags_by_attribute(*args, **kwargs)
    return [element.text_and_html() for element in elements]
@classmethod
def get_elements_by_attribute(cls, *args, **kwargs):
    """Return the content (text) of every element found by tags_by_attribute().

    All arguments are forwarded unchanged to tags_by_attribute().
    """
    # text() yields the same string as text_and_html()[0] without slicing
    # out the element's html only to throw it away
    return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]
@classmethod
def get_elements_html_by_attribute(cls, *args, **kwargs):
    """Return the whole element (html) of every match of tags_by_attribute().

    All arguments are forwarded unchanged to tags_by_attribute().
    """
    elements = cls.tags_by_attribute(*args, **kwargs)
    return [element.html() for element in elements]
@classmethod
def get_element_by_attribute(cls, *args, **kwargs):
    """Return the content (text) of the first match of tags_by_attribute(), or None.

    All arguments are forwarded unchanged to tags_by_attribute().
    """
    first = next(cls.tags_by_attribute(*args, **kwargs), None)
    return first.text() if first else first
@classmethod
def get_element_html_by_attribute(cls, *args, **kwargs):
    """Return the whole element (html) of the first match of tags_by_attribute(), or None.

    All arguments are forwarded unchanged to tags_by_attribute().
    """
    first = next(cls.tags_by_attribute(*args, **kwargs), None)
    return first.html() if first else first
@classmethod
def get_elements_by_class(cls, class_name, html):
    """Return the content (text) of every element in `html` carrying class `class_name`."""
    class_regex = cls.class_value_regex(class_name)
    elements = cls.tags_by_attribute('class', class_regex, html, escape_value=False)
    return [element.text() for element in elements]
@classmethod
def get_elements_html_by_class(cls, class_name, html):
    """Return the whole element (html) of every element in `html` carrying class `class_name`."""
    class_regex = cls.class_value_regex(class_name)
    elements = cls.tags_by_attribute('class', class_regex, html, escape_value=False)
    return [element.html() for element in elements]
@classmethod
def get_elements_text_and_html_by_class(cls, class_name, html):
    """Return a (text, html) tuple for every element in `html` carrying class `class_name`."""
    value = cls.class_value_regex(class_name)
    # text_and_html(), not text(): this function has to deliver the same
    # (text, html) pairs as the other *_text_and_html_* functions
    return [tag.text_and_html() for tag
            in cls.tags_by_attribute('class', value, html, escape_value=False)]
@classmethod
def get_element_html_by_class(cls, class_name, html):
    """Return the whole element (html) of the first element with class `class_name`, or None."""
    class_regex = cls.class_value_regex(class_name)
    matches = cls.tags_by_attribute('class', class_regex, html, escape_value=False)
    first = next(matches, None)
    return first.html() if first else first
@classmethod
def get_element_by_class(cls, class_name, html):
    """Return the content (text) of the first element with class `class_name`, or None."""
    class_regex = cls.class_value_regex(class_name)
    matches = cls.tags_by_attribute('class', class_regex, html, escape_value=False)
    first = next(matches, None)
    return first.text() if first else first