[parsing] tweak tag regex

This commit is contained in:
Marcel 2022-11-27 21:26:58 +01:00
parent dbf350c122
commit 7a67a2028f
No known key found for this signature in database
GPG Key ID: 7813C97693AD6AAE

View File

@@ -261,7 +261,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
return rf'''(?x) return rf'''(?x)
<(?:{tag}) <(?:{tag})
(?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q) \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
''' '''
@@ -278,7 +278,8 @@ def tags_by_name(cls, tag, html):
def matchfunc(tag_str, _attrs): def matchfunc(tag_str, _attrs):
return tag_str == tag return tag_str == tag
yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc) tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod @classmethod
def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):