[parsing] fix: don't push unmatched void tags onto queue

This commit is contained in:
Marcel 2022-11-27 16:22:03 +01:00
parent 176a156c65
commit 8451074b50
No known key found for this signature in database
GPG Key ID: 7813C97693AD6AAE
2 changed files with 15 additions and 6 deletions

View File

@ -186,6 +186,9 @@ def test_get_element_text_and_html_by_tag(self):
self.assertIsNone(get_element_text_and_html_by_tag('article', html))
def test_get_elements_text_and_html_by_tag(self):
class StrictParser(MatchingElementParser):
STRICT = True
test_string = '''
<img src="a.png">
<img src="b.png" />
@ -194,6 +197,10 @@ def test_get_elements_text_and_html_by_tag(self):
items = get_elements_text_and_html_by_tag('img', test_string)
self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
self.assertEqual(
StrictParser.get_element_text_and_html_by_tag('use', '<use><img></use>'),
('<img>', '<use><img></use>'))
def test_get_element_text_and_html_by_tag_malformed(self):
inner_text = 'inner text'
malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'

View File

@ -185,17 +185,19 @@ def handle_starttag(self, tag, attrs):
tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
tag_obj = tag
tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
if self.predicate(tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
if tag_text.endswith('/>') or tag in self.VOID_TAGS:
if tag_is_open:
nesting = []
self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting)
else:
self._nestedtags[-1].append(tag_obj)
self.callback(tag_obj)
return
nesting = []
self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting)
self.tagstack.appendleft(tag_obj)
if tag_is_open:
self.tagstack.appendleft(tag_obj)
handle_startendtag = handle_starttag