diff --git a/test/test_parsing.py b/test/test_parsing.py index 75ed8ebf3..880c41a34 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -186,6 +186,9 @@ def test_get_element_text_and_html_by_tag(self): self.assertIsNone(get_element_text_and_html_by_tag('article', html)) def test_get_elements_text_and_html_by_tag(self): + class StrictParser(MatchingElementParser): + STRICT = True + test_string = ''' @@ -194,6 +197,10 @@ def test_get_elements_text_and_html_by_tag(self): items = get_elements_text_and_html_by_tag('img', test_string) self.assertListEqual(items, [('', ''), ('', '')]) + self.assertEqual( + StrictParser.get_element_text_and_html_by_tag('use', ''), + ('', '')) + def test_get_element_text_and_html_by_tag_malformed(self): inner_text = 'inner text' malnested_elements = f'{inner_text}' diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py index bcc48c4d3..8fbb4db14 100644 --- a/yt_dlp/parsing.py +++ b/yt_dlp/parsing.py @@ -185,17 +185,19 @@ def handle_starttag(self, tag, attrs): tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group() tag_obj = tag + tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS) if self.predicate(tag, attrs): tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs) tag_obj.openrange(self._offset, len(tag_text)) - if tag_text.endswith('/>') or tag in self.VOID_TAGS: + if tag_is_open: + nesting = [] + self._nestedtags[-1].append(nesting) + self._nestedtags.append(nesting) + else: self._nestedtags[-1].append(tag_obj) self.callback(tag_obj) - return - nesting = [] - self._nestedtags[-1].append(nesting) - self._nestedtags.append(nesting) - self.tagstack.appendleft(tag_obj) + if tag_is_open: + self.tagstack.appendleft(tag_obj) handle_startendtag = handle_starttag