From da0d84258bf8163550f0c952393efb43d44ece17 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Thu, 17 Nov 2022 00:11:51 +0100
Subject: [PATCH 01/15] [test/test_utils] refactor
 test_get_element_text_and_html_by_tag()

---
 test/test_utils.py | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/test/test_utils.py b/test/test_utils.py
index 3045b6d7e1..334423619a 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,6 +4,7 @@
 import os
 import re
 import sys
+import textwrap
 import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -1768,32 +1769,31 @@ def test_get_elements_text_and_html_by_attribute(self):
         self.assertEqual(list(get_elements_text_and_html_by_attribute(
             'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')])
 
-    GET_ELEMENT_BY_TAG_TEST_STRING = '''
-    random text lorem ipsum</p>
-    <div>
-        this should be returned
-        <span>this should also be returned</span>
-        <div>
-            this should also be returned
-        </div>
-        closing tag above should not trick, so this should also be returned
-    </div>
-    but this text should not be returned
-    '''
-    GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
-    GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
-    GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
-    GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
-
     def test_get_element_text_and_html_by_tag(self):
-        html = self.GET_ELEMENT_BY_TAG_TEST_STRING
+        get_element_by_tag_test_string = '''
+        random text lorem ipsum</p>
+        <div>
+            this should be returned
+            <span>this should also be returned</span>
+            <div>
+                this should also be returned
+            </div>
+            closing tag above should not trick, so this should also be returned
+        </div>
+        but this text should not be returned
+        '''
+        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
+        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
+        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
+        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
+        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
 
         self.assertEqual(
             get_element_text_and_html_by_tag('div', html),
-            (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
+            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
         self.assertEqual(
             get_element_text_and_html_by_tag('span', html),
-            (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
+            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
         self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
 
     def test_iri_to_uri(self):

From af03fa454299de0a39fb31e257e02d269f7ef6b2 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Thu, 17 Nov 2022 01:20:25 +0100
Subject: [PATCH 02/15] [utils] more forgiving html parsing + unit tests

---
 test/test_utils.py | 19 +++++++++++++++++++
 yt_dlp/utils.py    | 12 +++++-------
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index 334423619a..022e821a6b 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1796,6 +1796,25 @@ def test_get_element_text_and_html_by_tag(self):
             (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
         self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
 
+    def test_get_element_text_and_html_by_tag_malformed(self):
+        inner_text = 'inner_text'
+        malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
+        html = f'<div>{malnested_elements}</div>'
+
+        self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('malnested_a', html),
+            (f'<malnested_b>{inner_text}',
+             f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('malnested_b', html),
+            (f'{inner_text}</malnested_a>',
+             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
+        self.assertRaises(
+            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
+        self.assertRaises(
+            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
+
     def test_iri_to_uri(self):
         self.assertEqual(
             iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 8c2c5593cc..de058b0e60 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -466,17 +466,13 @@ def close(self):
         pass
 
     def handle_starttag(self, tag, _):
-        self.tagstack.append(tag)
+        self.tagstack.appendleft(tag)
 
     def handle_endtag(self, tag):
         if not self.tagstack:
             raise compat_HTMLParseError('no tags in the stack')
-        while self.tagstack:
-            inner_tag = self.tagstack.pop()
-            if inner_tag == tag:
-                break
-        else:
-            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
+        with contextlib.suppress(ValueError):
+            self.tagstack.remove(tag)
         if not self.tagstack:
             raise self.HTMLBreakOnClosingTagException()
 
@@ -510,6 +506,8 @@ def find_or_raise(haystack, needle, exc):
             next_closing_tag_end = next_closing_tag_start + len(closing_tag)
             try:
                 parser.feed(html[offset:offset + next_closing_tag_end])
+                if tag not in parser.tagstack:
+                    raise HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException()
                 offset += next_closing_tag_end
             except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                 return html[content_start:offset + next_closing_tag_start], \

From 5e3894df3fa043b1cd7bc731f5e5954bc17295e2 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Tue, 22 Nov 2022 14:07:14 +0100
Subject: [PATCH 03/15] [parsing] add new module containing various HTML parser
 classes as replacement for the utils.get_element(s)_... functions

* compared to the utils functions, performance is mostly better for large HTML data and on PyPy
---
 test/test_utils.py |  73 ++++++++++++++-
 yt_dlp/parsing.py  | 219 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 288 insertions(+), 4 deletions(-)
 create mode 100644 yt_dlp/parsing.py

diff --git a/test/test_utils.py b/test/test_utils.py
index 022e821a6b..d9a62258c5 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -21,6 +21,14 @@
     compat_HTMLParseError,
     compat_os_name,
 )
+from yt_dlp.parsing import (
+    HTMLTagParser,
+    FirstMatchingElementParser,
+)
+
+# some testcases don't work with current functions
+get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
+
 from yt_dlp.utils import (
     Config,
     DateRange,
@@ -60,7 +68,6 @@
     get_element_by_class,
     get_element_html_by_attribute,
     get_element_html_by_class,
-    get_element_text_and_html_by_tag,
     get_elements_by_attribute,
     get_elements_by_class,
     get_elements_html_by_attribute,
@@ -1797,11 +1804,14 @@ def test_get_element_text_and_html_by_tag(self):
         self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
 
     def test_get_element_text_and_html_by_tag_malformed(self):
-        inner_text = 'inner_text'
+        inner_text = 'inner text'
         malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
-        html = f'<div>{malnested_elements}</div>'
+        commented_html = '<!--<div>inner comment</div>-->'
+        outerdiv_html = f'<div>{malnested_elements}</div>'
+        html = f'{commented_html}{outerdiv_html}'
 
-        self.assertEqual(get_element_text_and_html_by_tag('div', html), (malnested_elements, html))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
         self.assertEqual(
             get_element_text_and_html_by_tag('malnested_a', html),
             (f'<malnested_b>{inner_text}',
@@ -1815,6 +1825,61 @@ def test_get_element_text_and_html_by_tag_malformed(self):
         self.assertRaises(
             compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
 
+    def test_strict_html_parsing(self):
+        class StrictTagParser(HTMLTagParser):
+            STRICT = True
+
+        parser = StrictTagParser()
+        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
+            parser.taglist('</p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
+            parser.taglist('<div><p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
+            parser.taglist('<div><p></div></p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
+            parser.taglist('<div><p>/p></div>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
+            parser.taglist('<div><p></p<< </div>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+            parser.taglist('<img>must be empty</img>', reset=True)
+
+    def test_relaxed_html_parsing(self):
+        Tag = HTMLTagParser.Tag
+        parser = HTMLTagParser()
+
+        self.assertEqual(parser.taglist('</p>', reset=True), [])
+        self.assertEqual(parser.taglist('<div><p>', reset=True), [])
+
+        tags = parser.taglist('<div><p></div></p>', reset=True)
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+        tags = parser.taglist('<div><p>/p></div>', reset=True)
+        self.assertEqual(tags, [Tag('div')])
+
+        tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
+        self.assertEqual(tags, [Tag('p'), Tag('div')])
+        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
+
+        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
+        self.assertEqual(tags, [Tag('img')])
+        self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))
+
+    def test_compliant_html_parsing(self):
+        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+        Tag = HTMLTagParser.Tag
+        html = '''
+            no error without closing tag: <img>
+            self closing is ok: <img />
+        '''
+        parser = HTMLTagParser()
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+        # don't get fooled by '>' in attributes
+        html = '''<img greater_a='1>0' greater_b="1>0">'''
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(tags[0].text_and_html(), ('', html))
+
     def test_iri_to_uri(self):
         self.assertEqual(
             iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
new file mode 100644
index 0000000000..d0dcf450a0
--- /dev/null
+++ b/yt_dlp/parsing.py
@@ -0,0 +1,219 @@
+import collections
+import contextlib
+import itertools
+import re
+from html.parser import HTMLParser
+
+from .utils import orderedSet
+
+from .compat import compat_HTMLParseError
+
+
+class HTMLTagParser(HTMLParser):
+    """HTML parser which acts as iterator
+    returns found elements as instances of Tag
+    nested elements will be returned before its parents
+
+    strict=True raises compat_HTMLParseError on malformed html
+
+    two modes of usage:
+        # as an lazy iterator:
+        for tag_obj in HTMLTagParser(html):
+            tag_obj.text_and_html()
+
+        # or return a list with all found tag objects
+        # this is faster by factor 2-5 compared to iteration
+        for tag_obj in HTMLTagParser(html).taglist():
+            tag_obj.text_and_html()
+    """
+
+    STRICT = False
+    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
+    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
+    VOID_TAGS = {
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
+    }
+
+    class Tag:
+        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+
+        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+            self.name = name
+            self.string = string
+            self.start = start
+            self.start_len = 0
+            self.stop = stop
+            self.attrs = tuple(attrs)
+
+        def __str__(self):
+            return self.name
+
+        def __repr__(self):
+            return f'{self.__class__.__name__}({str(self)!r})'
+
+        def __eq__(self, other):
+            return self.name == other
+
+        def html(self):
+            return self.string[self.start:self.stop]
+
+        def text_and_html(self):
+            assert isinstance(self.start, int)
+            if not self.start_len:
+                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:])
+                assert match
+                self.start_len = len(match.group())
+            if self.stop is None:
+                return '', self.string[self.start: self.start + self.start_len]
+            html = self.html()
+            cidx = html.rindex('</')
+            return html[self.start_len:cidx], html
+
+    class EarlyExitException(Exception):
+        pass
+
+    def __init__(self):
+        super().__init__()
+        self.tagstack = collections.deque()
+        self._offset = self.offset
+        self.found_tags = []
+
+    def predicate(self, tag, attrs):
+        return True
+
+    def callback(self, tag_obj):
+        pass
+
+    def abort(self, last_tag=None):
+        if last_tag:
+            self.found_tags.append(last_tag)
+        raise HTMLTagParser.EarlyExitException()
+
+    def taglist(self, data, reset=True):
+        self.found_tags.clear()
+        if reset:
+            self.reset()
+            self.tagstack.clear()
+        with contextlib.suppress(HTMLTagParser.EarlyExitException):
+            self.feed(data)
+        if self.STRICT and self.tagstack:
+            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
+            raise compat_HTMLParseError(f'unclosed tag {orphans}')
+        return self.found_tags
+
+    def updatepos(self, i, j):
+        offset = self._offset = super().updatepos(i, j)
+        return offset
+
+    def handle_starttag(self, tag, attrs):
+        try:
+            # we use internal variable for performance reason
+            tag_text = getattr(self, '_HTMLParser__starttag_text')
+        except AttributeError:
+            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
+        if self.predicate(tag, attrs):
+            obj = self.Tag(
+                tag, string=self.rawdata, start=self._offset, attrs=attrs)
+            obj.start_len = len(tag_text)
+            if tag_text.endswith('/>') or tag in self.VOID_TAGS:
+                if self.callback(obj) is not False:
+                    self.found_tags.append(obj)
+                return
+        else:
+            obj = None
+
+        self.tagstack.appendleft(obj or tag)
+
+    handle_startendtag = handle_starttag
+
+    def handle_endtag(self, tag):
+        if '<' in tag:
+            if self.STRICT:
+                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
+            tag = tag[:tag.index('<')]
+
+        try:
+            idx = self.tagstack.index(tag)
+            if self.STRICT and idx:
+                open_tags = ''.join(f'</{tag}>' for tag in itertools.islice(self.tagstack, idx))
+                raise compat_HTMLParseError(
+                    f'malnested closing tag {tag!r}, expected after {open_tags!r}')
+            tag_obj = self.tagstack[idx]
+            self.tagstack.remove(tag)
+            if not isinstance(tag_obj, str):
+                # since we landed here we'll always find a closing tag
+                match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:])
+                tag_obj.stop = self._offset + match.end()
+                if self.callback(tag_obj) is not False:
+                    self.found_tags.append(tag_obj)
+        except ValueError as exc:
+            if isinstance(exc, compat_HTMLParseError):
+                raise
+            elif self.STRICT:
+                raise compat_HTMLParseError(f'stray closing tag {tag!r}')
+
+
+class ClassParser(HTMLTagParser):
+    def __init__(self, attribute, matchfunc, stop):
+        super().__init__()
+        self.search_attr = attribute
+        self.matchfunc = matchfunc
+        self.stop = stop
+        self.processing = 0
+
+    def predicate(self, tag, attrs):
+        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
+            self.abort()
+        string = dict(attrs).get(self.search_attr, '')
+        if self.matchfunc(string):
+            self.processing += 1
+            return True
+        return False
+
+    def callback(self, tag_obj):
+        if self.stop is None:
+            self.abort(tag_obj)
+        self.processing -= 1
+
+    @classmethod
+    def get_elements_html_by_class(cls, class_name, html):
+        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
+        it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html)
+        start = stop = None
+        for match in it:
+            if start is None:
+                start = match.start()
+            else:
+                stop = match.end()
+        if start is None:
+            return []
+        parser = cls('class', lambda x: regex.match(x), stop)
+        return [tag.html() for tag in parser.taglist(html[start:])]
+
+
+class FirstMatchingElementParser(HTMLTagParser):
+    def __init__(self, matchfunc):
+        super().__init__()
+        self.matchfunc = matchfunc
+        self.found = False
+
+    def predicate(self, tag, attrs):
+        if not self.found and self.matchfunc(tag, attrs):
+            self.found = True
+            return True
+        return False
+
+    def callback(self, obj):
+        self.abort(obj)
+
+    @classmethod
+    def get_element_text_and_html_by_tag(cls, tag, html):
+        """
+        For the first element with the specified tag in the given HTML document
+        return its content (text) and the whole element (html)
+        """
+        parser = cls(lambda _tag, _: _tag == tag)
+        for tag_obj in parser.taglist(html):
+            return tag_obj.text_and_html()
+        raise compat_HTMLParseError(f'tag {tag} not found')

From e092ba9922191886c542972461ec27b1d82a466d Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Tue, 22 Nov 2022 22:37:14 +0100
Subject: [PATCH 04/15] [test] roll back test_utils.py and add related tests to
 test_parsing.py

---
 test/test_parsing.py | 218 +++++++++++++++++++++++++++++++++++++++++++
 test/test_utils.py   | 124 ++++--------------------
 2 files changed, 238 insertions(+), 104 deletions(-)
 create mode 100644 test/test_parsing.py

diff --git a/test/test_parsing.py b/test/test_parsing.py
new file mode 100644
index 0000000000..782a1196df
--- /dev/null
+++ b/test/test_parsing.py
@@ -0,0 +1,218 @@
+import textwrap
+import unittest
+
+from parsing import (
+    FirstMatchingElementParser,
+    HTMLTagParser,
+    MatchingElementParser,
+)
+
+from yt_dlp.compat import compat_HTMLParseError
+
+get_element_by_attribute = FirstMatchingElementParser
+get_element_by_class = FirstMatchingElementParser
+get_element_html_by_attribute = FirstMatchingElementParser
+get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
+get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
+get_elements_by_attribute = MatchingElementParser
+get_elements_by_class = MatchingElementParser
+get_elements_html_by_attribute = MatchingElementParser
+get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
+get_elements_text_and_html_by_attribute = MatchingElementParser
+
+
+class TestParsing(unittest.TestCase):
+    GET_ELEMENT_BY_CLASS_TEST_STRING = '''
+        <span class="foo bar">nice</span>
+    '''
+
+    def test_get_element_by_class(self):
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+    def test_get_element_html_by_class(self):
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+        self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+    GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+        <div itemprop="author" itemscope>foo</div>
+    '''
+
+    def test_get_element_by_attribute(self):
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
+        self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
+        self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+
+        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+        self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+
+    def test_get_element_html_by_attribute(self):
+        html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+        self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
+        self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
+
+        html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+        self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+
+    GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
+        <span class="foo bar">nice</span>
+        <span class="foo bar">also nice</span>
+    '''
+    GET_ELEMENTS_BY_CLASS_RES = [
+        '<span class="foo bar">nice</span>',
+        '<span class="foo bar">also nice</span>'
+    ]
+
+    def test_get_elements_by_class(self):
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
+        self.assertEqual(get_elements_by_class('no-such-class', html), [])
+
+    def test_get_elements_html_by_class(self):
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
+        self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
+
+    def test_get_elements_by_attribute(self):
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
+        self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
+        self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+
+    def test_get_elements_html_by_attribute(self):
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+        self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html),
+                         self.GET_ELEMENTS_BY_CLASS_RES)
+        self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
+        self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+
+    def test_get_elements_text_and_html_by_attribute(self):
+        html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+        self.assertEqual(
+            get_elements_text_and_html_by_attribute('class', 'foo bar', html),
+            list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
+        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
+        self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
+
+        self.assertEqual(get_elements_text_and_html_by_attribute(
+            'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a'),
+            [('nice', '<a class="foo">nice</a>')])
+
+    def test_get_element_text_and_html_by_tag(self):
+        get_element_by_tag_test_string = '''
+        random text lorem ipsum</p>
+        <div>
+            this should be returned
+            <span>this should also be returned</span>
+            <div>
+                this should also be returned
+            </div>
+            closing tag above should not trick, so this should also be returned
+        </div>
+        but this text should not be returned
+        '''
+        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
+        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
+        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
+        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
+        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
+
+        self.assertEqual(
+            get_element_text_and_html_by_tag('div', html),
+            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('span', html),
+            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
+        self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+
+    def test_get_element_text_and_html_by_tag_malformed(self):
+        inner_text = 'inner text'
+        malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
+        commented_html = '<!--<div>inner comment</div>-->'
+        outerdiv_html = f'<div>{malnested_elements}</div>'
+        html = f'{commented_html}{outerdiv_html}'
+
+        self.assertEqual(
+            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('malnested_a', html),
+            (f'<malnested_b>{inner_text}',
+             f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('malnested_b', html),
+            (f'{inner_text}</malnested_a>',
+             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
+        self.assertRaises(
+            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
+        self.assertRaises(
+            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
+
+    def test_strict_html_parsing(self):
+        class StrictTagParser(HTMLTagParser):
+            STRICT = True
+
+        parser = StrictTagParser()
+        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
+            parser.taglist('</p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
+            parser.taglist('<div><p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
+            parser.taglist('<div><p></div></p>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
+            parser.taglist('<div><p>/p></div>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
+            parser.taglist('<div><p></p<< </div>', reset=True)
+        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+            parser.taglist('<img>must be empty</img>', reset=True)
+
+    def test_relaxed_html_parsing(self):
+        Tag = HTMLTagParser.Tag
+        parser = HTMLTagParser()
+
+        self.assertEqual(parser.taglist('</p>', reset=True), [])
+        self.assertEqual(parser.taglist('<div><p>', reset=True), [])
+
+        tags = parser.taglist('<div><p></div></p>', reset=True)
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+        tags = parser.taglist('<div><p>/p></div>', reset=True)
+        self.assertEqual(tags, [Tag('div')])
+
+        tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
+        self.assertEqual(tags, [Tag('p'), Tag('div')])
+        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
+
+        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
+        self.assertEqual(tags, [Tag('img')])
+        self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))
+
+    def test_compliant_html_parsing(self):
+        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+        Tag = HTMLTagParser.Tag
+        html = '''
+            no error without closing tag: <img>
+            self closing is ok: <img />
+        '''
+        parser = HTMLTagParser()
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+        # don't get fooled by '>' in attributes
+        html = '''<img greater_a='1>0' greater_b="1>0">'''
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(tags[0].text_and_html(), ('', html))
diff --git a/test/test_utils.py b/test/test_utils.py
index d9a62258c5..3045b6d7e1 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,7 +4,6 @@
 import os
 import re
 import sys
-import textwrap
 import unittest
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -21,14 +20,6 @@
     compat_HTMLParseError,
     compat_os_name,
 )
-from yt_dlp.parsing import (
-    HTMLTagParser,
-    FirstMatchingElementParser,
-)
-
-# some testcases don't work with current functions
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-
 from yt_dlp.utils import (
     Config,
     DateRange,
@@ -68,6 +59,7 @@
     get_element_by_class,
     get_element_html_by_attribute,
     get_element_html_by_class,
+    get_element_text_and_html_by_tag,
     get_elements_by_attribute,
     get_elements_by_class,
     get_elements_html_by_attribute,
@@ -1776,110 +1768,34 @@ def test_get_elements_text_and_html_by_attribute(self):
         self.assertEqual(list(get_elements_text_and_html_by_attribute(
             'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a')), [('nice', '<a class="foo">nice</a>')])
 
-    def test_get_element_text_and_html_by_tag(self):
-        get_element_by_tag_test_string = '''
-        random text lorem ipsum</p>
+    GET_ELEMENT_BY_TAG_TEST_STRING = '''
+    random text lorem ipsum</p>
+    <div>
+        this should be returned
+        <span>this should also be returned</span>
         <div>
-            this should be returned
-            <span>this should also be returned</span>
-            <div>
-                this should also be returned
-            </div>
-            closing tag above should not trick, so this should also be returned
+            this should also be returned
         </div>
-        but this text should not be returned
-        '''
-        html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
-        get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
-        get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
-        get_element_by_tag_res_innerspan_html = html.strip()[78:119]
-        get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
+        closing tag above should not trick, so this should also be returned
+    </div>
+    but this text should not be returned
+    '''
+    GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+    GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+    GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+    GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+    def test_get_element_text_and_html_by_tag(self):
+        html = self.GET_ELEMENT_BY_TAG_TEST_STRING
 
         self.assertEqual(
             get_element_text_and_html_by_tag('div', html),
-            (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
+            (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
         self.assertEqual(
             get_element_text_and_html_by_tag('span', html),
-            (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
+            (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
         self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
 
-    def test_get_element_text_and_html_by_tag_malformed(self):
-        inner_text = 'inner text'
-        malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
-        commented_html = '<!--<div>inner comment</div>-->'
-        outerdiv_html = f'<div>{malnested_elements}</div>'
-        html = f'{commented_html}{outerdiv_html}'
-
-        self.assertEqual(
-            get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
-        self.assertEqual(
-            get_element_text_and_html_by_tag('malnested_a', html),
-            (f'<malnested_b>{inner_text}',
-             f'<malnested_a><malnested_b>{inner_text}</malnested_a>'))
-        self.assertEqual(
-            get_element_text_and_html_by_tag('malnested_b', html),
-            (f'{inner_text}</malnested_a>',
-             f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
-
-    def test_strict_html_parsing(self):
-        class StrictTagParser(HTMLTagParser):
-            STRICT = True
-
-        parser = StrictTagParser()
-        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
-            parser.taglist('</p>', reset=True)
-        with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
-            parser.taglist('<div><p>', reset=True)
-        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
-            parser.taglist('<div><p></div></p>', reset=True)
-        with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '</p>'"):
-            parser.taglist('<div><p>/p></div>', reset=True)
-        with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
-            parser.taglist('<div><p></p<< </div>', reset=True)
-        with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
-            parser.taglist('<img>must be empty</img>', reset=True)
-
-    def test_relaxed_html_parsing(self):
-        Tag = HTMLTagParser.Tag
-        parser = HTMLTagParser()
-
-        self.assertEqual(parser.taglist('</p>', reset=True), [])
-        self.assertEqual(parser.taglist('<div><p>', reset=True), [])
-
-        tags = parser.taglist('<div><p></div></p>', reset=True)
-        self.assertEqual(tags, [Tag('div'), Tag('p')])
-
-        tags = parser.taglist('<div><p>/p></div>', reset=True)
-        self.assertEqual(tags, [Tag('div')])
-
-        tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
-        self.assertEqual(tags, [Tag('p'), Tag('div')])
-        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
-
-        tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
-        self.assertEqual(tags, [Tag('img')])
-        self.assertEqual(tags[0].text_and_html(), ('', '<img width="300px">'))
-
-    def test_compliant_html_parsing(self):
-        # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
-        Tag = HTMLTagParser.Tag
-        html = '''
-            no error without closing tag: <img>
-            self closing is ok: <img />
-        '''
-        parser = HTMLTagParser()
-        tags = parser.taglist(html, reset=True)
-        self.assertEqual(tags, [Tag('img'), Tag('img')])
-
-        # don't get fooled by '>' in attributes
-        html = '''<img greater_a='1>0' greater_b="1>0">'''
-        tags = parser.taglist(html, reset=True)
-        self.assertEqual(tags[0].text_and_html(), ('', html))
-
     def test_iri_to_uri(self):
         self.assertEqual(
             iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),

From 176a156c651defe95f4ed6714ddf47d599ecef50 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Tue, 22 Nov 2022 19:58:06 +0100
Subject: [PATCH 05/15] [parsing] rework interface, implement all
 get_element(s) functions + extract_attributes() as MatchingElementParser
 class methods and improve performance

---
 test/test_parsing.py | 168 +++++++++++++++----
 yt_dlp/parsing.py    | 373 ++++++++++++++++++++++++++++++-------------
 2 files changed, 399 insertions(+), 142 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 782a1196df..75ed8ebf34 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -1,29 +1,71 @@
 import textwrap
 import unittest
 
-from parsing import (
-    FirstMatchingElementParser,
-    HTMLTagParser,
+from yt_dlp.compat import compat_HTMLParseError
+from yt_dlp.parsing import (
     MatchingElementParser,
+    HTMLCommentRanges,
+    HTMLTagParser,
 )
 
-from yt_dlp.compat import compat_HTMLParseError
-
-get_element_by_attribute = FirstMatchingElementParser
-get_element_by_class = FirstMatchingElementParser
-get_element_html_by_attribute = FirstMatchingElementParser
-get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-get_elements_by_attribute = MatchingElementParser
-get_elements_by_class = MatchingElementParser
-get_elements_html_by_attribute = MatchingElementParser
-get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
-get_elements_text_and_html_by_attribute = MatchingElementParser
+extract_attributes = MatchingElementParser.extract_attributes
+get_element_by_attribute = MatchingElementParser.get_element_by_attribute
+get_element_by_class = MatchingElementParser.get_element_by_class
+get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
+get_element_html_by_class = MatchingElementParser.get_element_html_by_class
+get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
+get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
+get_elements_by_class = MatchingElementParser.get_elements_by_class
+get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
+get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
+get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
+get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag
 
 
 class TestParsing(unittest.TestCase):
+    def test_extract_attributes(self):
+        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
+        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
+        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
+        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
+        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
+        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
+        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
+        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x >'), {'x': None})
+        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
+        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
+        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
+        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
+        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
+        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
+        self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
+        # "Narrow" Python builds don't support unicode code points outside BMP.
+        try:
+            chr(0x10000)
+            supports_outside_bmp = True
+        except ValueError:
+            supports_outside_bmp = False
+        if supports_outside_bmp:
+            self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+        # Malformed HTML should not break attributes extraction on older Python
+        self.assertEqual(extract_attributes('<mal"formed/>'), {})
+
     GET_ELEMENT_BY_CLASS_TEST_STRING = '''
         <span class="foo bar">nice</span>
+        <div class="foo bar">also nice</div>
     '''
 
     def test_get_element_by_class(self):
@@ -35,7 +77,8 @@ def test_get_element_by_class(self):
     def test_get_element_html_by_class(self):
         html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
 
-        self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+        self.assertEqual(get_element_html_by_class('foo', html),
+                         '<span class="foo bar">nice</span>')
         self.assertEqual(get_element_by_class('no-such-class', html), None)
 
     GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
@@ -48,6 +91,7 @@ def test_get_element_by_attribute(self):
         self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
         self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
         self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+        self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')
 
         html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
 
@@ -56,7 +100,8 @@ def test_get_element_by_attribute(self):
     def test_get_element_html_by_attribute(self):
         html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
 
-        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+        self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
+                         '<span class="foo bar">nice</span>')
         self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
         self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
 
@@ -110,7 +155,7 @@ def test_get_elements_text_and_html_by_attribute(self):
         self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
 
         self.assertEqual(get_elements_text_and_html_by_attribute(
-            'class', 'foo', '<a class="foo">nice</a><span class="foo">nice</span>', tag='a'),
+            'class', 'foo', '<a class="foo">nice</a><span class="foo">not nice</span>', tag='a'),
             [('nice', '<a class="foo">nice</a>')])
 
     def test_get_element_text_and_html_by_tag(self):
@@ -138,7 +183,16 @@ def test_get_element_text_and_html_by_tag(self):
         self.assertEqual(
             get_element_text_and_html_by_tag('span', html),
             (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
-        self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+        self.assertIsNone(get_element_text_and_html_by_tag('article', html))
+
+    def test_get_elements_text_and_html_by_tag(self):
+        test_string = '''
+            <img src="a.png">
+            <img src="b.png" />
+            <span>ignore</span>
+        '''
+        items = get_elements_text_and_html_by_tag('img', test_string)
+        self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
 
     def test_get_element_text_and_html_by_tag_malformed(self):
         inner_text = 'inner text'
@@ -157,10 +211,8 @@ def test_get_element_text_and_html_by_tag_malformed(self):
             get_element_text_and_html_by_tag('malnested_b', html),
             (f'{inner_text}</malnested_a>',
              f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}</orphan>')
-        self.assertRaises(
-            compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'<orphan>{html}')
+        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
+        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))
 
     def test_strict_html_parsing(self):
         class StrictTagParser(HTMLTagParser):
@@ -188,14 +240,14 @@ def test_relaxed_html_parsing(self):
         self.assertEqual(parser.taglist('<div><p>', reset=True), [])
 
         tags = parser.taglist('<div><p></div></p>', reset=True)
-        self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags, [Tag('p'), Tag('div')])
 
         tags = parser.taglist('<div><p>/p></div>', reset=True)
         self.assertEqual(tags, [Tag('div')])
 
-        tags = parser.taglist('<div><p>paragraph</p<ignored /></div>', reset=True)
-        self.assertEqual(tags, [Tag('p'), Tag('div')])
-        self.assertEqual(tags[0].text_and_html(), ('paragraph', '<p>paragraph</p'))
+        tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))
 
         tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
         self.assertEqual(tags, [Tag('img')])
@@ -216,3 +268,65 @@ def test_compliant_html_parsing(self):
         html = '''<img greater_a='1>0' greater_b="1>0">'''
         tags = parser.taglist(html, reset=True)
         self.assertEqual(tags[0].text_and_html(), ('', html))
+
+    def test_tag_return_order(self):
+        Tag = HTMLTagParser.Tag
+        html = '''
+        <t0>
+            <t1>
+                <t2>
+                    <t3 /> <t4 />
+                </t2>
+            </t1>
+            <t5>
+                <t6 />
+            </t5>
+        </t0>
+        <t7>
+            <t8 />
+        </t7>
+        '''
+        parser = HTMLTagParser()
+        tags = parser.taglist(html, reset=True)
+        self.assertEqual(
+            str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
+                            Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
+
+        tags = parser.taglist(html, reset=True, depth_first=True)
+        self.assertEqual(
+            str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
+                            Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
+
+        # return tags in nested order
+        tags = parser.taglist(html, reset=True, depth_first=None)
+        self.assertEqual(
+            str(tags), str([
+                [Tag('t0'),
+                 [Tag('t1'),
+                  [Tag('t2'), Tag('t3'), Tag('t4')]],
+                 [Tag('t5'), Tag('t6')]],
+                [Tag('t7'), Tag('t8')]]))
+
+    def test_within_html_comment(self):
+        def mark_comments(_string, char='^', nochar='-'):
+            cmts = HTMLCommentRanges(_string)
+            return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
+
+        html_string = '''
+        no              comments         in            this              line
+        ---------------------------------------------------------------------
+        <!--                 whole line represents a comment              -->
+        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        before <!--                      comment                  -->   after
+        -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+        here   is   <!-- a comment -->   and   <!-- another comment -->   end
+        ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
+        this <!-- nested  <!--     comment    -->  ends here --> and not here
+        -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
+        stray --> comment closings --> are ignored <!-- but not <!-- openings
+        -------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
+        '''
+
+        lines = textwrap.dedent(html_string).strip().splitlines()
+        for line, marker in zip(lines[0::2], lines[1::2]):
+            self.assertEqual((line, mark_comments(line)), (line, marker))
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index d0dcf450a0..bcc48c4d3e 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -4,47 +4,89 @@
 import re
 from html.parser import HTMLParser
 
+from .compat import compat_HTMLParseError
 from .utils import orderedSet
 
-from .compat import compat_HTMLParseError
+
+def iter_find(string, sub: str):
+    size = len(sub)
+    idx = -size
+    while True:
+        idx = string.find(sub, idx + size)
+        if idx == -1:
+            return
+        yield idx
+
+
+class HTMLCommentRanges:
+    """computes the offsets of HTML comments
+
+    comments start with '<!--' and end with the first '-->' encountered
+    note: markers within quotes are not ignored
+    """
+
+    def __init__(self, html):
+        self._range_iter = self.ranges(html)
+        self._range = next(self._range_iter, None)
+        self._last_offset = 0
+
+    @staticmethod
+    def ranges(string, sopen='<!--', sclose='-->'):
+        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
+        open_iter = iter_find(string, sopen)
+        close_len = len(sclose)
+        close_iter = (idx + close_len for idx in iter_find(string, sclose))
+        next_open = next(open_iter, None)
+        next_close = next(close_iter, None)
+
+        while True:
+            if next_open is None:
+                return
+            while next_close is not None and next_open > next_close:
+                next_close = next(close_iter, None)
+            yield slice(next_open, next_close)
+            if next_close is None:
+                return
+            while next_open is not None and next_open < next_close:
+                next_open = next(open_iter, None)
+
+    def __contains__(self, offset):
+        assert isinstance(offset, int)
+        assert offset >= self._last_offset, 'offset must be in increasing order'
+        self._last_offset = offset
+        while self._range and self._range.stop is not None and offset >= self._range.stop:
+            self._range = next(self._range_iter, None)
+
+        return not (self._range is None or offset < self._range.start)
 
 
 class HTMLTagParser(HTMLParser):
-    """HTML parser which acts as iterator
-    returns found elements as instances of Tag
-    nested elements will be returned before its parents
+    """HTML parser which returns found elements as instances of 'Tag'
+    when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
 
-    strict=True raises compat_HTMLParseError on malformed html
-
-    two modes of usage:
-        # as an lazy iterator:
-        for tag_obj in HTMLTagParser(html):
+    usage:
+        parser = HTMLTagParser()
+        for tag_obj in parser.taglist(html):
             tag_obj.text_and_html()
 
-        # or return a list with all found tag objects
-        # this is faster by factor 2-5 compared to iteration
-        for tag_obj in HTMLTagParser(html).taglist():
-            tag_obj.text_and_html()
     """
 
     STRICT = False
     ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
-    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
     VOID_TAGS = {
         'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
         'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
     }
 
     class Tag:
-        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+        __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'
 
-        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+        def __init__(self, name, *, string='', attrs=()):
             self.name = name
             self.string = string
-            self.start = start
-            self.start_len = 0
-            self.stop = stop
             self.attrs = tuple(attrs)
+            self._openrange = None
+            self._closerange = None
 
         def __str__(self):
             return self.name
@@ -55,52 +97,81 @@ def __repr__(self):
         def __eq__(self, other):
             return self.name == other
 
+        def openrange(self, offset, startlen=0):
+            if isinstance(offset, slice):
+                self._openrange = offset
+            else:
+                self._openrange = slice(offset, offset + startlen)
+
+        def closerange(self, offset, stoplen=0):
+            if isinstance(offset, slice):
+                self._closerange = offset
+            else:
+                self._closerange = slice(offset, offset + stoplen)
+
+        def opentag(self):
+            return self.string[self._openrange] if self._openrange else ''
+
         def html(self):
-            return self.string[self.start:self.stop]
+            if not self._openrange:
+                return ''
+            if self._closerange:
+                return self.string[self._openrange.start:self._closerange.stop]
+            return self.string[self._openrange]
+
+        def text(self):
+            if self._openrange and self._closerange:
+                return self.string[self._openrange.stop:self._closerange.start]
+            return ''
 
         def text_and_html(self):
-            assert isinstance(self.start, int)
-            if not self.start_len:
-                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:])
-                assert match
-                self.start_len = len(match.group())
-            if self.stop is None:
-                return '', self.string[self.start: self.start + self.start_len]
-            html = self.html()
-            cidx = html.rindex('</')
-            return html[self.start_len:cidx], html
+            return self.text(), self.html()
 
-    class EarlyExitException(Exception):
+    class AbortException(Exception):
         pass
 
     def __init__(self):
-        super().__init__()
         self.tagstack = collections.deque()
+        self._nestedtags = [[]]
+        super().__init__()
         self._offset = self.offset
-        self.found_tags = []
 
     def predicate(self, tag, attrs):
+        """ return True for every encountered opening tag that should be processed """
         return True
 
     def callback(self, tag_obj):
-        pass
+        """ this will be called when the requested tag is closed """
 
-    def abort(self, last_tag=None):
-        if last_tag:
-            self.found_tags.append(last_tag)
-        raise HTMLTagParser.EarlyExitException()
+    def reset(self):
+        super().reset()
+        self.tagstack.clear()
+
+    def taglist(self, data, reset=True, depth_first=False):
+        """ parse data and return found tag objects
+        @param data:    html string
+        @param reset:   reset state
+        @param depth_first: return order: as opened (False), as closed (True), nested (None)
+        @return: list of Tag objects
+        """
+        def flatten(_list, first=True):
+            rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
+            for item in rlist:
+                if isinstance(item, list):
+                    yield from flatten(item, first=False)
+                else:
+                    yield item
 
-    def taglist(self, data, reset=True):
-        self.found_tags.clear()
         if reset:
             self.reset()
-            self.tagstack.clear()
-        with contextlib.suppress(HTMLTagParser.EarlyExitException):
+        with contextlib.suppress(HTMLTagParser.AbortException):
             self.feed(data)
         if self.STRICT and self.tagstack:
             orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
             raise compat_HTMLParseError(f'unclosed tag {orphans}')
-        return self.found_tags
+        taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
+        self._nestedtags = [[]]
+        return taglist
 
     def updatepos(self, i, j):
         offset = self._offset = super().updatepos(i, j)
@@ -108,22 +179,23 @@ def updatepos(self, i, j):
 
     def handle_starttag(self, tag, attrs):
         try:
-            # we use internal variable for performance reason
+            # we use internal variable for performance reasons
             tag_text = getattr(self, '_HTMLParser__starttag_text')
         except AttributeError:
             tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
-        if self.predicate(tag, attrs):
-            obj = self.Tag(
-                tag, string=self.rawdata, start=self._offset, attrs=attrs)
-            obj.start_len = len(tag_text)
-            if tag_text.endswith('/>') or tag in self.VOID_TAGS:
-                if self.callback(obj) is not False:
-                    self.found_tags.append(obj)
-                return
-        else:
-            obj = None
 
-        self.tagstack.appendleft(obj or tag)
+        tag_obj = tag
+        if self.predicate(tag, attrs):
+            tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
+            tag_obj.openrange(self._offset, len(tag_text))
+            if tag_text.endswith('/>') or tag in self.VOID_TAGS:
+                self._nestedtags[-1].append(tag_obj)
+                self.callback(tag_obj)
+                return
+            nesting = []
+            self._nestedtags[-1].append(nesting)
+            self._nestedtags.append(nesting)
+        self.tagstack.appendleft(tag_obj)
 
     handle_startendtag = handle_starttag
 
@@ -141,79 +213,150 @@ def handle_endtag(self, tag):
                     f'malnested closing tag {tag!r}, expected after {open_tags!r}')
             tag_obj = self.tagstack[idx]
             self.tagstack.remove(tag)
-            if not isinstance(tag_obj, str):
-                # since we landed here we'll always find a closing tag
-                match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:])
-                tag_obj.stop = self._offset + match.end()
-                if self.callback(tag_obj) is not False:
-                    self.found_tags.append(tag_obj)
+            if isinstance(tag_obj, self.Tag):
+                close_idx = self.rawdata.find('>', self._offset) + 1
+                tag_obj.closerange(self._offset, close_idx - self._offset)
+                self._nestedtags.pop().insert(0, tag_obj)
+                self.callback(tag_obj)
         except ValueError as exc:
             if isinstance(exc, compat_HTMLParseError):
                 raise
-            elif self.STRICT:
-                raise compat_HTMLParseError(f'stray closing tag {tag!r}')
+            if self.STRICT:
+                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
 
 
-class ClassParser(HTMLTagParser):
-    def __init__(self, attribute, matchfunc, stop):
-        super().__init__()
-        self.search_attr = attribute
-        self.matchfunc = matchfunc
-        self.stop = stop
-        self.processing = 0
-
-    def predicate(self, tag, attrs):
-        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
-            self.abort()
-        string = dict(attrs).get(self.search_attr, '')
-        if self.matchfunc(string):
-            self.processing += 1
-            return True
-        return False
-
-    def callback(self, tag_obj):
-        if self.stop is None:
-            self.abort(tag_obj)
-        self.processing -= 1
-
-    @classmethod
-    def get_elements_html_by_class(cls, class_name, html):
-        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
-        it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html)
-        start = stop = None
-        for match in it:
-            if start is None:
-                start = match.start()
-            else:
-                stop = match.end()
-        if start is None:
-            return []
-        parser = cls('class', lambda x: regex.match(x), stop)
-        return [tag.html() for tag in parser.taglist(html[start:])]
-
-
-class FirstMatchingElementParser(HTMLTagParser):
+class MatchingElementParser(HTMLTagParser):
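+    """ HTMLTagParser variant that accepts only the first tag matched by `matchfunc`
+    and raises AbortException from its callback once that tag is complete """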
     def __init__(self, matchfunc):
         super().__init__()
         self.matchfunc = matchfunc
-        self.found = False
+        self.found_none = True
+
+    def reset(self):
+        super().reset()
+        self.found_none = True
+
+    def callback(self, tag_obj):
+        raise self.AbortException()
 
     def predicate(self, tag, attrs):
-        if not self.found and self.matchfunc(tag, attrs):
-            self.found = True
+        if self.found_none and self.matchfunc(tag, attrs):
+            self.found_none = False
             return True
         return False
 
-    def callback(self, obj):
-        self.abort(obj)
+    @staticmethod
+    def class_value_regex(class_name):
+        return rf'[\w\s\-]*(?<![\w\-]){re.escape(class_name)}(?![\w\-])[\w\s\-]*'
+
+    @staticmethod
+    def matching_tag_regex(tag, attribute, value_regex, escape=True):
+        if isinstance(value_regex, re.Pattern):
+            value_regex = value_regex.pattern
+        elif escape:
+            value_regex = re.escape(value_regex)
+
+        return rf'''(?x)
+            <(?:{tag})
+             (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+             \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
+            '''
+
+    @classmethod
+    def iter_tags(cls, regex, html, *, matchfunc):
+        comments = HTMLCommentRanges(html)
+        parser = cls(matchfunc)
+        for match in re.finditer(regex, html):
+            if match.start() not in comments:
+                yield from parser.taglist(html[match.start():], reset=True)
+
+    @classmethod
+    def tags_by_name(cls, tag, html):
+        def matchfunc(tag_str, _attrs):
+            return tag_str == tag
+
+        yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc)
+
+    @classmethod
+    def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
+        def matchfunc(_tag_str, attrs):
+            return any(attr == attribute and re.fullmatch(value, value_str)
+                       for attr, value_str in attrs)
+
+        tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
+        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
+
+    @classmethod
+    def extract_attributes(cls, html):
+        attr_dict = {}
+
+        def matchfunc(_tag, attrs):
+            attr_dict.update(attrs)
+            raise cls.AbortException()
+
+        with contextlib.suppress(cls.AbortException):
+            cls(matchfunc).feed(html)
+
+        return attr_dict
+
+    @classmethod
+    def get_elements_text_and_html_by_tag(cls, tag, html):
+        return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]
 
     @classmethod
     def get_element_text_and_html_by_tag(cls, tag, html):
-        """
-        For the first element with the specified tag in the given HTML document
-        return its content (text) and the whole element (html)
-        """
-        parser = cls(lambda _tag, _: _tag == tag)
-        for tag_obj in parser.taglist(html):
-            return tag_obj.text_and_html()
-        raise compat_HTMLParseError(f'tag {tag} not found')
+        tag = next(cls.tags_by_name(tag, html), None)
+        return tag and tag.text_and_html()
+
+    @classmethod
+    def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
+        return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+    @classmethod
+    def get_elements_by_attribute(cls, *args, **kwargs):
+        return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+    @classmethod
+    def get_elements_html_by_attribute(cls, *args, **kwargs):
+        return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+    @classmethod
+    def get_element_by_attribute(cls, *args, **kwargs):
+        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+        return tag and tag.text()
+
+    @classmethod
+    def get_element_html_by_attribute(cls, *args, **kwargs):
+        tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+        return tag and tag.html()
+
+    @classmethod
+    def get_elements_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        return [tag.text() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_elements_html_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        return [tag.html() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_elements_text_and_html_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        return [tag.text() for tag
+                in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+    @classmethod
+    def get_element_html_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+        return tag and tag.html()
+
+    @classmethod
+    def get_element_by_class(cls, class_name, html):
+        value = cls.class_value_regex(class_name)
+        tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+        return tag and tag.text()

From 8451074b501f51cb66c4d5463260320763b9ff69 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Sun, 27 Nov 2022 16:22:03 +0100
Subject: [PATCH 06/15] [parsing] fix: don't push unmatched void tags onto
 queue

---
 test/test_parsing.py |  7 +++++++
 yt_dlp/parsing.py    | 14 ++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 75ed8ebf34..880c41a348 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -186,6 +186,9 @@ def test_get_element_text_and_html_by_tag(self):
         self.assertIsNone(get_element_text_and_html_by_tag('article', html))
 
     def test_get_elements_text_and_html_by_tag(self):
+        class StrictParser(MatchingElementParser):
+            STRICT = True
+
         test_string = '''
             <img src="a.png">
             <img src="b.png" />
@@ -194,6 +197,10 @@ def test_get_elements_text_and_html_by_tag(self):
         items = get_elements_text_and_html_by_tag('img', test_string)
         self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
 
+        self.assertEqual(
+            StrictParser.get_element_text_and_html_by_tag('use', '<use><img></use>'),
+            ('<img>', '<use><img></use>'))
+
     def test_get_element_text_and_html_by_tag_malformed(self):
         inner_text = 'inner text'
         malnested_elements = f'<malnested_a><malnested_b>{inner_text}</malnested_a></malnested_b>'
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index bcc48c4d3e..8fbb4db14b 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -185,17 +185,19 @@ def handle_starttag(self, tag, attrs):
             tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
 
         tag_obj = tag
+        tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
         if self.predicate(tag, attrs):
             tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
             tag_obj.openrange(self._offset, len(tag_text))
-            if tag_text.endswith('/>') or tag in self.VOID_TAGS:
+            if tag_is_open:
+                nesting = []
+                self._nestedtags[-1].append(nesting)
+                self._nestedtags.append(nesting)
+            else:
                 self._nestedtags[-1].append(tag_obj)
                 self.callback(tag_obj)
-                return
-            nesting = []
-            self._nestedtags[-1].append(nesting)
-            self._nestedtags.append(nesting)
-        self.tagstack.appendleft(tag_obj)
+        if tag_is_open:
+            self.tagstack.appendleft(tag_obj)
 
     handle_startendtag = handle_starttag
 

From dbf350c12291279c0be56cb82922c2fae1c87eb2 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Sun, 27 Nov 2022 16:34:06 +0100
Subject: [PATCH 07/15] [parsing] return unclosed matched tags

---
 test/test_parsing.py | 9 +++++----
 yt_dlp/parsing.py    | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 880c41a348..5887115185 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -218,8 +218,9 @@ def test_get_element_text_and_html_by_tag_malformed(self):
             get_element_text_and_html_by_tag('malnested_b', html),
             (f'{inner_text}</malnested_a>',
              f'<malnested_b>{inner_text}</malnested_a></malnested_b>'))
+        self.assertEqual(
+            get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
         self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
-        self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'))
 
     def test_strict_html_parsing(self):
         class StrictTagParser(HTMLTagParser):
@@ -244,13 +245,13 @@ def test_relaxed_html_parsing(self):
         parser = HTMLTagParser()
 
         self.assertEqual(parser.taglist('</p>', reset=True), [])
-        self.assertEqual(parser.taglist('<div><p>', reset=True), [])
+        self.assertEqual(parser.taglist('<div><p>', reset=True), [Tag('div'), Tag('p')])
 
         tags = parser.taglist('<div><p></div></p>', reset=True)
-        self.assertEqual(tags, [Tag('p'), Tag('div')])
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
 
         tags = parser.taglist('<div><p>/p></div>', reset=True)
-        self.assertEqual(tags, [Tag('div')])
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
 
         tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
         self.assertEqual(tags, [Tag('div'), Tag('p')])
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 8fbb4db14b..5ecd6b75ca 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -190,7 +190,7 @@ def handle_starttag(self, tag, attrs):
             tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
             tag_obj.openrange(self._offset, len(tag_text))
             if tag_is_open:
-                nesting = []
+                nesting = [tag_obj]
                 self._nestedtags[-1].append(nesting)
                 self._nestedtags.append(nesting)
             else:
@@ -218,7 +218,7 @@ def handle_endtag(self, tag):
             if isinstance(tag_obj, self.Tag):
                 close_idx = self.rawdata.find('>', self._offset) + 1
                 tag_obj.closerange(self._offset, close_idx - self._offset)
-                self._nestedtags.pop().insert(0, tag_obj)
+                self._nestedtags.pop()
                 self.callback(tag_obj)
         except ValueError as exc:
             if isinstance(exc, compat_HTMLParseError):

From 7a67a2028f49f71c2cd4bae0611c2a04e313e840 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Sun, 27 Nov 2022 21:26:58 +0100
Subject: [PATCH 08/15] [parsing] tweak tag regex

---
 yt_dlp/parsing.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 5ecd6b75ca..d2c2609545 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -261,7 +261,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
 
         return rf'''(?x)
             <(?:{tag})
-             (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+             (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
              \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
             '''
 
@@ -278,7 +278,8 @@ def tags_by_name(cls, tag, html):
         def matchfunc(tag_str, _attrs):
             return tag_str == tag
 
-        yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc)
+        tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
+        yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
 
     @classmethod
     def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):

From 29278a3323be5106809e43d2977efcd0e3159a4f Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Sun, 27 Nov 2022 16:56:45 +0100
Subject: [PATCH 09/15] [parsing] fix return value

---
 test/test_parsing.py | 16 +++++++++++++---
 yt_dlp/parsing.py    |  4 ++--
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 5887115185..e21299df03 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -195,7 +195,7 @@ class StrictParser(MatchingElementParser):
             <span>ignore</span>
         '''
         items = get_elements_text_and_html_by_tag('img', test_string)
-        self.assertListEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
+        self.assertEqual(items, [('', '<img src="a.png">'), ('', '<img src="b.png" />')])
 
         self.assertEqual(
             StrictParser.get_element_text_and_html_by_tag('use', '<use><img></use>'),
@@ -245,16 +245,26 @@ def test_relaxed_html_parsing(self):
         parser = HTMLTagParser()
 
         self.assertEqual(parser.taglist('</p>', reset=True), [])
-        self.assertEqual(parser.taglist('<div><p>', reset=True), [Tag('div'), Tag('p')])
+
+        tags = parser.taglist('<div><p>', reset=True)
+        self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[0].text_and_html(), ('', '<div>'))
+        self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
 
         tags = parser.taglist('<div><p></div></p>', reset=True)
         self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[0].text_and_html(), ('<p>', '<div><p></div>'))
+        self.assertEqual(tags[1].text_and_html(), ('</div>', '<p></div></p>'))
 
         tags = parser.taglist('<div><p>/p></div>', reset=True)
         self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[0].text_and_html(), ('<p>/p>', '<div><p>/p></div>'))
+        self.assertEqual(tags[1].text_and_html(), ('', '<p>'))
 
         tags = parser.taglist('<div><p>paragraph</p<ignored></div>', reset=True)
         self.assertEqual(tags, [Tag('div'), Tag('p')])
+        self.assertEqual(tags[0].text_and_html(),
+                         ('<p>paragraph</p<ignored>', '<div><p>paragraph</p<ignored></div>'))
         self.assertEqual(tags[1].text_and_html(), ('paragraph', '<p>paragraph</p<ignored>'))
 
         tags = parser.taglist('<img width="300px">must be empty</img>', reset=True)
@@ -315,7 +325,7 @@ def test_tag_return_order(self):
                  [Tag('t5'), Tag('t6')]],
                 [Tag('t7'), Tag('t8')]]))
 
-    def test_within_html_comment(self):
+    def test_html_comment_ranges(self):
         def mark_comments(_string, char='^', nochar='-'):
             cmts = HTMLCommentRanges(_string)
             return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index d2c2609545..8751cd5f9d 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -318,7 +318,7 @@ def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
 
     @classmethod
     def get_elements_by_attribute(cls, *args, **kwargs):
-        return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)]
+        return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]
 
     @classmethod
     def get_elements_html_by_attribute(cls, *args, **kwargs):
@@ -349,7 +349,7 @@ def get_elements_html_by_class(cls, class_name, html):
     @classmethod
     def get_elements_text_and_html_by_class(cls, class_name, html):
         value = cls.class_value_regex(class_name)
-        return [tag.text() for tag
+        return [tag.text_and_html() for tag
                 in cls.tags_by_attribute('class', value, html, escape_value=False)]
 
     @classmethod

From 6169b3eca81ccde2d6c0116295b2c38e807befb2 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Tue, 29 Nov 2022 00:25:52 +0100
Subject: [PATCH 10/15] [parsing] replace HTMLCommentRanges with
 HTMLIgnoreRanges

* ignore matches within CDATA elements and comments
---
 test/test_parsing.py | 25 +++++++++-------
 yt_dlp/parsing.py    | 71 +++++++++++++++++---------------------------
 2 files changed, 43 insertions(+), 53 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index e21299df03..1898ee8ab1 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -4,7 +4,7 @@
 from yt_dlp.compat import compat_HTMLParseError
 from yt_dlp.parsing import (
     MatchingElementParser,
-    HTMLCommentRanges,
+    HTMLIgnoreRanges,
     HTMLTagParser,
 )
 
@@ -325,26 +325,31 @@ def test_tag_return_order(self):
                  [Tag('t5'), Tag('t6')]],
                 [Tag('t7'), Tag('t8')]]))
 
-    def test_html_comment_ranges(self):
+    def test_html_ignored_ranges(self):
         def mark_comments(_string, char='^', nochar='-'):
-            cmts = HTMLCommentRanges(_string)
+            cmts = HTMLIgnoreRanges(_string)
             return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
 
         html_string = '''
         no              comments         in            this              line
         ---------------------------------------------------------------------
         <!--                 whole line represents a comment              -->
-        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+        ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
         before <!--                      comment                  -->   after
-        -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+        -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
+        this is a leftover comment -->     <!-- a new comment without closing
+        ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         here   is   <!-- a comment -->   and   <!-- another comment -->   end
-        ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
-        this <!-- nested  <!--     comment    -->  ends here --> and not here
-        -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
-        stray --> comment closings --> are ignored <!-- but not <!-- openings
-        -------------------------------------------^^^^^^^^^^^^^^^^^^^^^^^^^^
+        ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
+        <script> ignore here </script>            <script> and here </script>
+        --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
         '''
 
         lines = textwrap.dedent(html_string).strip().splitlines()
         for line, marker in zip(lines[0::2], lines[1::2]):
             self.assertEqual((line, mark_comments(line)), (line, marker))
+
+        # yet we must be able to match script elements
+        test_string = '''<script type="text/javascript">var foo = 'bar';</script>'''
+        items = get_element_text_and_html_by_tag('script', test_string)
+        self.assertEqual(items, ("var foo = 'bar';", test_string))
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 8751cd5f9d..1698591e34 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -8,56 +8,41 @@
 from .utils import orderedSet
 
 
-def iter_find(string, sub: str):
-    size = len(sub)
-    idx = -size
-    while True:
-        idx = string.find(sub, idx + size)
-        if idx == -1:
-            return
-        yield idx
+class HTMLIgnoreRanges:
+    """check if an offset is within CDATA content elements (script, style) or XML comments
 
+        note:
+            * given offsets must be in increasing order
+            * no detection of nested constructs (e.g. comments within script tags)
 
-class HTMLCommentRanges:
-    """computes the offsets of HTML comments
-
-    comments start with '<!--' and end with the first '-->' encountered
-    note: markers within quotes are not ignored
+        usage:
+            ranges = HTMLIgnoreRanges(html)
+            if offset in ranges:
+                ...
     """
+    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')
 
     def __init__(self, html):
-        self._range_iter = self.ranges(html)
-        self._range = next(self._range_iter, None)
-        self._last_offset = 0
-
-    @staticmethod
-    def ranges(string, sopen='<!--', sclose='-->'):
-        assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
-        open_iter = iter_find(string, sopen)
-        close_len = len(sclose)
-        close_iter = (idx + close_len for idx in iter_find(string, sclose))
-        next_open = next(open_iter, None)
-        next_close = next(close_iter, None)
-
-        while True:
-            if next_open is None:
-                return
-            while next_close is not None and next_open > next_close:
-                next_close = next(close_iter, None)
-            yield slice(next_open, next_close)
-            if next_close is None:
-                return
-            while next_open is not None and next_open < next_close:
-                next_open = next(open_iter, None)
+        self.html = html
+        self._last_match = None
+        self._final = False
 
     def __contains__(self, offset):
         assert isinstance(offset, int)
-        assert offset >= self._last_offset, 'offset must be in increasing order'
-        self._last_offset = offset
-        while self._range and self._range.stop is not None and offset >= self._range.stop:
-            self._range = next(self._range_iter, None)
 
-        return not (self._range is None or offset < self._range.start)
+        if not self._final and (self._last_match is None or offset >= self._last_match.end()):
+            match = self.REGEX.search(self.html, offset)
+            if match:
+                self._last_match = match
+            else:
+                self._final = True
+
+        if self._last_match is None:
+            return False
+        match_string = self._last_match.group()
+        if match_string.startswith('</') or match_string == '-->':
+            return offset < self._last_match.start()
+        return offset >= self._last_match.end()
 
 
 class HTMLTagParser(HTMLParser):
@@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
 
     @classmethod
     def iter_tags(cls, regex, html, *, matchfunc):
-        comments = HTMLCommentRanges(html)
+        ignored = HTMLIgnoreRanges(html)
         parser = cls(matchfunc)
         for match in re.finditer(regex, html):
-            if match.start() not in comments:
+            if match.start() not in ignored:
                 yield from parser.taglist(html[match.start():], reset=True)
 
     @classmethod

From 65f91148fc6fcbce967d775527edb95b567db0cb Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Tue, 29 Nov 2022 15:01:18 +0100
Subject: [PATCH 11/15] [parsing] search for case-insensitive tag names

---
 test/test_parsing.py | 4 ++++
 yt_dlp/parsing.py    | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 1898ee8ab1..8a36beda44 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -222,6 +222,10 @@ def test_get_element_text_and_html_by_tag_malformed(self):
             get_element_text_and_html_by_tag('orphan', f'<orphan>{html}'), ('', '<orphan>'))
         self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}</orphan>'))
 
+        # ignore case on tags
+        ci_html = f'<SpAn>{html}</sPaN>'
+        self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))
+
     def test_strict_html_parsing(self):
         class StrictTagParser(HTMLTagParser):
             STRICT = True
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 1698591e34..1db6704dd2 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -245,7 +245,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
             value_regex = re.escape(value_regex)
 
         return rf'''(?x)
-            <(?:{tag})
+            <(?i:{tag})
              (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
              \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
             '''
@@ -263,7 +263,7 @@ def tags_by_name(cls, tag, html):
         def matchfunc(tag_str, _attrs):
             return tag_str == tag
 
-        tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
+        tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
         yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
 
     @classmethod

From 8d87bb4d91ed732bc08bd39ce114bdcca63abf68 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Wed, 30 Nov 2022 17:21:09 +0100
Subject: [PATCH 12/15] [parsing] unify tag nesting

---
 test/test_parsing.py | 6 +++---
 yt_dlp/parsing.py    | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 8a36beda44..a7e7ec7d46 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -325,9 +325,9 @@ def test_tag_return_order(self):
             str(tags), str([
                 [Tag('t0'),
                  [Tag('t1'),
-                  [Tag('t2'), Tag('t3'), Tag('t4')]],
-                 [Tag('t5'), Tag('t6')]],
-                [Tag('t7'), Tag('t8')]]))
+                  [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
+                 [Tag('t5'), [Tag('t6')]]],
+                [Tag('t7'), [Tag('t8')]]]))
 
     def test_html_ignored_ranges(self):
         def mark_comments(_string, char='^', nochar='-'):
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 1db6704dd2..c6748d2d8f 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -174,12 +174,11 @@ def handle_starttag(self, tag, attrs):
         if self.predicate(tag, attrs):
             tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
             tag_obj.openrange(self._offset, len(tag_text))
+            nesting = [tag_obj]
+            self._nestedtags[-1].append(nesting)
             if tag_is_open:
-                nesting = [tag_obj]
-                self._nestedtags[-1].append(nesting)
                 self._nestedtags.append(nesting)
             else:
-                self._nestedtags[-1].append(tag_obj)
                 self.callback(tag_obj)
         if tag_is_open:
             self.tagstack.appendleft(tag_obj)

From 7a9dd3d35fa793f8f6fd1bff7ab9d500e025f9b4 Mon Sep 17 00:00:00 2001
From: Marcel <flashdagger@googlemail.com>
Date: Fri, 2 Dec 2022 20:54:04 +0100
Subject: [PATCH 13/15] [parsing] inline tag_obj.closerange()

---
 yt_dlp/parsing.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index c6748d2d8f..256ba8e6c7 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -200,8 +200,7 @@ def handle_endtag(self, tag):
             tag_obj = self.tagstack[idx]
             self.tagstack.remove(tag)
             if isinstance(tag_obj, self.Tag):
-                close_idx = self.rawdata.find('>', self._offset) + 1
-                tag_obj.closerange(self._offset, close_idx - self._offset)
+                tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1))
                 self._nestedtags.pop()
                 self.callback(tag_obj)
         except ValueError as exc:

From c34166d7c8d64f065eb05a6447e268a7b7dc3e6e Mon Sep 17 00:00:00 2001
From: flashdagger <flashdagger@googlemail.com>
Date: Mon, 13 Nov 2023 06:54:28 +0100
Subject: [PATCH 14/15] [parsing] support uppercase SCRIPT tags as suggested by
 github-advanced-security bot

---
 test/test_parsing.py | 2 +-
 yt_dlp/parsing.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index a7e7ec7d46..0e006298f7 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -345,7 +345,7 @@ def mark_comments(_string, char='^', nochar='-'):
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
         here   is   <!-- a comment -->   and   <!-- another comment -->   end
         ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
-        <script> ignore here </script>            <script> and here </script>
+        <script> ignore here </script>            <SCRIPT> and here </SCRIPT>
         --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
         '''
 
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 256ba8e6c7..f4aaf1ac4f 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -20,7 +20,7 @@ class HTMLIgnoreRanges:
             if offset in ranges:
                 ...
     """
-    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>')
+    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
 
     def __init__(self, html):
         self.html = html

From a91d9e1084ca87472b952d189eb897dc8a52fec5 Mon Sep 17 00:00:00 2001
From: flashdagger <flashdagger@googlemail.com>
Date: Mon, 13 Nov 2023 07:14:14 +0100
Subject: [PATCH 15/15] [parsing] support comment end tag '--!>' as suggested
 by github-advanced-security bot

---
 test/test_parsing.py | 2 +-
 yt_dlp/parsing.py    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_parsing.py b/test/test_parsing.py
index 0e006298f7..9641df91df 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -343,7 +343,7 @@ def mark_comments(_string, char='^', nochar='-'):
         -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
         this is a leftover comment -->     <!-- a new comment without closing
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-        here   is   <!-- a comment -->   and   <!-- another comment -->   end
+        here   is   <!-- a comment -->   and   <!-- another comment --!>  end
         ----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
         <script> ignore here </script>            <SCRIPT> and here </SCRIPT>
         --------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index f4aaf1ac4f..72d7e448bd 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -20,7 +20,7 @@ class HTMLIgnoreRanges:
             if offset in ranges:
                 ...
     """
-    REGEX = re.compile(r'<!--|-->|</?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
+    REGEX = re.compile(r'<!--|--!?>|</?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
 
     def __init__(self, html):
         self.html = html
@@ -40,7 +40,7 @@ def __contains__(self, offset):
         if self._last_match is None:
             return False
         match_string = self._last_match.group()
-        if match_string.startswith('</') or match_string == '-->':
+        if match_string.startswith('</') or match_string in ('-->', '--!>'):
             return offset < self._last_match.start()
         return offset >= self._last_match.end()