', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+ parser.taglist('
must be empty', reset=True)
+
+ def test_relaxed_html_parsing(self):
+ Tag = HTMLTagParser.Tag
+ parser = HTMLTagParser()
+
+ self.assertEqual(parser.taglist('', reset=True), [])
+ self.assertEqual(parser.taglist('
', reset=True), [])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('p'), Tag('div')])
+ self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
+ self.assertEqual(tags, [Tag('img')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
'))
+
+ def test_compliant_html_parsing(self):
+ # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+ Tag = HTMLTagParser.Tag
+ html = '''
+ no error without closing tag:
+ self closing is ok:
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+ # don't get fooled by '>' in attributes
+ html = '''
'''
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags[0].text_and_html(), ('', html))
+
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
new file mode 100644
index 0000000000..d0dcf450a0
--- /dev/null
+++ b/yt_dlp/parsing.py
@@ -0,0 +1,219 @@
+import collections
+import contextlib
+import itertools
+import re
+from html.parser import HTMLParser
+
+from .utils import orderedSet
+
+from .compat import compat_HTMLParseError
+
+
+class HTMLTagParser(HTMLParser):
+    """Collect the elements of an HTML document as Tag objects
+
+    elements are recorded when they are complete: void and self-closing
+    elements at their start tag, all others at their closing tag,
+    so nested elements are returned before their parents
+
+    hooks for subclasses:
+        predicate(tag, attrs) decides whether an element is tracked as Tag
+        callback(tag_obj) is called for every complete tracked element,
+            returning False keeps it out of the result
+        abort(last_tag) stops parsing early from within one of the hooks
+
+    STRICT = True raises compat_HTMLParseError on malformed html
+
+    usage:
+        for tag_obj in HTMLTagParser().taglist(html):
+            tag_obj.text_and_html()
+    """
+
+    STRICT = False
+    ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
+    # applied at the '<' of a closing tag, the trailing '>' is optional
+    CLOSING_TAG_REGEX = re.compile(r'</\s*[^\s<>]+(?:\s*>)?')
+    VOID_TAGS = {
+        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
+        'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
+    }
+
+    class Tag:
+        """An element found by the parser
+
+        name:      tag name as passed to the parser handlers
+        string:    the document the offsets refer to
+        start:     offset of the start tag in string
+        start_len: length of the start tag, 0 if not determined yet
+        stop:      offset behind the closing tag, None without closing tag
+        attrs:     tuple of the attributes passed to the start tag handler
+        """
+        __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+
+        def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+            self.name = name
+            self.string = string
+            self.start = start
+            self.start_len = 0
+            self.stop = stop
+            self.attrs = tuple(attrs)
+
+        def __str__(self):
+            return self.name
+
+        def __repr__(self):
+            return f'{self.__class__.__name__}({str(self)!r})'
+
+        def __eq__(self, other):
+            # compare by name only, this lets the tag stack hold
+            # Tag objects and plain tag names side by side
+            return self.name == other
+
+        def html(self):
+            """Return the element including start and closing tag"""
+            return self.string[self.start:self.stop]
+
+        def text_and_html(self):
+            """Return the tuple (inner content, whole element)
+
+            without closing tag the content is empty and the element
+            consists of the start tag only
+            """
+            assert isinstance(self.start, int)
+            if not self.start_len:
+                match = HTMLTagParser.ANY_TAG_REGEX.match(self.string, self.start)
+                assert match
+                self.start_len = match.end() - self.start
+            if self.stop is None:
+                return '', self.string[self.start: self.start + self.start_len]
+            html = self.html()
+            # the content ends where the last closing tag begins
+            cidx = html.rindex('</')
+            return html[self.start_len:cidx], html
+
+    class EarlyExitException(Exception):
+        """Raised by abort() to leave feed(), suppressed in taglist()"""
+
+    def __init__(self):
+        super().__init__()
+        # open elements, innermost first: Tag objects if tracked, else tag names
+        self.tagstack = collections.deque()
+        # absolute position in rawdata of the markup currently handled
+        self._offset = self.offset
+        self.found_tags = []
+
+    def predicate(self, tag, attrs):
+        """Decide whether the element is tracked, override in subclasses"""
+        return True
+
+    def callback(self, tag_obj):
+        """Called with every complete tracked element, override in subclasses
+
+        returning False keeps the element out of the result list
+        """
+        pass
+
+    def abort(self, last_tag=None):
+        """Stop parsing, optionally adding last_tag to the result before"""
+        if last_tag:
+            self.found_tags.append(last_tag)
+        raise HTMLTagParser.EarlyExitException()
+
+    def taglist(self, data, reset=True):
+        """Parse data and return the list of complete tracked elements
+
+        reset=True drops parser state and open tags of a previous run
+        raises compat_HTMLParseError for unclosed tags if STRICT is set
+        """
+        # a fresh list, so that the results of previous calls stay intact
+        self.found_tags = []
+        if reset:
+            self.reset()
+            self.tagstack.clear()
+        with contextlib.suppress(HTMLTagParser.EarlyExitException):
+            self.feed(data)
+        if self.STRICT and self.tagstack:
+            orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
+            raise compat_HTMLParseError(f'unclosed tag {orphans}')
+        return self.found_tags
+
+    def updatepos(self, i, j):
+        # remember the position the base class moves to
+        offset = self._offset = super().updatepos(i, j)
+        return offset
+
+    def handle_starttag(self, tag, attrs):
+        try:
+            # we use internal variable for performance reason
+            tag_text = getattr(self, '_HTMLParser__starttag_text')
+        except AttributeError:
+            tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata, self._offset).group()
+        tracked = self.predicate(tag, attrs)
+        is_void = tag_text.endswith('/>') or tag in self.VOID_TAGS
+        if not tracked:
+            # untracked elements only take part in the nesting check,
+            # void elements never get closed and must not pile up on the stack
+            if not is_void:
+                self.tagstack.appendleft(tag)
+            return
+
+        obj = self.Tag(
+            tag, string=self.rawdata, start=self._offset, attrs=attrs)
+        obj.start_len = len(tag_text)
+        if is_void:
+            # complete without closing tag
+            if self.callback(obj) is not False:
+                self.found_tags.append(obj)
+            return
+        self.tagstack.appendleft(obj)
+
+    handle_startendtag = handle_starttag
+
+    def handle_endtag(self, tag):
+        if '<' in tag:
+            if self.STRICT:
+                raise compat_HTMLParseError(f'malformed closing tag {tag!r}')
+            tag = tag[:tag.index('<')]
+
+        # only the stack lookup is guarded, a ValueError
+        # raised by callback() must not be mistaken for a stray tag
+        try:
+            idx = self.tagstack.index(tag)
+        except ValueError:
+            if self.STRICT:
+                raise compat_HTMLParseError(f'stray closing tag {tag!r}') from None
+            return
+
+        if self.STRICT and idx:
+            open_tags = ''.join(f'</{name}>' for name in itertools.islice(self.tagstack, idx))
+            raise compat_HTMLParseError(
+                f'malnested closing tag {tag!r}, expected after {open_tags!r}')
+        tag_obj = self.tagstack[idx]
+        del self.tagstack[idx]
+        if isinstance(tag_obj, str):
+            return
+        # since we landed here we'll always find a closing tag
+        match = self.CLOSING_TAG_REGEX.match(self.rawdata, self._offset)
+        tag_obj.stop = match.end()
+        if self.callback(tag_obj) is not False:
+            self.found_tags.append(tag_obj)
+
+
+class ClassParser(HTMLTagParser):
+    """Collect the elements whose given attribute satisfies a match function
+
+    attribute: name of the attribute inspected on every start tag
+    matchfunc: called with the attribute value ('' if absent or valueless),
+               a truthy result selects the element
+    stop:      offset into the parsed string behind which no match is
+               expected, parsing is aborted at the first start tag beyond it
+               once no selected element is open anymore
+               None aborts as soon as the first selected element is complete
+    """
+
+    def __init__(self, attribute, matchfunc, stop):
+        super().__init__()
+        self.search_attr = attribute
+        self.matchfunc = matchfunc
+        self.stop = stop
+        # number of selected elements which are not complete yet
+        self.processing = 0
+
+    def predicate(self, tag, attrs):
+        if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
+            self.abort()
+        # valueless attributes (<div class>) are reported with value None
+        value = dict(attrs).get(self.search_attr) or ''
+        if self.matchfunc(value):
+            self.processing += 1
+            return True
+        return False
+
+    def callback(self, tag_obj):
+        if self.stop is None:
+            self.abort(tag_obj)
+        self.processing -= 1
+
+    @classmethod
+    def get_elements_html_by_class(cls, class_name, html):
+        """Return the html of all elements having class_name in their class attribute"""
+        regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
+        # cheap scan for candidates to limit the range which has to be parsed
+        # non-greedy, otherwise all candidates within one line count as one
+        start = stop = None
+        for match in re.finditer(rf'<.+? class=[\'"]{regex.pattern}', html):
+            if start is None:
+                start = match.start()
+            else:
+                stop = match.end()
+        if start is None:
+            return []
+        # the parser only gets html[start:], its offsets are relative to start
+        if stop is not None:
+            stop -= start
+        parser = cls('class', regex.match, stop)
+        return [tag.html() for tag in parser.taglist(html[start:])]
+
+
+class FirstMatchingElementParser(HTMLTagParser):
+    """Return only the first element accepted by the match function
+
+    matchfunc is called with (tag, attrs) for every start tag until it
+    accepts one, parsing is aborted as soon as that element is complete
+    """
+
+    def __init__(self, matchfunc):
+        super().__init__()
+        self.matchfunc = matchfunc
+        self.found = False
+
+    def reset(self):
+        # forget a previous match, otherwise a reused parser never matches again
+        self.found = False
+        super().reset()
+
+    def predicate(self, tag, attrs):
+        if self.found:
+            return False
+        if not self.matchfunc(tag, attrs):
+            return False
+        self.found = True
+        return True
+
+    def callback(self, obj):
+        # the one selected element is complete, no need to parse any further
+        self.abort(obj)
+
+    @classmethod
+    def get_element_text_and_html_by_tag(cls, tag, html):
+        """
+        For the first element with the specified tag in the given HTML document
+        return its content (text) and the whole element (html)
+
+        raises compat_HTMLParseError if there is no complete element with that tag
+        """
+        found = cls(lambda name, _attrs: name == tag).taglist(html)
+        if not found:
+            raise compat_HTMLParseError(f'tag {tag} not found')
+        return found[0].text_and_html()
From e092ba9922191886c542972461ec27b1d82a466d Mon Sep 17 00:00:00 2001
From: Marcel
Date: Tue, 22 Nov 2022 22:37:14 +0100
Subject: [PATCH 04/15] [test] rollback test_utils.py and add related tests to
test_parsing.py
---
test/test_parsing.py | 218 +++++++++++++++++++++++++++++++++++++++++++
test/test_utils.py | 124 ++++--------------------
2 files changed, 238 insertions(+), 104 deletions(-)
create mode 100644 test/test_parsing.py
diff --git a/test/test_parsing.py b/test/test_parsing.py
new file mode 100644
index 0000000000..782a1196df
--- /dev/null
+++ b/test/test_parsing.py
@@ -0,0 +1,218 @@
+import textwrap
+import unittest
+
+from parsing import (
+ FirstMatchingElementParser,
+ HTMLTagParser,
+ MatchingElementParser,
+)
+
+from yt_dlp.compat import compat_HTMLParseError
+
+get_element_by_attribute = FirstMatchingElementParser
+get_element_by_class = FirstMatchingElementParser
+get_element_html_by_attribute = FirstMatchingElementParser
+get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
+get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
+get_elements_by_attribute = MatchingElementParser
+get_elements_by_class = MatchingElementParser
+get_elements_html_by_attribute = MatchingElementParser
+get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
+get_elements_text_and_html_by_attribute = MatchingElementParser
+
+
+class TestParsing(unittest.TestCase):
+ GET_ELEMENT_BY_CLASS_TEST_STRING = '''
+ nice
+ '''
+
+ def test_get_element_by_class(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_by_class('foo', html), 'nice')
+ self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+ def test_get_element_html_by_class(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+ self.assertEqual(get_element_by_class('no-such-class', html), None)
+
+ GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
+ foo
+ '''
+
+ def test_get_element_by_attribute(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
+ self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
+ self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+ self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+
+ def test_get_element_html_by_attribute(self):
+ html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+ self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
+ self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
+
+ html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
+
+ self.assertEqual(get_element_html_by_attribute('itemprop', 'author', html), html.strip())
+
+ GET_ELEMENTS_BY_CLASS_TEST_STRING = '''
+ nice
+ also nice
+ '''
+ GET_ELEMENTS_BY_CLASS_RES = [
+ 'nice',
+ 'also nice'
+ ]
+
+ def test_get_elements_by_class(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_by_class('foo', html), ['nice', 'also nice'])
+ self.assertEqual(get_elements_by_class('no-such-class', html), [])
+
+ def test_get_elements_html_by_class(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_class('foo', html), self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_class('no-such-class', html), [])
+
+ def test_get_elements_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_by_attribute('class', 'foo bar', html), ['nice', 'also nice'])
+ self.assertEqual(get_elements_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), [])
+
+ def test_get_elements_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo bar', html),
+ self.GET_ELEMENTS_BY_CLASS_RES)
+ self.assertEqual(get_elements_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_html_by_attribute('class', 'no-such-foo', html), [])
+
+ def test_get_elements_text_and_html_by_attribute(self):
+ html = self.GET_ELEMENTS_BY_CLASS_TEST_STRING
+
+ self.assertEqual(
+ get_elements_text_and_html_by_attribute('class', 'foo bar', html),
+ list(zip(['nice', 'also nice'], self.GET_ELEMENTS_BY_CLASS_RES)))
+ self.assertEqual(get_elements_text_and_html_by_attribute('class', 'foo', html), [])
+ self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
+
+ self.assertEqual(get_elements_text_and_html_by_attribute(
+ 'class', 'foo', 'nicenice', tag='a'),
+ [('nice', 'nice')])
+
+ def test_get_element_text_and_html_by_tag(self):
+ get_element_by_tag_test_string = '''
+ random text lorem ipsum
+
+ this should be returned
+
this should also be returned
+
+ this should also be returned
+
+ closing tag above should not trick, so this should also be returned
+
+ but this text should not be returned
+ '''
+ html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
+ get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
+ get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
+ get_element_by_tag_res_innerspan_html = html.strip()[78:119]
+ get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
+
+ self.assertEqual(
+ get_element_text_and_html_by_tag('div', html),
+ (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('span', html),
+ (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
+ self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+
+ def test_get_element_text_and_html_by_tag_malformed(self):
+ inner_text = 'inner text'
+ malnested_elements = f'{inner_text}'
+ commented_html = ''
+ outerdiv_html = f'{malnested_elements}
'
+ html = f'{commented_html}{outerdiv_html}'
+
+ self.assertEqual(
+ get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('malnested_a', html),
+ (f'{inner_text}',
+ f'{inner_text}'))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('malnested_b', html),
+ (f'{inner_text}',
+ f'{inner_text}'))
+ self.assertRaises(
+ compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
+ self.assertRaises(
+ compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
+
+ def test_strict_html_parsing(self):
+ class StrictTagParser(HTMLTagParser):
+ STRICT = True
+
+ parser = StrictTagParser()
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
+ parser.taglist('', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
+ parser.taglist('', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"):
+ parser.taglist('
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after ''"):
+ parser.taglist('
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
+ parser.taglist('
', reset=True)
+ with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
+ parser.taglist('
must be empty', reset=True)
+
+ def test_relaxed_html_parsing(self):
+ Tag = HTMLTagParser.Tag
+ parser = HTMLTagParser()
+
+ self.assertEqual(parser.taglist('', reset=True), [])
+ self.assertEqual(parser.taglist('
', reset=True), [])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('p'), Tag('div')])
+ self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
+ self.assertEqual(tags, [Tag('img')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
'))
+
+ def test_compliant_html_parsing(self):
+ # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
+ Tag = HTMLTagParser.Tag
+ html = '''
+ no error without closing tag:
+ self closing is ok:
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags, [Tag('img'), Tag('img')])
+
+ # don't get fooled by '>' in attributes
+ html = '''
'''
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(tags[0].text_and_html(), ('', html))
diff --git a/test/test_utils.py b/test/test_utils.py
index d9a62258c5..3045b6d7e1 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,7 +4,6 @@
import os
import re
import sys
-import textwrap
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -21,14 +20,6 @@
compat_HTMLParseError,
compat_os_name,
)
-from yt_dlp.parsing import (
- HTMLTagParser,
- FirstMatchingElementParser,
-)
-
-# some testcases don't work with current functions
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-
from yt_dlp.utils import (
Config,
DateRange,
@@ -68,6 +59,7 @@
get_element_by_class,
get_element_html_by_attribute,
get_element_html_by_class,
+ get_element_text_and_html_by_tag,
get_elements_by_attribute,
get_elements_by_class,
get_elements_html_by_attribute,
@@ -1776,110 +1768,34 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(list(get_elements_text_and_html_by_attribute(
'class', 'foo', '
nicenice', tag='a')), [('nice', '
nice')])
- def test_get_element_text_and_html_by_tag(self):
- get_element_by_tag_test_string = '''
- random text lorem ipsum
+ GET_ELEMENT_BY_TAG_TEST_STRING = '''
+ random text lorem ipsum
+
+ this should be returned
+
this should also be returned
- this should be returned
-
this should also be returned
-
- this should also be returned
-
- closing tag above should not trick, so this should also be returned
+ this should also be returned
- but this text should not be returned
- '''
- html = textwrap.indent(textwrap.dedent(get_element_by_tag_test_string), ' ' * 4)
- get_element_by_tag_res_outerdiv_html = html.strip()[32:276]
- get_element_by_tag_res_outerdiv_text = get_element_by_tag_res_outerdiv_html[5:-6]
- get_element_by_tag_res_innerspan_html = html.strip()[78:119]
- get_element_by_tag_res_innerspan_text = get_element_by_tag_res_innerspan_html[6:-7]
+ closing tag above should not trick, so this should also be returned
+
+ but this text should not be returned
+ '''
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[32:276]
+ GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT = GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML[5:-6]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML = GET_ELEMENT_BY_TAG_TEST_STRING.strip()[78:119]
+ GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT = GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML[6:-7]
+
+ def test_get_element_text_and_html_by_tag(self):
+ html = self.GET_ELEMENT_BY_TAG_TEST_STRING
self.assertEqual(
get_element_text_and_html_by_tag('div', html),
- (get_element_by_tag_res_outerdiv_text, get_element_by_tag_res_outerdiv_html))
+ (self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_TEXT, self.GET_ELEMENT_BY_TAG_RES_OUTERDIV_HTML))
self.assertEqual(
get_element_text_and_html_by_tag('span', html),
- (get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
+ (self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_TEXT, self.GET_ELEMENT_BY_TAG_RES_INNERSPAN_HTML))
self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
- def test_get_element_text_and_html_by_tag_malformed(self):
- inner_text = 'inner text'
- malnested_elements = f'
{inner_text}'
- commented_html = ''
- outerdiv_html = f'
{malnested_elements}
'
- html = f'{commented_html}{outerdiv_html}'
-
- self.assertEqual(
- get_element_text_and_html_by_tag('div', html), (malnested_elements, outerdiv_html))
- self.assertEqual(
- get_element_text_and_html_by_tag('malnested_a', html),
- (f'
{inner_text}',
- f'{inner_text}'))
- self.assertEqual(
- get_element_text_and_html_by_tag('malnested_b', html),
- (f'{inner_text}',
- f'{inner_text}'))
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
-
- def test_strict_html_parsing(self):
- class StrictTagParser(HTMLTagParser):
- STRICT = True
-
- parser = StrictTagParser()
- with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'p'"):
- parser.taglist('', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "unclosed tag 'p', 'div'"):
- parser.taglist('', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after '
'"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malnested closing tag 'div', expected after ''"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "malformed closing tag 'p<<'"):
- parser.taglist('
', reset=True)
- with self.assertRaisesRegex(compat_HTMLParseError, "stray closing tag 'img'"):
- parser.taglist('
must be empty', reset=True)
-
- def test_relaxed_html_parsing(self):
- Tag = HTMLTagParser.Tag
- parser = HTMLTagParser()
-
- self.assertEqual(parser.taglist('', reset=True), [])
- self.assertEqual(parser.taglist('
', reset=True), [])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div'), Tag('p')])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div')])
-
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('p'), Tag('div')])
- self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
must be empty', reset=True)
- self.assertEqual(tags, [Tag('img')])
- self.assertEqual(tags[0].text_and_html(), ('', '
'))
-
- def test_compliant_html_parsing(self):
- # certain elements don't need to be closed (see HTMLTagParser.VOID_TAGS)
- Tag = HTMLTagParser.Tag
- html = '''
- no error without closing tag:
- self closing is ok:
- '''
- parser = HTMLTagParser()
- tags = parser.taglist(html, reset=True)
- self.assertEqual(tags, [Tag('img'), Tag('img')])
-
- # don't get fooled by '>' in attributes
- html = '''
'''
- tags = parser.taglist(html, reset=True)
- self.assertEqual(tags[0].text_and_html(), ('', html))
-
def test_iri_to_uri(self):
self.assertEqual(
iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'),
From 176a156c651defe95f4ed6714ddf47d599ecef50 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Tue, 22 Nov 2022 19:58:06 +0100
Subject: [PATCH 05/15] [parsing] rework interface, implemented all
get_element(s) functions + extract_attributes() as MatchingElementParser
class methods and improve performance
---
test/test_parsing.py | 168 +++++++++++++++----
yt_dlp/parsing.py | 373 ++++++++++++++++++++++++++++++-------------
2 files changed, 399 insertions(+), 142 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 782a1196df..75ed8ebf34 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -1,29 +1,71 @@
import textwrap
import unittest
-from parsing import (
- FirstMatchingElementParser,
- HTMLTagParser,
+from yt_dlp.compat import compat_HTMLParseError
+from yt_dlp.parsing import (
MatchingElementParser,
+ HTMLCommentRanges,
+ HTMLTagParser,
)
-from yt_dlp.compat import compat_HTMLParseError
-
-get_element_by_attribute = FirstMatchingElementParser
-get_element_by_class = FirstMatchingElementParser
-get_element_html_by_attribute = FirstMatchingElementParser
-get_element_html_by_class = FirstMatchingElementParser.get_element_html_by_class
-get_element_text_and_html_by_tag = FirstMatchingElementParser.get_element_text_and_html_by_tag
-get_elements_by_attribute = MatchingElementParser
-get_elements_by_class = MatchingElementParser
-get_elements_html_by_attribute = MatchingElementParser
-get_elements_html_by_class = FirstMatchingElementParser.get_elements_html_by_class
-get_elements_text_and_html_by_attribute = MatchingElementParser
+extract_attributes = MatchingElementParser.extract_attributes
+get_element_by_attribute = MatchingElementParser.get_element_by_attribute
+get_element_by_class = MatchingElementParser.get_element_by_class
+get_element_html_by_attribute = MatchingElementParser.get_element_html_by_attribute
+get_element_html_by_class = MatchingElementParser.get_element_html_by_class
+get_element_text_and_html_by_tag = MatchingElementParser.get_element_text_and_html_by_tag
+get_elements_by_attribute = MatchingElementParser.get_elements_by_attribute
+get_elements_by_class = MatchingElementParser.get_elements_by_class
+get_elements_html_by_attribute = MatchingElementParser.get_elements_html_by_attribute
+get_elements_html_by_class = MatchingElementParser.get_elements_html_by_class
+get_elements_text_and_html_by_attribute = MatchingElementParser.get_elements_text_and_html_by_attribute
+get_elements_text_and_html_by_tag = MatchingElementParser.get_elements_text_and_html_by_tag
class TestParsing(unittest.TestCase):
+ def test_extract_attributes(self):
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(""), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': "a 'b' c"})
+ self.assertEqual(extract_attributes(''), {'x': 'a "b" c'})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': '&'}) # XML
+ self.assertEqual(extract_attributes(''), {'x': '"'})
+ self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2
+ self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0
+ self.assertEqual(extract_attributes(''), {'x': '&foo'})
+ self.assertEqual(extract_attributes(''), {'x': "'"})
+ self.assertEqual(extract_attributes(''), {'x': '"'})
+ self.assertEqual(extract_attributes(''), {'x': None})
+ self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': 'y'})
+ self.assertEqual(extract_attributes(""), {'x': 'y'})
+ self.assertEqual(extract_attributes(''), {'x': '\ny\n'})
+ self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased
+ self.assertEqual(extract_attributes(''), {'x': '2'})
+ self.assertEqual(extract_attributes(''), {'x': '2'})
+ self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'})
+ self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'})
+ self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'})
+ # "Narrow" Python builds don't support unicode code points outside BMP.
+ try:
+ chr(0x10000)
+ supports_outside_bmp = True
+ except ValueError:
+ supports_outside_bmp = False
+ if supports_outside_bmp:
+ self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'})
+ # Malformed HTML should not break attributes extraction on older Python
+ self.assertEqual(extract_attributes(''), {})
+
GET_ELEMENT_BY_CLASS_TEST_STRING = '''
nice
+ also nice
'''
def test_get_element_by_class(self):
@@ -35,7 +77,8 @@ def test_get_element_by_class(self):
def test_get_element_html_by_class(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
- self.assertEqual(get_element_html_by_class('foo', html), html.strip())
+ self.assertEqual(get_element_html_by_class('foo', html),
+ 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING = '''
@@ -48,6 +91,7 @@ def test_get_element_by_attribute(self):
self.assertEqual(get_element_by_attribute('class', 'foo bar', html), 'nice')
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+ self.assertEqual(get_element_by_attribute('class', 'foo bar', html, tag='div'), 'also nice')
html = self.GET_ELEMENT_BY_ATTRIBUTE_TEST_STRING
@@ -56,7 +100,8 @@ def test_get_element_by_attribute(self):
def test_get_element_html_by_attribute(self):
html = self.GET_ELEMENT_BY_CLASS_TEST_STRING
- self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html), html.strip())
+ self.assertEqual(get_element_html_by_attribute('class', 'foo bar', html),
+ 'nice')
self.assertEqual(get_element_html_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_html_by_attribute('class', 'no-such-foo', html), None)
@@ -110,7 +155,7 @@ def test_get_elements_text_and_html_by_attribute(self):
self.assertEqual(get_elements_text_and_html_by_attribute('class', 'no-such-foo', html), [])
self.assertEqual(get_elements_text_and_html_by_attribute(
- 'class', 'foo', 'nicenice', tag='a'),
+ 'class', 'foo', 'nicenot nice', tag='a'),
[('nice', 'nice')])
def test_get_element_text_and_html_by_tag(self):
@@ -138,7 +183,16 @@ def test_get_element_text_and_html_by_tag(self):
self.assertEqual(
get_element_text_and_html_by_tag('span', html),
(get_element_by_tag_res_innerspan_text, get_element_by_tag_res_innerspan_html))
- self.assertRaises(compat_HTMLParseError, get_element_text_and_html_by_tag, 'article', html)
+ self.assertIsNone(get_element_text_and_html_by_tag('article', html))
+
+ def test_get_elements_text_and_html_by_tag(self):
+ test_string = '''
+
+
+ ignore
+ '''
+ items = get_elements_text_and_html_by_tag('img', test_string)
+ self.assertListEqual(items, [('', ''), ('', '')])
def test_get_element_text_and_html_by_tag_malformed(self):
inner_text = 'inner text'
@@ -157,10 +211,8 @@ def test_get_element_text_and_html_by_tag_malformed(self):
get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}',
f'{inner_text}'))
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
- self.assertRaises(
- compat_HTMLParseError, get_element_text_and_html_by_tag, 'orphan', f'{html}')
+ self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
+ self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser):
@@ -188,14 +240,14 @@ def test_relaxed_html_parsing(self):
self.assertEqual(parser.taglist('', reset=True), [])
tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags, [Tag('p'), Tag('div')])
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div')])
- tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('p'), Tag('div')])
- self.assertEqual(tags[0].text_and_html(), ('paragraph', '
paragraph
paragraph
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[1].text_and_html(), ('paragraph', 'paragraph
'))
tags = parser.taglist('must be empty', reset=True)
self.assertEqual(tags, [Tag('img')])
@@ -216,3 +268,65 @@ def test_compliant_html_parsing(self):
html = ''''''
tags = parser.taglist(html, reset=True)
self.assertEqual(tags[0].text_and_html(), ('', html))
+
+ def test_tag_return_order(self):
+ Tag = HTMLTagParser.Tag
+ html = '''
+
+
+
+
+
+
+
+
+
+
+
+
+
+ '''
+ parser = HTMLTagParser()
+ tags = parser.taglist(html, reset=True)
+ self.assertEqual(
+ str(tags), str([Tag('t0'), Tag('t1'), Tag('t2'), Tag('t3'), Tag('t4'),
+ Tag('t5'), Tag('t6'), Tag('t7'), Tag('t8')]))
+
+ tags = parser.taglist(html, reset=True, depth_first=True)
+ self.assertEqual(
+ str(tags), str([Tag('t3'), Tag('t4'), Tag('t2'), Tag('t1'), Tag('t6'),
+ Tag('t5'), Tag('t0'), Tag('t8'), Tag('t7')]))
+
+ # return tags in nested order
+ tags = parser.taglist(html, reset=True, depth_first=None)
+ self.assertEqual(
+ str(tags), str([
+ [Tag('t0'),
+ [Tag('t1'),
+ [Tag('t2'), Tag('t3'), Tag('t4')]],
+ [Tag('t5'), Tag('t6')]],
+ [Tag('t7'), Tag('t8')]]))
+
+ def test_within_html_comment(self):
+ def mark_comments(_string, char='^', nochar='-'):
+ cmts = HTMLCommentRanges(_string)
+ return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
+
+ html_string = '''
+ no comments in this line
+ ---------------------------------------------------------------------
+
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ before after
+ -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+ here is and end
+ ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
+ this ends here --> and not here
+ -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
+ stray --> comment closings --> are ignored ' encountered
+ note: markers within quotes are not ignored
+ """
+
+ def __init__(self, html):
+ self._range_iter = self.ranges(html)
+ self._range = next(self._range_iter, None)
+ self._last_offset = 0
+
+ @staticmethod
+ def ranges(string, sopen=''):
+ assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
+ open_iter = iter_find(string, sopen)
+ close_len = len(sclose)
+ close_iter = (idx + close_len for idx in iter_find(string, sclose))
+ next_open = next(open_iter, None)
+ next_close = next(close_iter, None)
+
+ while True:
+ if next_open is None:
+ return
+ while next_close is not None and next_open > next_close:
+ next_close = next(close_iter, None)
+ yield slice(next_open, next_close)
+ if next_close is None:
+ return
+ while next_open is not None and next_open < next_close:
+ next_open = next(open_iter, None)
+
+ def __contains__(self, offset):
+ assert isinstance(offset, int)
+ assert offset >= self._last_offset, 'offset must be in increasing order'
+ self._last_offset = offset
+ while self._range and self._range.stop is not None and offset >= self._range.stop:
+ self._range = next(self._range_iter, None)
+
+ return not (self._range is None or offset < self._range.start)
class HTMLTagParser(HTMLParser):
- """HTML parser which acts as iterator
- returns found elements as instances of Tag
- nested elements will be returned before its parents
+ """HTML parser which returns found elements as instances of 'Tag'
+ when STRICT=True can raise compat_HTMLParseError() on malformed HTML elements
- strict=True raises compat_HTMLParseError on malformed html
-
- two modes of usage:
- # as an lazy iterator:
- for tag_obj in HTMLTagParser(html):
+ usage:
+ parser = HTMLTagParser()
+ for tag_obj in parser.taglist(html):
tag_obj.text_and_html()
- # or return a list with all found tag objects
- # this is faster by factor 2-5 compared to iteration
- for tag_obj in HTMLTagParser(html).taglist():
- tag_obj.text_and_html()
"""
STRICT = False
ANY_TAG_REGEX = re.compile(r'''<(?:"[^"]*"|'[^']*'|[^"'>])*?>''')
- CLOSING_TAG_REGEX = re.compile(r'\s*[^\s<>]+(?:\s*>)?')
VOID_TAGS = {
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr',
}
class Tag:
- __slots__ = 'name', 'string', 'start', 'start_len', 'stop', 'attrs'
+ __slots__ = 'name', 'string', 'attrs', '_openrange', '_closerange'
- def __init__(self, name, *, string='', start=None, stop=None, attrs=()):
+ def __init__(self, name, *, string='', attrs=()):
self.name = name
self.string = string
- self.start = start
- self.start_len = 0
- self.stop = stop
self.attrs = tuple(attrs)
+ self._openrange = None
+ self._closerange = None
def __str__(self):
return self.name
@@ -55,52 +97,81 @@ def __repr__(self):
def __eq__(self, other):
return self.name == other
+ def openrange(self, offset, startlen=0):
+ if isinstance(offset, slice):
+ self._openrange = offset
+ else:
+ self._openrange = slice(offset, offset + startlen)
+
+ def closerange(self, offset, stoplen=0):
+ if isinstance(offset, slice):
+ self._closerange = offset
+ else:
+ self._closerange = slice(offset, offset + stoplen)
+
+ def opentag(self):
+ return self.string[self._openrange] if self._openrange else ''
+
def html(self):
- return self.string[self.start:self.stop]
+ if not self._openrange:
+ return ''
+ if self._closerange:
+ return self.string[self._openrange.start:self._closerange.stop]
+ return self.string[self._openrange]
+
+ def text(self):
+ if self._openrange and self._closerange:
+ return self.string[self._openrange.stop:self._closerange.start]
+ return ''
def text_and_html(self):
- assert isinstance(self.start, int)
- if not self.start_len:
- match = HTMLTagParser.ANY_TAG_REGEX.match(self.string[self.start:])
- assert match
- self.start_len = len(match.group())
- if self.stop is None:
- return '', self.string[self.start: self.start + self.start_len]
- html = self.html()
- cidx = html.rindex('')
- return html[self.start_len:cidx], html
+ return self.text(), self.html()
- class EarlyExitException(Exception):
+ class AbortException(Exception):
pass
def __init__(self):
- super().__init__()
self.tagstack = collections.deque()
+ self._nestedtags = [[]]
+ super().__init__()
self._offset = self.offset
- self.found_tags = []
def predicate(self, tag, attrs):
+ """ return True for every encountered opening tag that should be processed """
return True
def callback(self, tag_obj):
- pass
+ """ this will be called when the requested tag is closed """
- def abort(self, last_tag=None):
- if last_tag:
- self.found_tags.append(last_tag)
- raise HTMLTagParser.EarlyExitException()
+ def reset(self):
+ super().reset()
+ self.tagstack.clear()
+
+ def taglist(self, data, reset=True, depth_first=False):
+ """ parse data and return found tag objects
+ @param data: html string
+ @param reset: reset state
+ @param depth_first: return order: as opened (False), as closed (True), nested (None)
+ @return: list of Tag objects
+ """
+ def flatten(_list, first=True):
+ rlist = _list if first or not depth_first else itertools.chain(_list[1:], _list[:1])
+ for item in rlist:
+ if isinstance(item, list):
+ yield from flatten(item, first=False)
+ else:
+ yield item
- def taglist(self, data, reset=True):
- self.found_tags.clear()
if reset:
self.reset()
- self.tagstack.clear()
- with contextlib.suppress(HTMLTagParser.EarlyExitException):
+ with contextlib.suppress(HTMLTagParser.AbortException):
self.feed(data)
if self.STRICT and self.tagstack:
orphans = ', '.join(map(repr, map(str, orderedSet(self.tagstack, lazy=True))))
raise compat_HTMLParseError(f'unclosed tag {orphans}')
- return self.found_tags
+ taglist = self._nestedtags[0] if depth_first is None else list(flatten(self._nestedtags[0]))
+ self._nestedtags = [[]]
+ return taglist
def updatepos(self, i, j):
offset = self._offset = super().updatepos(i, j)
@@ -108,22 +179,23 @@ def updatepos(self, i, j):
def handle_starttag(self, tag, attrs):
try:
- # we use internal variable for performance reason
+ # we use internal variable for performance reasons
tag_text = getattr(self, '_HTMLParser__starttag_text')
except AttributeError:
tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
- if self.predicate(tag, attrs):
- obj = self.Tag(
- tag, string=self.rawdata, start=self._offset, attrs=attrs)
- obj.start_len = len(tag_text)
- if tag_text.endswith('/>') or tag in self.VOID_TAGS:
- if self.callback(obj) is not False:
- self.found_tags.append(obj)
- return
- else:
- obj = None
- self.tagstack.appendleft(obj or tag)
+ tag_obj = tag
+ if self.predicate(tag, attrs):
+ tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
+ tag_obj.openrange(self._offset, len(tag_text))
+ if tag_text.endswith('/>') or tag in self.VOID_TAGS:
+ self._nestedtags[-1].append(tag_obj)
+ self.callback(tag_obj)
+ return
+ nesting = []
+ self._nestedtags[-1].append(nesting)
+ self._nestedtags.append(nesting)
+ self.tagstack.appendleft(tag_obj)
handle_startendtag = handle_starttag
@@ -141,79 +213,150 @@ def handle_endtag(self, tag):
f'malnested closing tag {tag!r}, expected after {open_tags!r}')
tag_obj = self.tagstack[idx]
self.tagstack.remove(tag)
- if not isinstance(tag_obj, str):
- # since we landed here we'll always find a closing tag
- match = self.CLOSING_TAG_REGEX.match(self.rawdata[self._offset:])
- tag_obj.stop = self._offset + match.end()
- if self.callback(tag_obj) is not False:
- self.found_tags.append(tag_obj)
+ if isinstance(tag_obj, self.Tag):
+ close_idx = self.rawdata.find('>', self._offset) + 1
+ tag_obj.closerange(self._offset, close_idx - self._offset)
+ self._nestedtags.pop().insert(0, tag_obj)
+ self.callback(tag_obj)
except ValueError as exc:
if isinstance(exc, compat_HTMLParseError):
raise
- elif self.STRICT:
- raise compat_HTMLParseError(f'stray closing tag {tag!r}')
+ if self.STRICT:
+ raise compat_HTMLParseError(f'stray closing tag {tag!r}') from exc
-class ClassParser(HTMLTagParser):
- def __init__(self, attribute, matchfunc, stop):
- super().__init__()
- self.search_attr = attribute
- self.matchfunc = matchfunc
- self.stop = stop
- self.processing = 0
-
- def predicate(self, tag, attrs):
- if self.processing <= 0 and self.stop is not None and self._offset > self.stop:
- self.abort()
- string = dict(attrs).get(self.search_attr, '')
- if self.matchfunc(string):
- self.processing += 1
- return True
- return False
-
- def callback(self, tag_obj):
- if self.stop is None:
- self.abort(tag_obj)
- self.processing -= 1
-
- @classmethod
- def get_elements_html_by_class(cls, class_name, html):
- regex = re.compile(rf'[\w\- ]*\b{re.escape(class_name)}\b')
- it = re.finditer(rf'<.+ class=[\'"]{regex.pattern}', html)
- start = stop = None
- for match in it:
- if start is None:
- start = match.start()
- else:
- stop = match.end()
- if start is None:
- return []
- parser = cls('class', lambda x: regex.match(x), stop)
- return [tag.html() for tag in parser.taglist(html[start:])]
-
-
-class FirstMatchingElementParser(HTMLTagParser):
+class MatchingElementParser(HTMLTagParser):
+ """ optimized version of HTMLTagParser
+ """
def __init__(self, matchfunc):
super().__init__()
self.matchfunc = matchfunc
- self.found = False
+ self.found_none = True
+
+ def reset(self):
+ super().reset()
+ self.found_none = True
+
+ def callback(self, tag_obj):
+ raise self.AbortException()
def predicate(self, tag, attrs):
- if not self.found and self.matchfunc(tag, attrs):
- self.found = True
+ if self.found_none and self.matchfunc(tag, attrs):
+ self.found_none = False
return True
return False
- def callback(self, obj):
- self.abort(obj)
+ @staticmethod
+ def class_value_regex(class_name):
+ return rf'[\w\s\-]*(?"']|"[^"]*"|'[^']*')*)?
+ \s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
+ '''
+
+ @classmethod
+ def iter_tags(cls, regex, html, *, matchfunc):
+ comments = HTMLCommentRanges(html)
+ parser = cls(matchfunc)
+ for match in re.finditer(regex, html):
+ if match.start() not in comments:
+ yield from parser.taglist(html[match.start():], reset=True)
+
+ @classmethod
+ def tags_by_name(cls, tag, html):
+ def matchfunc(tag_str, _attrs):
+ return tag_str == tag
+
+ yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc)
+
+ @classmethod
+ def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
+ def matchfunc(_tag_str, attrs):
+ return any(attr == attribute and re.fullmatch(value, value_str)
+ for attr, value_str in attrs)
+
+ tag_regex = cls.matching_tag_regex(tag, attribute, value, escape_value)
+ yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
+
+ @classmethod
+ def extract_attributes(cls, html):
+ attr_dict = {}
+
+ def matchfunc(_tag, attrs):
+ attr_dict.update(attrs)
+ raise cls.AbortException()
+
+ with contextlib.suppress(cls.AbortException):
+ cls(matchfunc).feed(html)
+
+ return attr_dict
+
+ @classmethod
+ def get_elements_text_and_html_by_tag(cls, tag, html):
+ return [tag.text_and_html() for tag in cls.tags_by_name(tag, html)]
@classmethod
def get_element_text_and_html_by_tag(cls, tag, html):
- """
- For the first element with the specified tag in the given HTML document
- return its content (text) and the whole element (html)
- """
- parser = cls(lambda _tag, _: _tag == tag)
- for tag_obj in parser.taglist(html):
- return tag_obj.text_and_html()
- raise compat_HTMLParseError(f'tag {tag} not found')
+ tag = next(cls.tags_by_name(tag, html), None)
+ return tag and tag.text_and_html()
+
+ @classmethod
+ def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
+ return [tag.text_and_html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_elements_by_attribute(cls, *args, **kwargs):
+ return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_elements_html_by_attribute(cls, *args, **kwargs):
+ return [tag.html() for tag in cls.tags_by_attribute(*args, **kwargs)]
+
+ @classmethod
+ def get_element_by_attribute(cls, *args, **kwargs):
+ tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+ return tag and tag.text()
+
+ @classmethod
+ def get_element_html_by_attribute(cls, *args, **kwargs):
+ tag = next(cls.tags_by_attribute(*args, **kwargs), None)
+ return tag and tag.html()
+
+ @classmethod
+ def get_elements_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.text() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_elements_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.html() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_elements_text_and_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ return [tag.text() for tag
+ in cls.tags_by_attribute('class', value, html, escape_value=False)]
+
+ @classmethod
+ def get_element_html_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+ return tag and tag.html()
+
+ @classmethod
+ def get_element_by_class(cls, class_name, html):
+ value = cls.class_value_regex(class_name)
+ tag = next(cls.tags_by_attribute('class', value, html, escape_value=False), None)
+ return tag and tag.text()
From 8451074b501f51cb66c4d5463260320763b9ff69 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 27 Nov 2022 16:22:03 +0100
Subject: [PATCH 06/15] [parsing] fix: don't push unmatched void tags onto
queue
---
test/test_parsing.py | 7 +++++++
yt_dlp/parsing.py | 14 ++++++++------
2 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 75ed8ebf34..880c41a348 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -186,6 +186,9 @@ def test_get_element_text_and_html_by_tag(self):
self.assertIsNone(get_element_text_and_html_by_tag('article', html))
def test_get_elements_text_and_html_by_tag(self):
+ class StrictParser(MatchingElementParser):
+ STRICT = True
+
test_string = '''
@@ -194,6 +197,10 @@ def test_get_elements_text_and_html_by_tag(self):
items = get_elements_text_and_html_by_tag('img', test_string)
self.assertListEqual(items, [('', ''), ('', '')])
+ self.assertEqual(
+ StrictParser.get_element_text_and_html_by_tag('use', ''),
+ ('', ''))
+
def test_get_element_text_and_html_by_tag_malformed(self):
inner_text = 'inner text'
malnested_elements = f'{inner_text}'
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index bcc48c4d3e..8fbb4db14b 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -185,17 +185,19 @@ def handle_starttag(self, tag, attrs):
tag_text = HTMLTagParser.ANY_TAG_REGEX.match(self.rawdata[self._offset:]).group()
tag_obj = tag
+ tag_is_open = not (tag_text.endswith('/>') or tag in self.VOID_TAGS)
if self.predicate(tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
- if tag_text.endswith('/>') or tag in self.VOID_TAGS:
+ if tag_is_open:
+ nesting = []
+ self._nestedtags[-1].append(nesting)
+ self._nestedtags.append(nesting)
+ else:
self._nestedtags[-1].append(tag_obj)
self.callback(tag_obj)
- return
- nesting = []
- self._nestedtags[-1].append(nesting)
- self._nestedtags.append(nesting)
- self.tagstack.appendleft(tag_obj)
+ if tag_is_open:
+ self.tagstack.appendleft(tag_obj)
handle_startendtag = handle_starttag
From dbf350c12291279c0be56cb82922c2fae1c87eb2 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 27 Nov 2022 16:34:06 +0100
Subject: [PATCH 07/15] [parsing] return unclosed matched tags
---
test/test_parsing.py | 9 +++++----
yt_dlp/parsing.py | 4 ++--
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 880c41a348..5887115185 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -218,8 +218,9 @@ def test_get_element_text_and_html_by_tag_malformed(self):
get_element_text_and_html_by_tag('malnested_b', html),
(f'{inner_text}',
f'{inner_text}'))
+ self.assertEqual(
+ get_element_text_and_html_by_tag('orphan', f'{html}'), ('', ''))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
- self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser):
@@ -244,13 +245,13 @@ def test_relaxed_html_parsing(self):
parser = HTMLTagParser()
self.assertEqual(parser.taglist('', reset=True), [])
- self.assertEqual(parser.taglist('', reset=True), [])
+ self.assertEqual(parser.taglist('
', reset=True), [Tag('div'), Tag('p')])
tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('p'), Tag('div')])
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
tags = parser.taglist('
', reset=True)
- self.assertEqual(tags, [Tag('div')])
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 8fbb4db14b..5ecd6b75ca 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -190,7 +190,7 @@ def handle_starttag(self, tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
if tag_is_open:
- nesting = []
+ nesting = [tag_obj]
self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting)
else:
@@ -218,7 +218,7 @@ def handle_endtag(self, tag):
if isinstance(tag_obj, self.Tag):
close_idx = self.rawdata.find('>', self._offset) + 1
tag_obj.closerange(self._offset, close_idx - self._offset)
- self._nestedtags.pop().insert(0, tag_obj)
+ self._nestedtags.pop()
self.callback(tag_obj)
except ValueError as exc:
if isinstance(exc, compat_HTMLParseError):
From 7a67a2028f49f71c2cd4bae0611c2a04e313e840 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 27 Nov 2022 21:26:58 +0100
Subject: [PATCH 08/15] [parsing] tweak tag regex
---
yt_dlp/parsing.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 5ecd6b75ca..d2c2609545 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -261,7 +261,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
return rf'''(?x)
<(?:{tag})
- (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
+ (?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
'''
@@ -278,7 +278,8 @@ def tags_by_name(cls, tag, html):
def matchfunc(tag_str, _attrs):
return tag_str == tag
- yield from cls.iter_tags(rf'<\s*{re.escape(tag)}[\s>]', html, matchfunc=matchfunc)
+ tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
+ yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod
def tags_by_attribute(cls, attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True):
From 29278a3323be5106809e43d2977efcd0e3159a4f Mon Sep 17 00:00:00 2001
From: Marcel
Date: Sun, 27 Nov 2022 16:56:45 +0100
Subject: [PATCH 09/15] [parsing] fix return value
---
test/test_parsing.py | 16 +++++++++++++---
yt_dlp/parsing.py | 4 ++--
2 files changed, 15 insertions(+), 5 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 5887115185..e21299df03 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -195,7 +195,7 @@ class StrictParser(MatchingElementParser):
ignore
'''
items = get_elements_text_and_html_by_tag('img', test_string)
- self.assertListEqual(items, [('', ''), ('', '')])
+ self.assertEqual(items, [('', ''), ('', '')])
self.assertEqual(
StrictParser.get_element_text_and_html_by_tag('use', ''),
@@ -245,16 +245,26 @@ def test_relaxed_html_parsing(self):
parser = HTMLTagParser()
self.assertEqual(parser.taglist('', reset=True), [])
- self.assertEqual(parser.taglist('', reset=True), [Tag('div'), Tag('p')])
+
+ tags = parser.taglist('
', reset=True)
+ self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('
', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('
', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(), ('
/p>', '
'))
+ self.assertEqual(tags[1].text_and_html(), ('', '
'))
tags = parser.taglist('
', reset=True)
self.assertEqual(tags, [Tag('div'), Tag('p')])
+ self.assertEqual(tags[0].text_and_html(),
+ ('
paragraph
', '
'))
self.assertEqual(tags[1].text_and_html(), ('paragraph', '
paragraph
'))
tags = parser.taglist('
must be empty', reset=True)
@@ -315,7 +325,7 @@ def test_tag_return_order(self):
[Tag('t5'), Tag('t6')]],
[Tag('t7'), Tag('t8')]]))
- def test_within_html_comment(self):
+ def test_html_comment_ranges(self):
def mark_comments(_string, char='^', nochar='-'):
cmts = HTMLCommentRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index d2c2609545..8751cd5f9d 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -318,7 +318,7 @@ def get_elements_text_and_html_by_attribute(cls, *args, **kwargs):
@classmethod
def get_elements_by_attribute(cls, *args, **kwargs):
- return [tag.text_and_html()[0] for tag in cls.tags_by_attribute(*args, **kwargs)]
+ return [tag.text() for tag in cls.tags_by_attribute(*args, **kwargs)]
@classmethod
def get_elements_html_by_attribute(cls, *args, **kwargs):
@@ -349,7 +349,7 @@ def get_elements_html_by_class(cls, class_name, html):
@classmethod
def get_elements_text_and_html_by_class(cls, class_name, html):
value = cls.class_value_regex(class_name)
- return [tag.text() for tag
+ return [tag.text_and_html() for tag
in cls.tags_by_attribute('class', value, html, escape_value=False)]
@classmethod
From 6169b3eca81ccde2d6c0116295b2c38e807befb2 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Tue, 29 Nov 2022 00:25:52 +0100
Subject: [PATCH 10/15] [parsing] replace HTMLCommentRanges with
HTMLIgnoreRanges
* ignore matches within CDATA elements and comments
---
test/test_parsing.py | 25 +++++++++-------
yt_dlp/parsing.py | 71 +++++++++++++++++---------------------------
2 files changed, 43 insertions(+), 53 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index e21299df03..1898ee8ab1 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -4,7 +4,7 @@
from yt_dlp.compat import compat_HTMLParseError
from yt_dlp.parsing import (
MatchingElementParser,
- HTMLCommentRanges,
+ HTMLIgnoreRanges,
HTMLTagParser,
)
@@ -325,26 +325,31 @@ def test_tag_return_order(self):
[Tag('t5'), Tag('t6')]],
[Tag('t7'), Tag('t8')]]))
- def test_html_comment_ranges(self):
+ def test_html_ignored_ranges(self):
def mark_comments(_string, char='^', nochar='-'):
- cmts = HTMLCommentRanges(_string)
+ cmts = HTMLIgnoreRanges(_string)
return "".join(char if _idx in cmts else nochar for _idx in range(len(_string)))
html_string = '''
no comments in this line
---------------------------------------------------------------------
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---
before after
- -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--------
+ -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
+ this is a leftover comment --> and end
- ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------
- this ends here --> and not here
- -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------------------------
- stray --> comment closings --> are ignored ' encountered
- note: markers within quotes are not ignored
+ usage:
+ ranges = HTMLIgnoreRanges(html)
+ if offset in ranges:
+ ...
"""
+ REGEX = re.compile(r'|?\s*(?:script|style)\b[^>]*>')
def __init__(self, html):
- self._range_iter = self.ranges(html)
- self._range = next(self._range_iter, None)
- self._last_offset = 0
-
- @staticmethod
- def ranges(string, sopen=''):
- assert not (sopen.startswith(sclose) or sclose.startswith(sopen))
- open_iter = iter_find(string, sopen)
- close_len = len(sclose)
- close_iter = (idx + close_len for idx in iter_find(string, sclose))
- next_open = next(open_iter, None)
- next_close = next(close_iter, None)
-
- while True:
- if next_open is None:
- return
- while next_close is not None and next_open > next_close:
- next_close = next(close_iter, None)
- yield slice(next_open, next_close)
- if next_close is None:
- return
- while next_open is not None and next_open < next_close:
- next_open = next(open_iter, None)
+ self.html = html
+ self._last_match = None
+ self._final = False
def __contains__(self, offset):
assert isinstance(offset, int)
- assert offset >= self._last_offset, 'offset must be in increasing order'
- self._last_offset = offset
- while self._range and self._range.stop is not None and offset >= self._range.stop:
- self._range = next(self._range_iter, None)
- return not (self._range is None or offset < self._range.start)
+ if not self._final and (self._last_match is None or offset >= self._last_match.end()):
+ match = self.REGEX.search(self.html, offset)
+ if match:
+ self._last_match = match
+ else:
+ self._final = True
+
+ if self._last_match is None:
+ return False
+ match_string = self._last_match.group()
+ if match_string.startswith('') or match_string == '-->':
+ return offset < self._last_match.start()
+ return offset >= self._last_match.end()
class HTMLTagParser(HTMLParser):
@@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
@classmethod
def iter_tags(cls, regex, html, *, matchfunc):
- comments = HTMLCommentRanges(html)
+ ignored = HTMLIgnoreRanges(html)
parser = cls(matchfunc)
for match in re.finditer(regex, html):
- if match.start() not in comments:
+ if match.start() not in ignored:
yield from parser.taglist(html[match.start():], reset=True)
@classmethod
From 65f91148fc6fcbce967d775527edb95b567db0cb Mon Sep 17 00:00:00 2001
From: Marcel
Date: Tue, 29 Nov 2022 15:01:18 +0100
Subject: [PATCH 11/15] [parsing] search for case-insensitive tag names
---
test/test_parsing.py | 4 ++++
yt_dlp/parsing.py | 4 ++--
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 1898ee8ab1..8a36beda44 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -222,6 +222,10 @@ def test_get_element_text_and_html_by_tag_malformed(self):
get_element_text_and_html_by_tag('orphan', f'{html}'), ('', ''))
self.assertIsNone(get_element_text_and_html_by_tag('orphan', f'{html}'))
+ # ignore case on tags
+ ci_html = f'{html}'
+ self.assertEqual(get_element_text_and_html_by_tag('span', ci_html), (html, ci_html))
+
def test_strict_html_parsing(self):
class StrictTagParser(HTMLTagParser):
STRICT = True
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 1698591e34..1db6704dd2 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -245,7 +245,7 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True):
value_regex = re.escape(value_regex)
return rf'''(?x)
- <(?:{tag})
+ <(?i:{tag})
(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?
\s{re.escape(attribute)}\s*=\s*(?P<_q>['"])(?-x:{value_regex})(?P=_q)
'''
@@ -263,7 +263,7 @@ def tags_by_name(cls, tag, html):
def matchfunc(tag_str, _attrs):
return tag_str == tag
- tag_regex = rf'''<\s*{re.escape(tag)}(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
+ tag_regex = rf'''<\s*(?i:{re.escape(tag)})(?:\s(?:[^>"'\\]|"[^"\\]*"|'[^'\\]*')*)?>'''
yield from cls.iter_tags(tag_regex, html, matchfunc=matchfunc)
@classmethod
From 8d87bb4d91ed732bc08bd39ce114bdcca63abf68 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Wed, 30 Nov 2022 17:21:09 +0100
Subject: [PATCH 12/15] [parsing] unify tag nesting
---
test/test_parsing.py | 6 +++---
yt_dlp/parsing.py | 5 ++---
2 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 8a36beda44..a7e7ec7d46 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -325,9 +325,9 @@ def test_tag_return_order(self):
str(tags), str([
[Tag('t0'),
[Tag('t1'),
- [Tag('t2'), Tag('t3'), Tag('t4')]],
- [Tag('t5'), Tag('t6')]],
- [Tag('t7'), Tag('t8')]]))
+ [Tag('t2'), [Tag('t3')], [Tag('t4')]]],
+ [Tag('t5'), [Tag('t6')]]],
+ [Tag('t7'), [Tag('t8')]]]))
def test_html_ignored_ranges(self):
def mark_comments(_string, char='^', nochar='-'):
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 1db6704dd2..c6748d2d8f 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -174,12 +174,11 @@ def handle_starttag(self, tag, attrs):
if self.predicate(tag, attrs):
tag_obj = self.Tag(tag, string=self.rawdata, attrs=attrs)
tag_obj.openrange(self._offset, len(tag_text))
+ nesting = [tag_obj]
+ self._nestedtags[-1].append(nesting)
if tag_is_open:
- nesting = [tag_obj]
- self._nestedtags[-1].append(nesting)
self._nestedtags.append(nesting)
else:
- self._nestedtags[-1].append(tag_obj)
self.callback(tag_obj)
if tag_is_open:
self.tagstack.appendleft(tag_obj)
From 7a9dd3d35fa793f8f6fd1bff7ab9d500e025f9b4 Mon Sep 17 00:00:00 2001
From: Marcel
Date: Fri, 2 Dec 2022 20:54:04 +0100
Subject: [PATCH 13/15] [parsing] inline tag_obj.closerange()
---
yt_dlp/parsing.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index c6748d2d8f..256ba8e6c7 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -200,8 +200,7 @@ def handle_endtag(self, tag):
tag_obj = self.tagstack[idx]
self.tagstack.remove(tag)
if isinstance(tag_obj, self.Tag):
- close_idx = self.rawdata.find('>', self._offset) + 1
- tag_obj.closerange(self._offset, close_idx - self._offset)
+ tag_obj.closerange(slice(self._offset, self.rawdata.find('>', self._offset) + 1))
self._nestedtags.pop()
self.callback(tag_obj)
except ValueError as exc:
From c34166d7c8d64f065eb05a6447e268a7b7dc3e6e Mon Sep 17 00:00:00 2001
From: flashdagger
Date: Mon, 13 Nov 2023 06:54:28 +0100
Subject: [PATCH 14/15] [parsing] support uppercase SCRIPT tags as suggested by
github-advanced-security bot
---
test/test_parsing.py | 2 +-
yt_dlp/parsing.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index a7e7ec7d46..0e006298f7 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -345,7 +345,7 @@ def mark_comments(_string, char='^', nochar='-'):
^^^^^^^^^^^^^^^^^^^^^^^^^^^------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
here is and end
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
-
+
--------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
'''
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index 256ba8e6c7..f4aaf1ac4f 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -20,7 +20,7 @@ class HTMLIgnoreRanges:
if offset in ranges:
...
"""
- REGEX = re.compile(r'|?\s*(?:script|style)\b[^>]*>')
+ REGEX = re.compile(r'|?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
def __init__(self, html):
self.html = html
From a91d9e1084ca87472b952d189eb897dc8a52fec5 Mon Sep 17 00:00:00 2001
From: flashdagger
Date: Mon, 13 Nov 2023 07:14:14 +0100
Subject: [PATCH 15/15] [parsing] support comment end tag '--!>' as suggested
by github-advanced-security bot
---
test/test_parsing.py | 2 +-
yt_dlp/parsing.py | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/test/test_parsing.py b/test/test_parsing.py
index 0e006298f7..9641df91df 100644
--- a/test/test_parsing.py
+++ b/test/test_parsing.py
@@ -343,7 +343,7 @@ def mark_comments(_string, char='^', nochar='-'):
-----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-----------
this is a leftover comment --> and end
+ here is and end
----------------^^^^^^^^^^^----------------^^^^^^^^^^^^^^^^^---------
--------^^^^^^^^^^^^^-----------------------------^^^^^^^^^^---------
diff --git a/yt_dlp/parsing.py b/yt_dlp/parsing.py
index f4aaf1ac4f..72d7e448bd 100644
--- a/yt_dlp/parsing.py
+++ b/yt_dlp/parsing.py
@@ -20,7 +20,7 @@ class HTMLIgnoreRanges:
if offset in ranges:
...
"""
- REGEX = re.compile(r'|?\s*(?:script|style)\b[^>]*>', flags=re.IGNORECASE)
+ REGEX = re.compile(r'':
+ if match_string.startswith('') or match_string in ('-->', '--!>'):
return offset < self._last_match.start()
return offset >= self._last_match.end()