diff --git a/test/test_parsing.py b/test/test_parsing.py index e21299df0..1898ee8ab 100644 --- a/test/test_parsing.py +++ b/test/test_parsing.py @@ -4,7 +4,7 @@ from yt_dlp.compat import compat_HTMLParseError from yt_dlp.parsing import ( MatchingElementParser, - HTMLCommentRanges, + HTMLIgnoreRanges, HTMLTagParser, ) @@ -325,26 +325,31 @@ def test_tag_return_order(self): [Tag('t5'), Tag('t6')]], [Tag('t7'), Tag('t8')]])) - def test_html_comment_ranges(self): + def test_html_ignored_ranges(self): def mark_comments(_string, char='^', nochar='-'): - cmts = HTMLCommentRanges(_string) + cmts = HTMLIgnoreRanges(_string) return "".join(char if _idx in cmts else nochar for _idx in range(len(_string))) html_string = ''' no comments in this line --------------------------------------------------------------------- - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^--- before after - -------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^-------- + -----------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^----------- + this is a leftover comment --> and end - ------------^^^^^^^^^^^^^^^^^^---------^^^^^^^^^^^^^^^^^^^^^^^^------ - this ends here --> and not here - -----^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^---------------------------- - stray --> comment closings --> are ignored ' encountered - note: markers within quotes are not ignored + usage: + ranges = HTMLIgnoreRanges(html) + if offset in ranges: + ... """ + REGEX = re.compile(r'|]*>') def __init__(self, html): - self._range_iter = self.ranges(html) - self._range = next(self._range_iter, None) - self._last_offset = 0 - - @staticmethod - def ranges(string, sopen=''): - assert not (sopen.startswith(sclose) or sclose.startswith(sopen)) - open_iter = iter_find(string, sopen) - close_len = len(sclose) - close_iter = (idx + close_len for idx in iter_find(string, sclose)) - next_open = next(open_iter, None) - next_close = next(close_iter, None) - - while True: - if next_open is None: - return - while next_close is not None and next_open > next_close: - next_close = next(close_iter, None) - yield slice(next_open, next_close) - if next_close is None: - return - while next_open is not None and next_open < next_close: - next_open = next(open_iter, None) + self.html = html + self._last_match = None + self._final = False def __contains__(self, offset): assert isinstance(offset, int) - assert offset >= self._last_offset, 'offset must be in increasing order' - self._last_offset = offset - while self._range and self._range.stop is not None and offset >= self._range.stop: - self._range = next(self._range_iter, None) - return not (self._range is None or offset < self._range.start) + if not self._final and (self._last_match is None or offset >= self._last_match.end()): + match = self.REGEX.search(self.html, offset) + if match: + self._last_match = match + else: + self._final = True + + if self._last_match is None: + return False + match_string = self._last_match.group() + if match_string.startswith('': + return offset < self._last_match.start() + return offset >= self._last_match.end() class HTMLTagParser(HTMLParser): @@ -267,10 +252,10 @@ def matching_tag_regex(tag, attribute, value_regex, escape=True): @classmethod def iter_tags(cls, regex, html, *, matchfunc): - comments = HTMLCommentRanges(html) + ignored = HTMLIgnoreRanges(html) parser = cls(matchfunc) for match in re.finditer(regex, html): - if match.start() not in comments: + if match.start() not in ignored: yield from parser.taglist(html[match.start():], reset=True) @classmethod