[core] _VALID_URLS implementation

- Implemented the _VALID_URLS extractor property (a list of URL regexes that an extractor's URLs are matched against); improvement suggested by @Grub4k
2024-11-30 07:28:19 +01:00 · 2022-12-15 16:46:37 +01:00 · 2022-12-15 16:46:37 +01:00 · db96683cf1
commit db96683cf1
parent c733555106
2 changed files with 10 additions and 2 deletions
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -14,7 +14,7 @@

 NO_ATTR = object()
 STATIC_CLASS_PROPERTIES = [
-    'IE_NAME', '_ENABLED', '_VALID_URL',  # Used for URL matching
+    'IE_NAME', '_ENABLED', '_VALID_URL', '_VALID_URLS',  # Used for URL matching
    '_WORKING', 'IE_DESC', '_NETRC_MACHINE', 'SEARCH_KEY',  # Used for --extractor-descriptions
    'age_limit',  # Used for --age-limit (evaluated)
    '_RETURN_TYPE',  # Accessed in CLI only with instance (evaluated)
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -447,7 +447,7 @@ class InfoExtractor:


    Subclasses of this should also be added to the list of extractors and
-    should define a _VALID_URL regexp and, re-define the _real_extract() and
+    should define a _VALID_URL regexp (or a list of _VALID_URLS) and re-define the _real_extract() and
    (optionally) _real_initialize() methods.

    Subclasses may also override suitable() if necessary, but ensure the function
@@ -508,6 +508,7 @@ class InfoExtractor:
    IE_DESC = None
    SEARCH_KEY = None
    _VALID_URL = None
+    _VALID_URLS = []
    _EMBED_REGEX = []

    def _login_hint(self, method=NO_DEFAULT, netrc=None):
@@ -534,6 +535,13 @@ def __init__(self, downloader=None):
    def _match_valid_url(cls, url):
        if cls._VALID_URL is False:
            return None
+
+        if cls._VALID_URLS:
+            if '_VALID_URLS_RE' not in cls.__dict__:
+                cls._VALID_URLS_RE = tuple(map(re.compile, cls._VALID_URLS))
+            return next(filter(None, (
+                valid_url_re.match(url) for valid_url_re in cls._VALID_URLS_RE)), None)
+
        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass