[errarhiiv] Add new extractor (closes #24434)

This commit is contained in:
smarbaa 2021-06-07 14:14:40 +02:00
parent c2350cac24
commit 8109332379
2 changed files with 505 additions and 0 deletions

youtube_dl/extractor/errarhiiv.py

@@ -0,0 +1,501 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import locale
from datetime import datetime, date

from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
    ExtractorError,
    unified_timestamp,
    parse_duration,
    orderedSet,
    clean_html,
)


class ERRArhiivBaseIE(InfoExtractor):
    _GEO_COUNTRIES = ['EE']
    def _real_initialize(self):
        # Estonian month names are parsed with strptime('%B %Y') below,
        # so the Estonian locale must be available on the host system
        locale.setlocale(locale.LC_TIME, 'et_EE.UTF-8')


class ERRArhiivIE(ERRArhiivBaseIE):
    IE_DESC = 'TV and radio shows, movies and documentaries aired in ETV (Estonia)'
    _VALID_URL = r'https?://arhiiv\.err\.ee/vaata/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://arhiiv.err.ee/vaata/eesti-aja-lood-okupatsioonid-muusad-soja-varjus',
        'md5': 'bb8b0b04fac8173d3deb47f07a43342d',
        'info_dict': {
            'id': 'eesti-aja-lood-okupatsioonid-muusad-soja-varjus',
            'display_id': 'eesti-aja-lood-okupatsioonid-muusad-soja-varjus',
            'ext': 'mp4',
            'title': 'Eesti aja lood. Okupatsioonid - Muusad sõja varjus',
            'thumbnail': 'https://arhiiv.err.ee//thumbnails/2009-002267-0068_0001_D10_EESTI-AJA-LOOD-OKUPATSIOONID_th.jpg',
            'description': 'md5:36772936a0982571ce23aa0dad1f6231',
            'upload_date': '20100513',
            'uploader': 'ERR',
            'timestamp': 1273709330,
        },
    }, {
        'url': 'https://arhiiv.err.ee/vaata/tallinn-mai-juuni-1976',
        'md5': 'b5eae775571497cb7cf3ba7c3cd34625',
        'info_dict': {
            'id': 'tallinn-mai-juuni-1976',
            'display_id': 'tallinn-mai-juuni-1976',
            'ext': 'mp4',
            'title': 'Tallinn. Mai-juuni 1976',
            'thumbnail': 'https://arhiiv.err.ee//thumbnails/1976-085466-0001_0002_D10_TALLINN-MAI-JUUNI-1976_th.jpg',
            'upload_date': '20190709',
            'uploader': 'ERR',
            'timestamp': 1562679643,
        },
    }, {
        'url': 'https://arhiiv.err.ee/vaata/linnulaul-linnulaul-34-rukkiraak',
        'md5': '4f9f659c9c6c6a99c01f423895ac377e',
        'info_dict': {
            'id': 'linnulaul-linnulaul-34-rukkiraak',
            'display_id': 'linnulaul-linnulaul-34-rukkiraak',
            'ext': 'm4a',
            'title': 'Linnulaul - 34. Rukkirääk',
            'thumbnail': 'https://arhiiv.err.ee//thumbnails/default_audio_th.jpg',
            'description': 'md5:d41739b0c8e250a3435216afc98c8741',
            'release_date': '20020530',
            'channel': '2002 EESTI RAADIO',
            'uploader': 'ERR',
        },
    }]

    def _log_debug_message(self, msg):
        """Writes debug message only if verbose flag is set"""
        if self._downloader.params.get('verbose', False):
            self.to_screen('[debug] ' + msg)

    def _report_properties(self, info):
        """Writes debug info about extracted properties"""
        if not info:
            return
        for key in sorted(filter(lambda k: k != 'chapters', info)):
            self.to_screen("[debug] %s: '%s'" % (key, info[key]))
        if 'chapters' in info:
            for (idx, chapter) in enumerate(info['chapters'], start=1):
                self.to_screen(
                    "[debug] Chapter %d: %s - %s '%s'" % (
                        idx,
                        self._format_duration(chapter['start_time'], always_minutes=True),
                        self._format_duration(chapter['end_time'], always_minutes=True),
                        chapter['title']))

    @classmethod
    def _clean_up_properties(cls, info):
        """Deletes internally used properties"""
        if not info:
            return
        info.pop('iso', None)
        info.pop('file', None)
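
    # The /vaata/ page carries its metadata in an HTML description table;
    # the helper below maps the Estonian header cells ('Sarja pealkiri',
    # 'Kestus', 'Märksõnad', ...) onto the matching info_dict fields.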
    def _extract_properties(self, webpage):
        info = dict()
        title = self._html_search_regex(
            r'<head>[^<]*<title>([^|]+)[^<]*?</title>',
            webpage, 'title', flags=re.DOTALL)
        if title:
            info['title'] = title.strip().strip('.')
        description = self._html_search_regex(
            r'<h2>\s*?Kirjeldus\s*?</h2>\s*?<p>(.*?)</p>',
            webpage, 'description', flags=re.DOTALL, default=None)
        if description:
            info['description'] = description.strip()
        rights = self._html_search_regex(
            r'<div[^>]+id=(["\'])rights\1[^>]*>(?P<rights>.*?)</div>',
            webpage, 'rights', flags=re.DOTALL, group='rights', default=None)
        if rights:
            # Remove ugly unnecessary whitespace
            info['license'] = ' '.join(rights.split())
        thumbnail = self._search_regex(
            r"css\('background-image', 'url\(\"(.+?).jpg\"\)'\)",
            webpage, 'thumbnail', flags=re.DOTALL, default=None)
        if thumbnail:
            info['thumbnail'] = thumbnail + '_th.jpg'
        res = re.findall(
            r'<th[^>]*>([^<:\*\.]+)[^<]*?</th>[^<]*?<td[^>]*>(.*?)</td>',
            webpage, re.DOTALL)
        year = None
        if res:
            for (name, value) in res:
                name = name.strip()
                value = value.strip()
                if name == 'Sarja pealkiri':
                    info['series'] = value
                elif name == 'Osa nr':
                    info['episode_number'] = int(value)
                elif name == 'Uudislugu':
                    info['description'] = value
                elif name == 'ISO':
                    info['iso'] = value
                elif name == 'Fail':
                    info['file'] = value
                elif name == 'Märksõnad':
                    # Tags can be:
                    # * simple like 'huumor';
                    # * complex like 'intervjuud/vestlusringid';
                    # * weird like 'meedia (raadio, tv, press)'.
                    # See e.g. https://arhiiv.err.ee/vaata/homme-on-esimene-aprill
                    tags = re.sub(r'\(|\)|,|/', ' ', clean_html(value)).split()
                    if tags:
                        info['tags'] = sorted(
                            map(lambda s: s.strip().lower(), tags))
                elif name in ['Aasta', 'Võtte aasta']:
                    year = value
                elif name in ['Autorid', 'Režissöör', 'Toimetajad', 'Esinejad']:
                    if 'creator' not in info:
                        info['creator'] = set()
                    info['creator'] = info['creator'] | set(
                        re.split(r'\s*,\s*', value))
                elif name == 'Fonogrammi tootja':
                    info['channel'] = value
                    mobj = re.search(r'([0-9]{4})', value)
                    if not year and mobj:
                        year = mobj.group(0)
                elif name == 'Kestus':
                    info['duration'] = parse_duration(value)
                elif name == 'Kategooria':
                    categories = re.split(r'\s*&rarr;\s*|\s*,\s*', value)
                    info['categories'] = sorted(filter(
                        lambda s: s != 'Muu',
                        set(map(lambda s: s.capitalize(), categories))))
                elif name == 'Registreerimise kuupäev':
                    try:
                        info['upload_date'] = datetime.strptime(
                            value, '%d.%m.%Y').date().strftime('%Y%m%d')
                    except ValueError as ex:
                        self._log_debug_message(
                            "Failed to parse upload_date '%s' %s" % (value, ex))
                elif name == 'Registreerimise aeg':
                    info['timestamp'] = unified_timestamp(value)
                elif name in ['Esmaeeter', 'Eetris']:
                    try:
                        info['release_date'] = datetime.strptime(
                            value, '%d.%m.%Y').date()
                    except ValueError as ex:
                        self._log_debug_message(
                            "Failed to parse release_date '%s' %s" % (value, ex))
                    if 'release_date' not in info:
                        try:
                            info['release_date'] = datetime.strptime(
                                value, '%B %Y').date()
                        except (ValueError, TypeError) as ex:
                            self._log_debug_message(
                                "Failed to parse release_date '%s' %s" % (value, ex))
                    if 'release_date' not in info:
                        # Try for a year yyyy
                        mobj = re.search(r'([0-9]{4})', value)
                        if mobj:
                            info['release_date'] = date(
                                year=int(mobj.group(0)), month=1, day=1)
                    if 'release_date' in info:
                        info['release_date'] = info['release_date'].strftime('%Y%m%d')
        if year and 'release_date' not in info:
            info['release_date'] = date(
                year=int(year), month=1, day=1).strftime('%Y%m%d')
        if 'release_date' in info and not year:
            mobj = re.match(r'\d{4}', info['release_date'])
            if mobj:
                year = mobj.group(0)
        if 'channel' not in info:
            channel = list()
            if year:
                channel.append(year)
            channel.append('ERR')
            info['channel'] = ' '.join(channel)
        info['uploader'] = 'ERR'
        info['is_live'] = False
        if 'creator' in info:
            info['creator'] = ', '.join(sorted(info['creator']))
        if 'series' in info:
            episode = info['title']
            prefix = info['series'].upper()
            if episode.upper().startswith(prefix + ': ' + prefix):
                # ERR Arhiiv sometimes mangles the episode title by adding
                # the series name twice as a prefix. This hack corrects it.
                episode = episode[len(prefix + ': ' + prefix):]
            elif episode.upper().startswith(prefix):
                episode = episode[len(prefix):]
            if episode.startswith(': '):
                episode = episode[len(': '):]
            elif episode.startswith('. '):
                episode = episode[len('. '):]
            episode = episode.strip()
            if episode:
                info['episode'] = episode
                info['title'] = info['series'] + ' - ' + info['episode']
            else:
                self.report_warning('Episode name reduced to an empty string')
        if 'series' in info and year:
            info['season'] = year
        return info
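
    # Worked example for the formatter below (illustrative only):
    #   _format_duration(3725) == '01:02:05'
    #   _format_duration(5) == '05'
    #   _format_duration(5, always_minutes=True) == '00:05'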
    @classmethod
    def _format_duration(cls, duration, always_minutes=False, always_hours=False):
        """Formats duration as HH:MM:SS"""
        if duration is None:
            return None
        minutes, seconds = divmod(duration, 60)
        hours, minutes = divmod(minutes, 60)
        rval = '%02d' % seconds
        if always_hours or always_minutes or minutes or hours:
            rval = '%02d:' % minutes + rval
        if always_hours or hours:
            rval = '%02d:' % hours + rval
        return rval

    def _extract_chapters(self, webpage, total_duration):
        res = re.findall(
            r'<tr>[^<]*<td[^>]+class=(["\'])data\1[^>]*>([^<]+)</td>'
            r'[^<]*<td[^>]+class=(["\'])data\3[^>]*>.*?</td>'
            r'[^<]*<td[^>]+class=(["\'])data\4[^>]*>(.*?)</td>[^<]*</tr>',
            webpage, re.DOTALL)
        chapters = list()
        prev_chapter = dict()
        correction = 0
        for match in res:
            chapter = dict()
            duration = parse_duration(match[1])
            if not prev_chapter:
                # ERR Arhiiv sometimes adds some arbitrary amount of seconds
                # to all timings. This hack corrects it by subtracting the
                # first chapter's start_time from all subsequent timings.
                correction = duration
            duration -= correction
            chapter['start_time'] = duration
            chapter['title'] = match[4].strip()
            if prev_chapter:
                prev_chapter['end_time'] = duration
            chapters.append(chapter)
            prev_chapter = chapter
        if prev_chapter:
            prev_chapter['end_time'] = total_duration
        return chapters
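
    # Illustration of the timing correction above (made-up numbers): raw
    # chapter marks at 00:10, 01:40 and 03:05 share a 10-second offset, so
    # the extracted start_times become 0, 90 and 175.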
    def _real_extract(self, url):
        info = dict()
        video_id = self._match_id(url)
        self._log_debug_message("Extracted id '%s'" % video_id)
        info['display_id'] = video_id
        info['id'] = video_id
        info['webpage_url'] = url
        webpage = self._download_webpage(url, video_id)
        master_url = self._search_regex(
            r"var\s+src\s*=\s*'(//.+?\.m3u8)';",
            webpage, 'master_url', flags=re.DOTALL, fatal=False)
        self._log_debug_message("Extracted master_url '%s'" % master_url)
        info.update(self._extract_properties(webpage))
        if not master_url:
            error_msg = 'Cannot extract master_url. Video or audio %s is not available' % video_id
            if 'iso' in info or re.match(r'.*(?:TIFF|JPEG).*', info.get('file', '')):
                error_msg += ', the URL refers to a photo'
            raise ExtractorError(error_msg, expected=True)
        m3u8_formats = []
        try:
            m3u8_formats = self._extract_m3u8_formats(master_url, video_id)
        except ExtractorError as ex:
            if isinstance(ex.cause, compat_HTTPError) and ex.cause.code == 404:
                if re.search(r'\.urlset/master\.m3u8', master_url):
                    self.report_warning(
                        "master_url links to nonexistent resource '%s'" % master_url)
            raise
        for m3u8_format in m3u8_formats:
            if not m3u8_format.get('ext'):
                # Guess the container from the media URL,
                # e.g. '.../video.mp4/index.m3u8' -> 'mp4'
                mobj = re.search(r'\.(\w{3})/', m3u8_format['url'])
                if mobj:
                    m3u8_format['ext'] = mobj.group(1)
                else:
                    m3u8_format['ext'] = 'mp4' if m3u8_format.get('vcodec') != 'none' else 'm4a'
        if m3u8_formats:
            self._sort_formats(m3u8_formats)
            info['formats'] = m3u8_formats
        if 'duration' in info:
            chapters = self._extract_chapters(webpage, info['duration'])
            if chapters:
                info['chapters'] = chapters
        if self._downloader.params.get('verbose', False):
            self._report_properties(info)
        self._clean_up_properties(info)
        return info


class ERRArhiivPlaylistIE(ERRArhiivBaseIE):
    IE_DESC = 'arhiiv.err.ee playlists and search results'
    _ERRARHIIV_SERVICES = 'seeria|samast-seeriast|sarnased|otsi|tapsem-otsing|show-category-single-files'
    _VALID_URL = r'(?P<prefix>https?://arhiiv\.err\.ee)/(?P<service>%(services)s)[/?#]*(?P<id>[^/?#]*)' % {
        'services': _ERRARHIIV_SERVICES,
    }
    _TESTS = [{
        'url': 'https://arhiiv.err.ee/seeria/linnuaabits/info/0/default/koik',
        'info_dict': {
            'id': 'linnuaabits',
            'title': 'Linnuaabits',
        },
        'playlist_mincount': 71,
    }, {
        'url': 'https://arhiiv.err.ee/seeria/linnulaul',
        'info_dict': {
            'id': 'linnulaul',
            'title': 'Linnulaul',
        },
        'playlist_mincount': 10,
    }, {
        'url': 'https://arhiiv.err.ee/seeria/eesti-aja-lood-okupatsioonid/info/0/default/koik',
        'info_dict': {
            'id': 'eesti-aja-lood-okupatsioonid',
            'title': 'Eesti aja lood. Okupatsioonid',
        },
        'playlist_mincount': 46,
    }, {
        'url': 'https://arhiiv.err.ee/samast-seeriast/ak-filmikroonika-1958-1991-linnuturg-keskturul/default/1',
        'info_dict': {
            'id': 'ak-filmikroonika-1958-1991',
            'title': 'AK filmikroonika 1958-1991',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://arhiiv.err.ee/sarnased/ensv-ensv-kaadri-taga/default/1',
        'info_dict': {
            'id': 'ensv',
            'title': 'EnsV - Sarnased saated',
        },
        'playlist_count': 10,
    }, {
        'url': 'https://arhiiv.err.ee/otsi/reliikvia/default/koik',
        'info_dict': {
            'id': None,
            'title': 'Otsingutulemused `reliikvia`',
        },
        'playlist_mincount': 161,
    }, {
        'url': 'https://arhiiv.err.ee/otsi/reliikvia/default/3',
        'info_dict': {
            'id': None,
            'title': 'Otsingutulemused `reliikvia`',
        },
        'playlist_mincount': 10,
    }, {
        'url': 'https://arhiiv.err.ee/tapsem-otsing?searchphrase=kahur&searchfrom_video=video&searchfrom_audio=audio',
        'info_dict': {
            'id': None,
            'title': 'Otsingutulemused',
        },
        'playlist_mincount': 10,
    }]

    def _guess_id_from_title(self, title):
        if not title:
            return None
        playlist_id = title.lower()
        playlist_id = ' '.join(playlist_id.split())
        playlist_id = playlist_id.replace('õ', 'o')
        playlist_id = playlist_id.replace('ö', 'o')
        playlist_id = playlist_id.replace('ä', 'a')
        playlist_id = playlist_id.replace('ü', 'u')
        playlist_id = playlist_id.replace(' ', '-')
        playlist_id = re.sub(r",|\.|:|\+|\?|!|'|\"|;|\*|\\|/|\|", '', playlist_id)
        return playlist_id
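
    # E.g. _guess_id_from_title('Eesti aja lood. Okupatsioonid') returns
    # 'eesti-aja-lood-okupatsioonid' (cf. the third test case above).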
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        service = mobj.group('service')
        playlist_id = mobj.group('id') if service in [
            'seeria', 'show-category-single-files'] else None
        prefix = mobj.group('prefix')
        webpage = self._download_webpage(url, playlist_id)
        title = self._html_search_regex(
            r'<head>[^<]*<title>([^|]+)[^<]*?</title>',
            webpage, 'title', flags=re.DOTALL, fatal=False)
        if title:
            title = title.strip().strip('.')
        if title and not playlist_id and service not in [
                'otsi', 'tapsem-otsing', 'show-category-single-files']:
            playlist_id = self._guess_id_from_title(title)
        if title and service == 'sarnased':
            title += ' - Sarnased saated'
        res = re.findall(
            r'<h2[^>]*>[^<]*<a\s+href=(["\'])(/vaata/[^"\']+)\1[^>]*>',
            webpage, re.DOTALL)
        url_list = orderedSet([prefix + match[1] for match in res])
        entries = [self.url_result(item_url, ie='ERRArhiiv')
                   for item_url in url_list]
        return self.playlist_result(entries, playlist_id, title)
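
A minimal sketch of exercising the new extractor through youtube-dl's embedding API, assuming the youtube_dl package and the et_EE.UTF-8 locale are installed (the URL comes from the test cases above):

import youtube_dl

ydl = youtube_dl.YoutubeDL({'quiet': True})
# download=False runs only the extractor and returns the info dict
info = ydl.extract_info(
    'https://arhiiv.err.ee/vaata/tallinn-mai-juuni-1976', download=False)
print(info['title'], info.get('duration'))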

youtube_dl/extractor/extractors.py

@@ -345,6 +345,10 @@ from .embedly import EmbedlyIE
from .engadget import EngadgetIE
from .eporner import EpornerIE
from .eroprofile import EroProfileIE
from .errarhiiv import (
    ERRArhiivIE,
    ERRArhiivPlaylistIE,
)
from .escapist import EscapistIE
from .espn import (
    ESPNIE,