From 8109332379eed14e62ea49d422a6886ac551cf25 Mon Sep 17 00:00:00 2001 From: smarbaa <19408035+smarbaa@users.noreply.github.com> Date: Mon, 7 Jun 2021 14:14:40 +0200 Subject: [PATCH] [errarhiiv] Add new extractor (closes #24434) --- youtube_dl/extractor/errarhiiv.py | 501 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 505 insertions(+) create mode 100644 youtube_dl/extractor/errarhiiv.py diff --git a/youtube_dl/extractor/errarhiiv.py b/youtube_dl/extractor/errarhiiv.py new file mode 100644 index 000000000..e4f611d41 --- /dev/null +++ b/youtube_dl/extractor/errarhiiv.py @@ -0,0 +1,501 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import locale + +from datetime import datetime, date + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import (ExtractorError, unified_timestamp, parse_duration, + orderedSet, clean_html) + + +class ERRArhiivBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['EE'] + + def _real_initialize(self): + # Have to set Estonian locale for date parsing + locale.setlocale(locale.LC_TIME, 'et_EE.UTF-8') + + +class ERRArhiivIE(ERRArhiivBaseIE): + IE_DESC = 'TV and radio shows, movies and documentaries aired in ETV (Estonia)' + _VALID_URL = r'https?://arhiiv\.err\.ee/vaata/(?P[^/?#]+)' + _TESTS = [{ + 'url': + 'https://arhiiv.err.ee/vaata/eesti-aja-lood-okupatsioonid-muusad-soja-varjus', + 'md5': 'bb8b0b04fac8173d3deb47f07a43342d', + 'info_dict': { + 'id': 'eesti-aja-lood-okupatsioonid-muusad-soja-varjus', + 'display_id': 'eesti-aja-lood-okupatsioonid-muusad-soja-varjus', + 'ext': 'mp4', + 'title': 'Eesti aja lood. Okupatsioonid - Muusad sõja varjus', + 'thumbnail': + 'https://arhiiv.err.ee//thumbnails/2009-002267-0068_0001_D10_EESTI-AJA-LOOD-OKUPATSIOONID_th.jpg', + 'description': 'md5:36772936a0982571ce23aa0dad1f6231', + 'upload_date': '20100513', + 'uploader': 'ERR', + 'timestamp': 1273709330, + } + }, { + 'url': 'https://arhiiv.err.ee/vaata/tallinn-mai-juuni-1976', + 'md5': 'b5eae775571497cb7cf3ba7c3cd34625', + 'info_dict': { + 'id': 'tallinn-mai-juuni-1976', + 'display_id': 'tallinn-mai-juuni-1976', + 'ext': 'mp4', + 'title': 'Tallinn. Mai-juuni 1976', + 'thumbnail': + 'https://arhiiv.err.ee//thumbnails/1976-085466-0001_0002_D10_TALLINN-MAI-JUUNI-1976_th.jpg', + 'upload_date': '20190709', + 'uploader': 'ERR', + 'timestamp': 1562679643, + } + }, { + 'url': 'https://arhiiv.err.ee/vaata/linnulaul-linnulaul-34-rukkiraak', + 'md5': '4f9f659c9c6c6a99c01f423895ac377e', + 'info_dict': { + 'id': 'linnulaul-linnulaul-34-rukkiraak', + 'display_id': 'linnulaul-linnulaul-34-rukkiraak', + 'ext': 'm4a', + 'title': 'Linnulaul - 34. 
Rukkirääk', + 'thumbnail': + 'https://arhiiv.err.ee//thumbnails/default_audio_th.jpg', + 'description': 'md5:d41739b0c8e250a3435216afc98c8741', + 'release_date': '20020530', + 'channel': '2002 EESTI RAADIO', + 'uploader': 'ERR', + } + }] + + def _log_debug_message(self, msg): + """Writes debug message only if verbose flag is set""" + if self._downloader.params.get('verbose', False): + self.to_screen('[debug] ' + msg) + + def _report_properties(self, info): + """Writes debug info about extracted properties""" + if not info: + return + for key in sorted(filter(lambda k: k != 'chapters', info)): + self.to_screen("[debug] %s: '%s'" % (key, info[key])) + + if 'chapters' in info: + for (idx, chapter) in enumerate(info['chapters'], start=1): + self.to_screen( + "[debug] Chapter %d: %s - %s '%s'" % + (idx, self._format_duration(chapter['start_time'], always_minutes=True), + self._format_duration( + chapter['end_time'], always_minutes=True), chapter['title'])) + + @classmethod + def _clean_up_properties(cls, info): + """Deletes internally used properties""" + if not info: + return + info.pop('iso', None) + info.pop('file', None) + + def _extract_properties(self, webpage): + info = dict() + title = self._html_search_regex( + r'[^<]*([^|]+)[^<]*?', + webpage, + 'title', + flags=re.DOTALL) + if title: + info['title'] = title.strip().strip('.') + + description = self._html_search_regex( + r'
<[^>]+>\s*?Kirjeldus\s*?</[^>]+>\s*?<[^>]+>(.*?)</[^>]+>
', + webpage, + 'description', + flags=re.DOTALL, + default=None) + if description: + info['description'] = description.strip() + + rights = self._html_search_regex( + r']+id=(["\'])rights\1[^>]*>(?P.*?)', + webpage, + 'rights', + flags=re.DOTALL, + group='rights', + default=None) + if rights: + # Remove ugly unnecessary whitespace + info['license'] = ' '.join(rights.split()) + + thumbnail = self._search_regex( + r"css\('background-image', 'url\(\"(.+?).jpg\"\)'\)", + webpage, + 'thumbnail', + flags=re.DOTALL, + default=None) + if thumbnail: + info['thumbnail'] = thumbnail + '_th.jpg' + + res = re.findall( + r']*>([^<:\*\.]+)[^<]*?[^<]*?]*>(.*?)', + webpage, re.DOTALL) + year = None + if res: + for (name, value) in res: + name = name.strip() + value = value.strip() + if name == 'Sarja pealkiri': + info['series'] = value + elif name == 'Osa nr': + info['episode_number'] = int(value) + elif name == 'Uudislugu': + info['description'] = value + elif name == 'ISO': + info['iso'] = value + elif name == 'Fail': + info['file'] = value + elif name == 'Märksõnad': + # tags can be: + # * simple like 'huumor'; + # * complex like 'intervjuud/vestlusringid'; + # * weird like 'meedia (raadio, tv, press)'. + # See e.g. 'https://arhiiv.err.ee/vaata/homme-on-esimene-aprill' + tags = re.sub(r'\(|\)|,|/', ' ', clean_html(value)).split() + if tags: + info['tags'] = sorted( + map(lambda s: s.strip().lower(), tags)) + elif name in ['Aasta', 'Võtte aasta']: + year = value + elif name in ['Autorid', 'Režissöör', 'Toimetajad', 'Esinejad']: + if 'creator' not in info: + info['creator'] = set() + info['creator'] = info['creator'] | set( + re.split(r'\s*,\s*', value)) + elif name == 'Fonogrammi tootja': + info['channel'] = value + mobj = re.search(r'([0-9]{4})', value) + if not year and mobj: + year = mobj.group(0) + elif name == 'Kestus': + info['duration'] = parse_duration(value) + elif name == 'Kategooria': + categories = re.split(r'\s*→\s*|\s*,\s*', value) + info['categories'] = sorted( + filter( + lambda s: s != 'Muu', + set(map(lambda s: s.capitalize(), + categories)))) + elif name == 'Registreerimise kuupäev': + try: + info['upload_date'] = datetime.strptime( + value, '%d.%m.%Y').date().strftime('%Y%m%d') + except ValueError as ex: + self._log_debug_message( + "Failed to parse upload_date '%s' %s" % + (value, ex)) + elif name == 'Registreerimise aeg': + info['timestamp'] = unified_timestamp(value) + elif name in ['Esmaeeter', 'Eetris']: + try: + info['release_date'] = datetime.strptime( + value, '%d.%m.%Y').date() + except ValueError as ex: + self._log_debug_message( + "Failed to parse release_date '%s' %s" % + (value, ex)) + if 'release_date' not in info: + try: + info['release_date'] = datetime.strptime( + value, "%B %Y").date() + except ValueError as ex: + self._log_debug_message( + "Failed to parse release_date '%s' %s" % + (value, ex)) + except TypeError as ex: + self._log_debug_message( + "Failed to parse release_date '%s' %s" % + (value, ex)) + if 'release_date' not in info: + # Try for a year yyyy + mobj = re.search(r'([0-9]{4})', value) + if mobj: + info['release_date'] = date(year=int( + mobj.group(0)), day=1, month=1) + if 'release_date' in info: + info['release_date'] = info['release_date'].strftime( + '%Y%m%d') + + if year and 'release_date' not in info: + info['release_date'] = date( + year=int(year), day=1, month=1).strftime('%Y%m%d') + + if 'release_date' in info and not year: + mobj = re.match(r'\d{4}', info['release_date']) + if mobj: + year = mobj.group(0) + + if 'channel' not in info: + 
channel = list() + if year: + channel.append(year) + channel.append('ERR') + info['channel'] = ' '.join(channel) + + info['uploader'] = 'ERR' + info['is_live'] = False + + if 'creator' in info: + info['creator'] = ', '.join(sorted(info['creator'])) + + if 'series' in info: + episode = info['title'] + prefix = info['series'].upper() + if episode.upper().startswith(prefix + ': ' + prefix): + # ERR Arhiiv sometimes mangles episode's title by + # adding series name twice as prefix. This hack + # corrects it. + episode = episode[len(prefix + ': ' + prefix):] + elif episode.upper().startswith(prefix): + episode = episode[len(prefix):] + + if episode.startswith(': '): + episode = episode[len(': '):] + elif episode.startswith('. '): + episode = episode[len('. '):] + + info['episode'] = episode.strip() + if not episode: + self.report_warning("Episode name reduced to 'none'") + + if 'episode' in info: + info['title'] = info['series'] + ' - ' + info['episode'] + + if 'series' in info and year: + info['season'] = year + + return info + + @classmethod + def _format_duration(cls, duration, always_minutes=False, always_hours=False): + '''Formats duration as HH:MM:SS''' + if duration is None: + return None + minutes, seconds = divmod(duration, 60) + hours, minutes = divmod(minutes, 60) + rval = '%02d' % seconds + if always_hours or always_minutes or minutes or hours: + rval = '%02d:' % minutes + rval + if always_hours or hours: + rval = '%02d:' % hours + rval + + return rval + + def _extract_chapters(self, webpage, total_duration): + res = re.findall( + r'[^<]*]+class=(["\'])data\1[^>]*>([^<]+)' + r'[^<]*]+class=(["\'])data\3[^>]*>.*?' + r'[^<]*]+class=(["\'])data\4[^>]*>(.*?)[^<]*', + webpage, re.DOTALL) + chapters = list() + prev_chapter = dict() + correction = 0 + for match in res: + chapter = dict() + duration = parse_duration(match[1]) + if not prev_chapter: + # ERR Arhiiv sometimes adds some arbitrary amount of seconds to + # all timings. This hack corrects it by subtracting the first + # chapter's start_time from all subsequent timings. + correction = duration + duration -= correction + chapter['start_time'] = duration + chapter['title'] = match[4].strip() + if prev_chapter: + prev_chapter['end_time'] = duration + chapters.append(chapter) + prev_chapter = chapter + prev_chapter['end_time'] = total_duration + + return chapters + + def _real_extract(self, url): + info = dict() + + video_id = self._match_id(url) + self._log_debug_message("Extracted id '%s'" % video_id) + info['display_id'] = video_id + info['id'] = video_id + info['webpage_url'] = url + + webpage = self._download_webpage(url, video_id) + + master_url = self._search_regex(r"var\s+src\s*=\s*'(//.+?\.m3u8)';", + webpage, + 'master_url', + flags=re.DOTALL, + fatal=False) + + self._log_debug_message("Extracted master_url '%s'" % master_url) + + info.update(self._extract_properties(webpage)) + + if not master_url: + error_msg = 'Cannot extract master_url. Video or audio %s is not available' % video_id + if 'iso' in info or re.match(r'.*(?:TIFF|JPEG).*', info.get('file', '')): + error_msg += ", url referres to a photo." 
+ raise ExtractorError(error_msg, expected=True) + + m3u8_formats = [] + try: + m3u8_formats = self._extract_m3u8_formats(master_url, video_id) + except ExtractorError as ex: + if isinstance(ex.cause, compat_HTTPError) and ex.cause.code == 404: + mobj = re.search(r'\.urlset/master.m3u8', master_url) + if mobj: + self.report_warning( + "master_url links to nonexistent resource '%s'" % + master_url) + raise ex + + for m3u8_format in m3u8_formats: + if not m3u8_format['ext']: + mobj = re.search(r'\.(\w{3})/', m3u8_format['url']) + if mobj: + m3u8_format['ext'] = mobj.group(1) + else: + m3u8_format['ext'] = 'mp4' if m3u8_format[ + 'vcodec'] != 'none' else 'm4a' + if m3u8_formats: + self._sort_formats(m3u8_formats) + info['formats'] = m3u8_formats + + if 'duration' in info: + chapters = self._extract_chapters(webpage, info['duration']) + if chapters: + info['chapters'] = chapters + + if self._downloader.params.get('verbose', False): + self._report_properties(info) + + self._clean_up_properties(info) + + return info + + +class ERRArhiivPlaylistIE(ERRArhiivBaseIE): + IE_DESC = 'arhiiv.err.ee playlists and search results' + _ERRARHIIV_SERVICES = 'seeria|samast-seeriast|sarnased|otsi|tapsem-otsing|show-category-single-files' + _VALID_URL = r'(?Phttps?://arhiiv\.err\.ee)/(?P%(services)s)[/?#]*(?P[^/?#]*)' % { + 'services': _ERRARHIIV_SERVICES + } + _TESTS = [{ + 'url': 'https://arhiiv.err.ee/seeria/linnuaabits/info/0/default/koik', + 'info_dict': { + 'id': 'linnuaabits', + 'title': "Linnuaabits", + }, + 'playlist_mincount': 71, + }, { + 'url': 'https://arhiiv.err.ee/seeria/linnulaul', + 'info_dict': { + 'id': 'linnulaul', + 'title': "Linnulaul", + }, + 'playlist_mincount': 10, + }, { + 'url': + 'https://arhiiv.err.ee/seeria/eesti-aja-lood-okupatsioonid/info/0/default/koik', + 'info_dict': { + 'id': 'eesti-aja-lood-okupatsioonid', + 'title': "Eesti aja lood. 
Okupatsioonid", + }, + 'playlist_mincount': 46, + }, { + 'url': + 'https://arhiiv.err.ee/samast-seeriast/ak-filmikroonika-1958-1991-linnuturg-keskturul/default/1', + 'info_dict': { + 'id': 'ak-filmikroonika-1958-1991', + 'title': "AK filmikroonika 1958-1991", + }, + 'playlist_count': 10, + }, { + 'url': + 'https://arhiiv.err.ee/sarnased/ensv-ensv-kaadri-taga/default/1', + 'info_dict': { + 'id': 'ensv', + 'title': "EnsV - Sarnased saated", + }, + 'playlist_count': 10, + }, { + 'url': 'https://arhiiv.err.ee/otsi/reliikvia/default/koik', + 'info_dict': { + 'id': None, + 'title': "Otsingutulemused `reliikvia`", + }, + 'playlist_mincount': 161, + }, { + 'url': 'https://arhiiv.err.ee/otsi/reliikvia/default/3', + 'info_dict': { + 'id': None, + 'title': "Otsingutulemused `reliikvia`", + }, + 'playlist_mincount': 10, + }, { + 'url': + 'https://arhiiv.err.ee/tapsem-otsing?searchphrase=kahur&searchfrom_video=video&searchfrom_audio=audio', + 'info_dict': { + 'id': None, + 'title': "Otsingutulemused", + }, + 'playlist_mincount': 10, + }] + + def _guess_id_from_title(self, title): + if not title: + return None + playlist_id = title.lower() + playlist_id = ' '.join(playlist_id.split()) + playlist_id = playlist_id.replace('õ', 'o') + playlist_id = playlist_id.replace('ö', 'o') + playlist_id = playlist_id.replace('ä', 'a') + playlist_id = playlist_id.replace('ü', 'u') + playlist_id = playlist_id.replace(' ', '-') + playlist_id = re.sub(r",|\.|:|\+|\?|!|'|\"|;|\*|\\|/|\|", "", + playlist_id) + return playlist_id + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + service = mobj.group('service') + playlist_id = mobj.group('id') if service in [ + 'seeria', 'show-category-single-files' + ] else None + prefix = mobj.group('prefix') + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_regex( + r'[^<]*([^|]+)[^<]*?', + webpage, + 'title', + flags=re.DOTALL, + fatal=False) + if title: + title = title.strip().strip('.') + + if title and not playlist_id and service not in [ + 'otsi', 'tapsem-otsing', 'show-category-single-files' + ]: + playlist_id = self._guess_id_from_title(title) + + if title and service == 'sarnased': + title += ' - Sarnased saated' + + res = re.findall( + r']*>[^<]*]*>', + webpage, re.DOTALL) + + url_list = orderedSet([prefix + match[1] for match in res]) + + entries = [self.url_result(item_url, ie='ERRArhiiv') for item_url in url_list] + + return self.playlist_result(entries, playlist_id, title) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 402e542ae..ed5ae4583 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -345,6 +345,10 @@ from .embedly import EmbedlyIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE +from .errarhiiv import ( + ERRArhiivIE, + ERRArhiivPlaylistIE, +) from .escapist import EscapistIE from .espn import ( ESPNIE,