[ie/icareus] Add IcareusNext extractor, for new-style Icareus sites

Aarni Koskela, 2025-01-14 15:39:52 +02:00
parent a3c0321825
commit 22ed863fad
2 changed files with 174 additions and 1 deletion

yt_dlp/extractor/_extractors.py

@@ -839,7 +839,10 @@ from .huya import (
 from .hypem import HypemIE
 from .hypergryph import MonsterSirenHypergryphMusicIE
 from .hytale import HytaleIE
-from .icareus import IcareusIE
+from .icareus import (
+    IcareusIE,
+    IcareusNextIE,
+)
 from .ichinanalive import (
     IchinanaLiveClipIE,
     IchinanaLiveIE,

yt_dlp/extractor/icareus.py
@@ -1,11 +1,15 @@
+import json
 import re
 
 from .common import InfoExtractor
+from .. import traverse_obj
 from ..utils import (
+    ExtractorError,
     clean_html,
     determine_ext,
     get_element_by_class,
     int_or_none,
+    js_to_json,
     merge_dicts,
     parse_bitrate,
     parse_resolution,
@@ -177,3 +181,169 @@ class IcareusIE(InfoExtractor):
             'description': clean_html(info.get('description')),
             'thumbnails': thumbnails if thumbnails[0]['url'] else None,
         }, info)
+
+
+class IcareusNextIE(InfoExtractor):
+    _DOMAINS = '|'.join(
+        re.escape(domain)
+        for domain in (
+            'players.icareus.com',
+            'helsinkikanava.fi',
+        )
+    )
+    _VALID_URL = (
+        rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/(?P<language>.+?)/(video|event)/details/(?P<id>\d+)',
+        r'https?://players.icareus.com/(?P<brand>.+?)/embed/vod/(?P<id>\d+)',
+    )
+    _TESTS = [
+        {  # Regular VOD
+            'url': 'https://www.helsinkikanava.fi/fi/video/details/68021894',
+            'md5': '3e048a91cd6be16d34b98a1548ceed27',
+            'info_dict': {
+                'id': '68021894',
+                'ext': 'mp4',
+                'title': 'Perheiden parhaaksi',
+                'description': 'md5:fe4e4ec742a34f53022f3a0409b0f6e7',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=68021900',
+            },
+        },
+        {  # Recorded livestream
+            'url': 'https://www.helsinkikanava.fi/fi/event/details/76241489',
+            'md5': 'a063a7ef36969ced44af9fe3d10a7f47',
+            'info_dict': {
+                'id': '76241489',
+                'ext': 'mp4',
+                'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020',
+                'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=76288630',
+            },
+        },
+        {  # Embedded player
+            'url': 'https://players.icareus.com/elonet/embed/vod/256250758',
+            'md5': '420616d561582b9491f0a622b1a3d831',
+            'info_dict': {
+                'id': '256250758',
+                'ext': 'mp4',
+                'title': 'Shell Hurriganes',
+                'description': 'Shell Hurriganes',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=266941624',
+            },
+        },
+    ]
+
+    def _is_playback_data_dict(self, element, display_id):
+        if isinstance(element, dict):
+            if 'src' in element and 'videoInfo' in element and str_or_none(element.get('id')) == str(display_id):
+                return True
+        return False
+
+    def _find_playback_data(self, webpage: str, display_id: str):
+        # Adapted from Goplay
+        nextjs_data = traverse_obj(
+            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage),
+            (
+                ...,
+                {js_to_json},
+                {json.loads},
+                ...,
+                {
+                    lambda s: self._search_json(
+                        r'\w+\s*:\s*',
+                        s,
+                        'next js data',
+                        None,
+                        contains_pattern=r'\[(?s:.+)\]',
+                        default=None,
+                    ),
+                },
+                ...,
+            ),
+        )
+        for element in nextjs_data:
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        # If the playback data is not found in the first pass, try to find it in the children of the RSC data
+        for element in traverse_obj(nextjs_data, (..., 'children', ...)):
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        return None
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        playback_data = self._find_playback_data(webpage, display_id)
+        if playback_data is None:
+            raise ExtractorError('No playback data found', expected=True, video_id=display_id)
+
+        video_id = str(playback_data['id'])
+        video_info = playback_data['videoInfo']
+
+        subtitles = {}
+        for sub_info in video_info.get('subtitles') or []:
+            _, sdesc, surl = sub_info[:3]
+            sub_name = remove_end(sdesc.split(' ')[0], ':')
+            subtitles[sub_name] = [{'url': url_or_none(surl)}]
+
+        formats = []
+        for audio_url_datum in video_info.get('audio_urls') or []:
+            audio_url = audio_url_datum.get('url')
+            if audio_url is None:
+                continue
+            formats.append(
+                {
+                    'format': audio_url_datum.get('name'),
+                    'format_id': 'audio',
+                    'vcodec': 'none',
+                    'url': audio_url,
+                    'tbr': None,
+                },
+            )
+
+        for url_datum in video_info.get('urls') or []:
+            video_url = url_or_none(url_datum.get('url'))
+            if video_url is None:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url,
+                    video_id,
+                    'mp4',
+                    m3u8_id='hls',
+                    fatal=False,
+                )
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                pass  # TODO: unsupported for now, no examples of this
+
+        # This is weird, but it's the more robust way to find the video file URL for now
+        if m := re.search(r'\{\\"videoFileUrl\\":\\"(http.+?)\\"', webpage):
+            try:
+                if video_file_url := url_or_none(json.loads(f'"{m.group(1)}"')):
+                    formats.append(
+                        {
+                            'url': video_file_url,
+                            'format_id': 'download',
+                        },
+                    )
+            except json.JSONDecodeError:
+                pass
+
+        thumbnails = []
+        if thumbnail := url_or_none(video_info.get('thumbnail')):
+            thumbnails.append({'url': thumbnail})
+
+        description = clean_html(self._html_search_meta(['description'], webpage))
+        title = clean_html(self._html_search_meta(['og:title'], webpage))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': description,
+            'thumbnails': thumbnails or None,
+        }
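
For illustration only, not part of the commit: the least obvious piece of the new extractor is _find_playback_data(), which digs the player props out of Next.js flight data, i.e. the self.__next_f.push(...) script tags. The sketch below runs the first stage of that pipeline on a made-up payload; only the field names id, src and videoInfo mirror what the extractor checks for, everything else is invented for the example.

import re

# Hypothetical flight chunk; real pages embed the player props somewhere inside such pushed strings
webpage = ('<script>self.__next_f.push([1, "5:[\\"$\\",\\"$L6\\",null,'
           '{\\"id\\":68021894,\\"src\\":\\"https://example.invalid/hls.m3u8\\",\\"videoInfo\\":{}}]\\n"])</script>')

# Stage 1 of IcareusNextIE._find_playback_data: capture the argument of each push() call
chunks = re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage)
print(chunks[0])  # the JS array literal whose string member still embeds escaped JSON

# The remaining stages (js_to_json -> json.loads -> _search_json) unwrap that string and
# eventually yield the dict carrying 'id', 'src' and 'videoInfo', which is exactly what
# _is_playback_data_dict() accepts as the playback data.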
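
And a minimal sketch of exercising the new extractor through yt-dlp's embedding API, assuming a build that contains this commit; the URL is the first entry of the _TESTS table above.

import yt_dlp

# Metadata-only run: download=False keeps it to extraction, no media is fetched
with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info('https://www.helsinkikanava.fi/fi/video/details/68021894', download=False)
    print(info['id'], info['title'])
    print([f.get('format_id') for f in info.get('formats', [])])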