Mirror of https://github.com/yt-dlp/yt-dlp.git, synced 2025-03-25 14:45:03 +01:00
[ie/icareus] Add IcareusNext extractor, for new-style Icareus sites

commit 22ed863fad
parent a3c0321825
yt_dlp/extractor/_extractors.py

@@ -839,7 +839,10 @@ from .huya import (
 from .hypem import HypemIE
 from .hypergryph import MonsterSirenHypergryphMusicIE
 from .hytale import HytaleIE
-from .icareus import IcareusIE
+from .icareus import (
+    IcareusIE,
+    IcareusNextIE,
+)
 from .ichinanalive import (
     IchinanaLiveClipIE,
     IchinanaLiveIE,
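
As a quick sanity check that the registration above works, here is a minimal smoke-test sketch (not part of the commit; it assumes a yt-dlp source checkout on the import path, and that suitable() is the stock InfoExtractor classmethod that tests a URL against _VALID_URL):

    # Illustrative smoke test, not part of the diff
    from yt_dlp.extractor.icareus import IcareusNextIE

    # Both new-style URL shapes should match the tuple of _VALID_URL patterns
    print(IcareusNextIE.suitable('https://www.helsinkikanava.fi/fi/video/details/68021894'))  # True
    print(IcareusNextIE.suitable('https://players.icareus.com/elonet/embed/vod/256250758'))   # True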
yt_dlp/extractor/icareus.py

@@ -1,11 +1,15 @@
+import json
 import re
 
 from .common import InfoExtractor
+from .. import traverse_obj
 from ..utils import (
+    ExtractorError,
     clean_html,
     determine_ext,
     get_element_by_class,
     int_or_none,
+    js_to_json,
     merge_dicts,
     parse_bitrate,
     parse_resolution,
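
The newly imported js_to_json matters because the Next.js payloads pushed into the page are JavaScript literals rather than strict JSON. A small illustration of what it normalizes (the input literal is made up):

    # Illustrative only: js_to_json rewrites JS-style literals (single quotes,
    # bare keys, trailing commas) into strict JSON that json.loads accepts
    import json
    from yt_dlp.utils import js_to_json

    print(json.loads(js_to_json("{title: 'Shell Hurriganes', id: 256250758,}")))
    # -> {'title': 'Shell Hurriganes', 'id': 256250758}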
@@ -177,3 +181,169 @@ class IcareusIE(InfoExtractor):
             'description': clean_html(info.get('description')),
             'thumbnails': thumbnails if thumbnails[0]['url'] else None,
         }, info)
+
+
+class IcareusNextIE(InfoExtractor):
+    _DOMAINS = '|'.join(
+        re.escape(domain)
+        for domain in (
+            'players.icareus.com',
+            'helsinkikanava.fi',
+        )
+    )
+    _VALID_URL = (
+        rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/(?P<language>.+?)/(video|event)/details/(?P<id>\d+)',
+        r'https?://players.icareus.com/(?P<brand>.+?)/embed/vod/(?P<id>\d+)',
+    )
+    _TESTS = [
+        {  # Regular VOD
+            'url': 'https://www.helsinkikanava.fi/fi/video/details/68021894',
+            'md5': '3e048a91cd6be16d34b98a1548ceed27',
+            'info_dict': {
+                'id': '68021894',
+                'ext': 'mp4',
+                'title': 'Perheiden parhaaksi',
+                'description': 'md5:fe4e4ec742a34f53022f3a0409b0f6e7',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=68021900',
+            },
+        },
+        {  # Recorded livestream
+            'url': 'https://www.helsinkikanava.fi/fi/event/details/76241489',
+            'md5': 'a063a7ef36969ced44af9fe3d10a7f47',
+            'info_dict': {
+                'id': '76241489',
+                'ext': 'mp4',
+                'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020',
+                'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=76288630',
+            },
+        },
+        {  # Embedded player
+            'url': 'https://players.icareus.com/elonet/embed/vod/256250758',
+            'md5': '420616d561582b9491f0a622b1a3d831',
+            'info_dict': {
+                'id': '256250758',
+                'ext': 'mp4',
+                'title': 'Shell Hurriganes',
+                'description': 'Shell Hurriganes',
+                'thumbnail': 'https://dvcf59enpgt5y.cloudfront.net/image/image_gallery?img_id=266941624',
+            },
+        },
+    ]
+
+    def _is_playback_data_dict(self, element, display_id):
+        if isinstance(element, dict):
+            if 'src' in element and 'videoInfo' in element and str_or_none(element.get('id')) == str(display_id):
+                return True
+        return False
+
+    def _find_playback_data(self, webpage: str, display_id: str):
+        # Adapted from Goplay
+        nextjs_data = traverse_obj(
+            re.findall(r'<script[^>]*>\s*self\.__next_f\.push\(\s*(\[.+?])\s*\);?\s*</script>', webpage),
+            (
+                ...,
+                {js_to_json},
+                {json.loads},
+                ...,
+                {
+                    lambda s: self._search_json(
+                        r'\w+\s*:\s*',
+                        s,
+                        'next js data',
+                        None,
+                        contains_pattern=r'\[(?s:.+)\]',
+                        default=None,
+                    ),
+                },
+                ...,
+            ),
+        )
+
+        for element in nextjs_data:
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        # If the playback data is not found in the first pass, try to find it in the children of the RSC data
+        for element in traverse_obj(nextjs_data, (..., 'children', ...)):
+            if self._is_playback_data_dict(element, display_id):
+                return element
+
+        return None
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        playback_data = self._find_playback_data(webpage, display_id)
+        if playback_data is None:
+            raise ExtractorError('No playback data found', expected=True, video_id=display_id)
+        video_id = str(playback_data['id'])
+        video_info = playback_data['videoInfo']
+
+        subtitles = {}
+        for sub_info in video_info.get('subtitles') or []:
+            _, sdesc, surl = sub_info[:3]
+            sub_name = remove_end(sdesc.split(' ')[0], ':')
+            subtitles[sub_name] = [{'url': url_or_none(surl)}]
+
+        formats = []
+        for audio_url_datum in video_info.get('audio_urls') or []:
+            audio_url = audio_url_datum.get('url')
+            if audio_url is None:
+                continue
+            formats.append(
+                {
+                    'format': audio_url_datum.get('name'),
+                    'format_id': 'audio',
+                    'vcodec': 'none',
+                    'url': audio_url,
+                    'tbr': None,
+                },
+            )
+
+        for url_datum in video_info.get('urls') or []:
+            video_url = url_or_none(url_datum.get('url'))
+            if video_url is None:
+                continue
+            ext = determine_ext(video_url)
+            if ext == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    video_url,
+                    video_id,
+                    'mp4',
+                    m3u8_id='hls',
+                    fatal=False,
+                )
+                formats.extend(fmts)
+                self._merge_subtitles(subs, target=subtitles)
+            else:
+                pass  # TODO: unsupported for now, no examples of this
+
+        # This is weird, but it's the more robust way to find the video file URL for now
+        if m := re.search(r'\{\\"videoFileUrl\\":\\"(http.+?)\\"', webpage):
+            try:
+                if video_file_url := url_or_none(json.loads(f'"{m.group(1)}"')):
+                    formats.append(
+                        {
+                            'url': video_file_url,
+                            'format_id': 'download',
+                        },
+                    )
+            except json.JSONDecodeError:
+                pass
+
+        thumbnails = []
+        if thumbnail := url_or_none(video_info.get('thumbnail')):
+            thumbnails.append({'url': thumbnail})
+
+        description = clean_html(self._html_search_meta(['description'], webpage))
+        title = clean_html(self._html_search_meta(['og:title'], webpage))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'description': description,
+            'thumbnails': thumbnails or None,
+        }
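
For orientation, a standalone sketch of the two page-parsing tricks the new class leans on: scanning the self.__next_f.push(...) chunks that Next.js streams into the page, and JSON-unescaping the double-escaped file URL found inside one of those string payloads. The page fragment, ids, and key names below are made-up stand-ins, not real Icareus data:

    import json
    import re

    # Made-up fragment shaped like Next.js App Router output: each streamed
    # RSC chunk is appended to the page via self.__next_f.push([1, "<id>:<json>"])
    webpage = (
        '<script>self.__next_f.push([1,'
        '"3:[\\"$\\",\\"Player\\",null,'
        '{\\"id\\":68021894,\\"src\\":\\"blob:x\\",\\"videoInfo\\":{\\"urls\\":[]}}]"'
        '])</script>'
    )

    for chunk in re.findall(r'self\.__next_f\.push\(\s*(\[.+?])\s*\)', webpage):
        pushed = json.loads(chunk)  # real pages may need js_to_json first
        for part in pushed:
            if not isinstance(part, str) or ':' not in part:
                continue
            row = json.loads(part.partition(':')[2])  # drop the "3:" row id
            props = row[3]  # RSC element rows look like ["$", type, key, props]
            if isinstance(props, dict) and 'src' in props and 'videoInfo' in props:
                print('found playback data for id', props['id'])

    # The direct file URL appears double-escaped (\" and \/) inside such a
    # payload; quoting it and round-tripping through json.loads undoes the
    # escaping, which is what the videoFileUrl branch above relies on
    escaped = r'https:\/\/example.com\/video.mp4'
    print(json.loads(f'"{escaped}"'))  # https://example.com/video.mp4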