+ _VALID_URL = r'https?://learningonscreen\.ac\.uk/ondemand/index\.php/prog/(?P<id>\w+)'
+ _TESTS = [{
+ 'url': 'https://learningonscreen.ac.uk/ondemand/index.php/prog/005D81B2?bcast=22757013',
+ 'info_dict': {
+ 'id': '005D81B2',
+ 'ext': 'mp4',
+ 'title': 'Planet Earth',
+ 'duration': 3600.0,
+ 'timestamp': 1164567600.0,
+ 'upload_date': '20061126',
+ 'thumbnail': 'https://stream.learningonscreen.ac.uk/trilt-cover-images/005D81B2-Planet-Earth-2006-11-26T190000Z-BBC4.jpg',
+ },
+ }]
+
+ def _real_initialize(self):
+ if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'):
+ self.raise_login_required(
+ 'Use --cookies for authentication. See '
+ ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp '
+ 'for how to manually pass cookies', method=None)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ details = traverse_obj(webpage, (
+ {functools.partial(get_element_html_by_id, 'programme-details')}, {
+ 'title': ({functools.partial(re.search, r'<h2>([^<]+)</h2>')}, 1, {clean_html}),
+ 'timestamp': (
+ {functools.partial(get_element_by_class, 'broadcast-date')},
+ {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}),
+ 'duration': (
+ {functools.partial(get_element_by_class, 'prog-running-time')},
+ {clean_html}, {parse_duration}),
+ }))
+
+ title = details.pop('title', None) or traverse_obj(webpage, (
+ {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')},
+ {extract_attributes}, 'data-record-title', {clean_html}))
+
+ entries = self._parse_html5_media_entries(
+ 'https://stream.learningonscreen.ac.uk', webpage, video_id, m3u8_id='hls', mpd_id='dash',
+ _headers={'Origin': 'https://learningonscreen.ac.uk', 'Referer': 'https://learningonscreen.ac.uk/'})
+ if not entries:
+ raise ExtractorError('No video found')
+
+ if len(entries) > 1:
+ duration = details.pop('duration', None)
+ for idx, entry in enumerate(entries, start=1):
+ entry.update(details)
+ entry['id'] = join_nonempty(video_id, idx)
+ entry['title'] = join_nonempty(title, idx)
+ return self.playlist_result(entries, video_id, title, duration=duration)
+
+ return {
+ **entries[0],
+ **details,
+ 'id': video_id,
+ 'title': title,
+ }
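
# A standalone sketch of the declarative traverse_obj pattern used in
# _real_extract above: each {func} step transforms the current value, so the
# HTML helpers and parsers chain without intermediate variables. The sample
# HTML below is invented for illustration.
import functools

from yt_dlp.utils import clean_html, get_element_by_class, parse_duration
from yt_dlp.utils.traversal import traverse_obj

sample_html = '<span class="prog-running-time">1 hour</span>'
duration = traverse_obj(sample_html, (
    {functools.partial(get_element_by_class, 'prog-running-time')},
    {clean_html}, {parse_duration}))
assert duration == 3600.0
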
diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py
index bd1a27fcc..f51342060 100644
--- a/yt_dlp/extractor/mediaklikk.py
+++ b/yt_dlp/extractor/mediaklikk.py
@@ -133,7 +133,9 @@ def _real_extract(self, url):
r'<[^>]+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None))
player_data['video'] = player_data.pop('token')
- player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data)
+ player_page = self._download_webpage(
+ 'https://player.mediaklikk.hu/playernew/player.php', video_id,
+ query=player_data, headers={'Referer': url})
player_json = self._search_json(
r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);')
playlist_url = traverse_obj(
diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py
index 6f67602a6..935bf8561 100644
--- a/yt_dlp/extractor/mlb.py
+++ b/yt_dlp/extractor/mlb.py
@@ -1,16 +1,21 @@
+import json
import re
-import urllib.parse
+import time
import uuid
from .common import InfoExtractor
+from ..networking.exceptions import HTTPError
from ..utils import (
+ ExtractorError,
determine_ext,
int_or_none,
join_nonempty,
+ jwt_decode_hs256,
parse_duration,
parse_iso8601,
try_get,
url_or_none,
+ urlencode_postdata,
)
from ..utils.traversal import traverse_obj
@@ -276,81 +281,225 @@ def _download_video_data(self, display_id):
class MLBTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})'
_NETRC_MACHINE = 'mlb'
-
_TESTS = [{
'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638',
'info_dict': {
'id': '661581',
'ext': 'mp4',
'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies',
+ 'release_date': '20220702',
+ 'release_timestamp': 1656792300,
},
- 'params': {
- 'skip_download': True,
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # makeup game: has multiple dates, need to avoid games with 'rescheduleDate'
+ 'url': 'https://www.mlb.com/tv/g747039/vd22541c4-5a29-45f7-822b-635ec041cf5e',
+ 'info_dict': {
+ 'id': '747039',
+ 'ext': 'mp4',
+ 'title': '2024-07-29 - Toronto Blue Jays @ Baltimore Orioles',
+ 'release_date': '20240729',
+ 'release_timestamp': 1722280200,
},
+ 'params': {'skip_download': 'm3u8'},
}]
+ _GRAPHQL_INIT_QUERY = '''\
+mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) {
+ initSession(device: $device, clientType: $clientType, experience: $experience) {
+ deviceId
+ sessionId
+ entitlements {
+ code
+ }
+ location {
+ countryCode
+ regionName
+ zipCode
+ latitude
+ longitude
+ }
+ clientExperience
+ features
+ }
+ }'''
+ _GRAPHQL_PLAYBACK_QUERY = '''\
+mutation initPlaybackSession(
+ $adCapabilities: [AdExperienceType]
+ $mediaId: String!
+ $deviceId: String!
+ $sessionId: String!
+ $quality: PlaybackQuality
+ ) {
+ initPlaybackSession(
+ adCapabilities: $adCapabilities
+ mediaId: $mediaId
+ deviceId: $deviceId
+ sessionId: $sessionId
+ quality: $quality
+ ) {
+ playbackSessionId
+ playback {
+ url
+ token
+ expiration
+ cdn
+ }
+ }
+ }'''
+ _APP_VERSION = '7.8.2'
+ _device_id = None
+ _session_id = None
_access_token = None
+ _token_expiry = 0
+
+ @property
+ def _api_headers(self):
+ if (self._token_expiry - 120) <= time.time():
+ self.write_debug('Access token has expired; re-logging in')
+ self._perform_login(*self._get_login_info())
+ return {'Authorization': f'Bearer {self._access_token}'}
def _real_initialize(self):
if not self._access_token:
self.raise_login_required(
'All videos are only available to registered users', method='password')
+ def _set_device_id(self, username):
+ if not self._device_id:
+ self._device_id = self.cache.load(
+ self._NETRC_MACHINE, 'device_ids', default={}).get(username)
+ if self._device_id:
+ return
+ self._device_id = str(uuid.uuid4())
+ self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id})
+
def _perform_login(self, username, password):
- data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356'
- access_token = self._download_json(
- 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
- headers={
- 'User-Agent': 'okhttp/3.12.1',
- 'Content-Type': 'application/x-www-form-urlencoded',
- }, data=data.encode())['access_token']
+ try:
+ self._access_token = self._download_json(
+ 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None,
+ 'Logging in', 'Unable to log in', headers={
+ 'User-Agent': 'okhttp/3.12.1',
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ }, data=urlencode_postdata({
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ 'scope': 'openid offline_access',
+ 'client_id': '0oa3e1nutA1HLzAKG356',
+ }))['access_token']
+ except ExtractorError as error:
+ if isinstance(error.cause, HTTPError) and error.cause.status == 400:
+ raise ExtractorError('Invalid username or password', expected=True)
+ raise
- entitlement = self._download_webpage(
- f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None,
- headers={
- 'User-Agent': 'okhttp/3.12.1',
- 'Authorization': f'Bearer {access_token}',
- })
+ self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0
+ self._set_device_id(username)
- data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv'
- self._access_token = self._download_json(
- 'https://us.edge.bamgrid.com/token', None,
+ self._session_id = self._call_api({
+ 'operationName': 'initSession',
+ 'query': self._GRAPHQL_INIT_QUERY,
+ 'variables': {
+ 'device': {
+ 'appVersion': self._APP_VERSION,
+ 'deviceFamily': 'desktop',
+ 'knownDeviceId': self._device_id,
+ 'languagePreference': 'ENGLISH',
+ 'manufacturer': '',
+ 'model': '',
+ 'os': '',
+ 'osVersion': '',
+ },
+ 'clientType': 'WEB',
+ },
+ }, None, 'session ID')['data']['initSession']['sessionId']
+
+ def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True):
+ return self._download_json(
+ 'https://media-gateway.mlb.com/graphql', video_id,
+ f'Downloading {description}', f'Unable to download {description}', fatal=fatal,
headers={
+ **self._api_headers,
'Accept': 'application/json',
- 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk',
- 'Content-Type': 'application/x-www-form-urlencoded',
- }, data=data.encode())['access_token']
+ 'Content-Type': 'application/json',
+ 'x-client-name': 'WEB',
+ 'x-client-version': self._APP_VERSION,
+ }, data=json.dumps(data, separators=(',', ':')).encode())
+
+ def _extract_formats_and_subtitles(self, broadcast, video_id):
+ feed = traverse_obj(broadcast, ('homeAway', {str.title}))
+ medium = traverse_obj(broadcast, ('type', {str}))
+ language = traverse_obj(broadcast, ('language', {str.lower}))
+ format_id = join_nonempty(feed, medium, language)
+
+ response = self._call_api({
+ 'operationName': 'initPlaybackSession',
+ 'query': self._GRAPHQL_PLAYBACK_QUERY,
+ 'variables': {
+ 'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'],
+ 'deviceId': self._device_id,
+ 'mediaId': broadcast['mediaId'],
+ 'quality': 'PLACEHOLDER',
+ 'sessionId': self._session_id,
+ },
+ }, video_id, f'{format_id} broadcast JSON', fatal=False)
+
+ playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict}))
+ m3u8_url = traverse_obj(playback, ('url', {url_or_none}))
+ token = traverse_obj(playback, ('token', {str}))
+
+ if not (m3u8_url and token):
+ errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str})))
+ if 'not entitled' in errors:
+ raise ExtractorError(errors, expected=True)
+ elif errors: # Only warn when 'blacked out' since radio formats are available
+ self.report_warning(f'API returned errors for {format_id}: {errors}')
+ else:
+ self.report_warning(f'No formats available for {format_id} broadcast; skipping')
+ return [], {}
+
+ cdn_headers = {'x-cdn-token': token}
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4',
+ m3u8_id=format_id, fatal=False, headers=cdn_headers)
+ for fmt in fmts:
+ fmt['http_headers'] = cdn_headers
+ fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' '))
+ fmt.setdefault('language', language)
+ if fmt.get('vcodec') == 'none' and fmt['language'] == 'en':
+ fmt['source_preference'] = 10
+
+ return fmts, subs
def _real_extract(self, url):
video_id = self._match_id(url)
- airings = self._download_json(
- f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D',
- video_id)['data']['Airings']
+ data = self._download_json(
+ 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={
+ 'gamePk': video_id,
+ 'hydrate': 'broadcasts(all),statusFlags',
+ })
+ metadata = traverse_obj(data, (
+ 'dates', ..., 'games',
+ lambda _, v: str(v['gamePk']) == video_id and not v.get('rescheduleDate'), any))
+
+ broadcasts = traverse_obj(metadata, (
+ 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF'))
formats, subtitles = [], {}
- for airing in traverse_obj(airings, lambda _, v: v['playbackUrls'][0]['href']):
- format_id = join_nonempty('feedType', 'feedLanguage', from_dict=airing)
- m3u8_url = traverse_obj(self._download_json(
- airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id,
- note=f'Downloading {format_id} stream info JSON',
- errnote=f'Failed to download {format_id} stream info, skipping',
- fatal=False, headers={
- 'Authorization': self._access_token,
- 'Accept': 'application/vnd.media-service+json; version=2',
- }), ('stream', 'complete', {url_or_none}))
- if not m3u8_url:
- continue
- f, s = self._extract_m3u8_formats_and_subtitles(
- m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)
- formats.extend(f)
- self._merge_subtitles(s, target=subtitles)
+ for broadcast in broadcasts:
+ fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
return {
'id': video_id,
- 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False),
- 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE',
+ 'title': join_nonempty(
+ traverse_obj(metadata, ('officialDate', {str})),
+ traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})),
+ delim=' - '),
+ 'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON',
+ 'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})),
'formats': formats,
'subtitles': subtitles,
- 'http_headers': {'Authorization': f'Bearer {self._access_token}'},
}
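
# A self-contained sketch of the token-expiry guard in _api_headers above:
# jwt_decode_hs256 only base64-decodes the JWT payload (it does not verify the
# signature), so the 'exp' claim can be read directly. The token built here is
# a dummy value, not a real MLB credential.
import base64
import json
import time

from yt_dlp.utils import jwt_decode_hs256

def _b64url(obj):
    return base64.urlsafe_b64encode(json.dumps(obj).encode()).rstrip(b'=').decode()

dummy_token = '.'.join((
    _b64url({'alg': 'HS256', 'typ': 'JWT'}),
    _b64url({'exp': int(time.time()) + 3600}),
    'dummy-signature'))
token_expiry = jwt_decode_hs256(dummy_token)['exp']
# Mirrors the `(self._token_expiry - 120) <= time.time()` check above
assert not (token_expiry - 120) <= time.time()
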
diff --git a/yt_dlp/extractor/murrtube.py b/yt_dlp/extractor/murrtube.py
index 3b39a1b9a..9067b8781 100644
--- a/yt_dlp/extractor/murrtube.py
+++ b/yt_dlp/extractor/murrtube.py
@@ -5,39 +5,103 @@
from ..utils import (
ExtractorError,
OnDemandPagedList,
- determine_ext,
- int_or_none,
- try_get,
+ clean_html,
+ extract_attributes,
+ get_element_by_class,
+ get_element_html_by_id,
+ parse_count,
+ remove_end,
+ update_url,
+ urlencode_postdata,
)
class MurrtubeIE(InfoExtractor):
- _WORKING = False
_VALID_URL = r'''(?x)
(?:
murrtube:|
- https?://murrtube\.net/videos/(?P<slug>[a-z0-9\-]+)\-
+ https?://murrtube\.net/(?:v/|videos/(?P<slug>[a-z0-9-]+?)-)
)
- (?P<id>[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12})
+ (?P<id>[A-Z0-9]{4}|[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})
'''
- _TEST = {
+ _TESTS = [{
'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
- 'md5': '169f494812d9a90914b42978e73aa690',
+ 'md5': '70380878a77e8565d4aea7f68b8bbb35',
'info_dict': {
- 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0',
+ 'id': 'ca885d8456b95de529b6723b158032e11115d',
'ext': 'mp4',
'title': 'Inferno X Skyler',
'description': 'Humping a very good slutty sheppy (roomate)',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 284,
'uploader': 'Inferno Wolf',
'age_limit': 18,
+ 'thumbnail': 'https://storage.murrtube.net/murrtube-production/ekbs3zcfvuynnqfx72nn2tkokvsd',
'comment_count': int,
'view_count': int,
'like_count': int,
- 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'],
},
- }
+ }, {
+ 'url': 'https://murrtube.net/v/0J2Q',
+ 'md5': '31262f6ac56f0ca75e5a54a0f3fefcb6',
+ 'info_dict': {
+ 'id': '8442998c52134968d9caa36e473e1a6bac6ca',
+ 'ext': 'mp4',
+ 'uploader': 'Hayel',
+ 'title': 'Who\'s in charge now?',
+ 'description': 'md5:795791e97e5b0f1805ea84573f02a997',
+ 'age_limit': 18,
+ 'thumbnail': 'https://storage.murrtube.net/murrtube-production/fb1ojjwiucufp34ya6hxu5vfqi5s',
+ 'comment_count': int,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ }]
+
+ def _extract_count(self, name, html):
+ return parse_count(self._search_regex(
+ rf'([\d,]+)\s+<span[^>]*>{name}</span>', html, name, default=None))
+
+ def _real_initialize(self):
+ homepage = self._download_webpage(
+ 'https://murrtube.net', None, note='Getting session token')
+ self._request_webpage(
+ 'https://murrtube.net/accept_age_check', None, 'Setting age cookie',
+ data=urlencode_postdata(self._hidden_inputs(homepage)))
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ if video_id.startswith('murrtube:'):
+ raise ExtractorError('Support for murrtube: prefix URLs is broken')
+ video_page = self._download_webpage(url, video_id)
+ video_attrs = extract_attributes(get_element_html_by_id('video', video_page))
+ playlist = update_url(video_attrs['data-url'], query=None)
+ video_id = self._search_regex(r'/([\da-f]+)/index.m3u8', playlist, 'video id')
+
+ return {
+ 'id': video_id,
+ 'title': remove_end(self._og_search_title(video_page), ' - Murrtube'),
+ 'age_limit': 18,
+ 'formats': self._extract_m3u8_formats(playlist, video_id, 'mp4'),
+ 'description': self._og_search_description(video_page),
+ 'thumbnail': update_url(self._og_search_thumbnail(video_page, default=''), query=None) or None,
+ 'uploader': clean_html(get_element_by_class('pl-1 is-size-6 has-text-lighter', video_page)),
+ 'view_count': self._extract_count('Views', video_page),
+ 'like_count': self._extract_count('Likes', video_page),
+ 'comment_count': self._extract_count('Comments', video_page),
+ }
+
+
+class MurrtubeUserIE(InfoExtractor):
+ _WORKING = False
+ IE_DESC = 'Murrtube user profile'
+ _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$'
+ _TESTS = [{
+ 'url': 'https://murrtube.net/stormy',
+ 'info_dict': {
+ 'id': 'stormy',
+ },
+ 'playlist_mincount': 27,
+ }]
+ _PAGE_SIZE = 10
def _download_gql(self, video_id, op, note=None, fatal=True):
result = self._download_json(
@@ -46,73 +110,6 @@ def _download_gql(self, video_id, op, note=None, fatal=True):
headers={'Content-Type': 'application/json'})
return result['data']
- def _real_extract(self, url):
- video_id = self._match_id(url)
- data = self._download_gql(video_id, {
- 'operationName': 'Medium',
- 'variables': {
- 'id': video_id,
- },
- 'query': '''\
-query Medium($id: ID!) {
- medium(id: $id) {
- title
- description
- key
- duration
- commentsCount
- likesCount
- viewsCount
- thumbnailKey
- tagList
- user {
- name
- __typename
- }
- __typename
- }
-}'''})
- meta = data['medium']
-
- storage_url = 'https://storage.murrtube.net/murrtube/'
- format_url = storage_url + meta.get('key', '')
- thumbnail = storage_url + meta.get('thumbnailKey', '')
-
- if determine_ext(format_url) == 'm3u8':
- formats = self._extract_m3u8_formats(
- format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False)
- else:
- formats = [{'url': format_url}]
-
- return {
- 'id': video_id,
- 'title': meta.get('title'),
- 'description': meta.get('description'),
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'duration': int_or_none(meta.get('duration')),
- 'uploader': try_get(meta, lambda x: x['user']['name']),
- 'view_count': meta.get('viewsCount'),
- 'like_count': meta.get('likesCount'),
- 'comment_count': meta.get('commentsCount'),
- 'tags': meta.get('tagList'),
- 'age_limit': 18,
- }
-
-
-class MurrtubeUserIE(MurrtubeIE): # XXX: Do not subclass from concrete IE
- _WORKING = False
- IE_DESC = 'Murrtube user profile'
- _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$'
- _TEST = {
- 'url': 'https://murrtube.net/stormy',
- 'info_dict': {
- 'id': 'stormy',
- },
- 'playlist_mincount': 27,
- }
- _PAGE_SIZE = 10
-
def _fetch_page(self, username, user_id, page):
data = self._download_gql(username, {
'operationName': 'Media',
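
# How _fetch_page and _PAGE_SIZE are presumably wired together in
# MurrtubeUserIE._real_extract (the wiring is not shown in this hunk):
# OnDemandPagedList calls the page function lazily, so only the pages that are
# actually sliced get downloaded. Plain ints stand in for url_result entries.
import functools

from yt_dlp.utils import OnDemandPagedList

PAGE_SIZE = 10

def fetch_page(username, page):
    # A real implementation would call _download_gql with an offset of
    # page * PAGE_SIZE and yield the entries for `username`
    yield from range(page * PAGE_SIZE, (page + 1) * PAGE_SIZE)

entries = OnDemandPagedList(functools.partial(fetch_page, 'stormy'), PAGE_SIZE)
assert entries.getslice(0, 3) == [0, 1, 2]  # only page 0 is fetched
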
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 9d7b010c5..179e7a9b1 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -40,7 +40,6 @@ class NiconicoIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215',
- 'md5': 'd1a75c0823e2f629128c43e1212760f9',
'info_dict': {
'id': 'sm22312215',
'ext': 'mp4',
@@ -56,8 +55,8 @@ class NiconicoIE(InfoExtractor):
'comment_count': int,
'genres': ['未設定'],
'tags': [],
- 'expected_protocol': str,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# File downloaded with and without credentials are different, so omit
# the md5 field
@@ -77,8 +76,8 @@ class NiconicoIE(InfoExtractor):
'view_count': int,
'genres': ['音楽・サウンド'],
'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'],
- 'expected_protocol': str,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# 'video exists but is marked as "deleted"
# md5 is unstable
@@ -112,7 +111,6 @@ class NiconicoIE(InfoExtractor):
}, {
# video not available via `getflv`; "old" HTML5 video
'url': 'http://www.nicovideo.jp/watch/sm1151009',
- 'md5': 'f95a3d259172667b293530cc2e41ebda',
'info_dict': {
'id': 'sm1151009',
'ext': 'mp4',
@@ -128,11 +126,10 @@ class NiconicoIE(InfoExtractor):
'comment_count': int,
'genres': ['ゲーム'],
'tags': [],
- 'expected_protocol': str,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# "New" HTML5 video
- # md5 is unstable
'url': 'http://www.nicovideo.jp/watch/sm31464864',
'info_dict': {
'id': 'sm31464864',
@@ -149,12 +146,11 @@ class NiconicoIE(InfoExtractor):
'comment_count': int,
'genres': ['アニメ'],
'tags': [],
- 'expected_protocol': str,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
# Video without owner
'url': 'http://www.nicovideo.jp/watch/sm18238488',
- 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e',
'info_dict': {
'id': 'sm18238488',
'ext': 'mp4',
@@ -168,8 +164,8 @@ class NiconicoIE(InfoExtractor):
'comment_count': int,
'genres': ['エンターテイメント'],
'tags': [],
- 'expected_protocol': str,
},
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
@@ -458,9 +454,11 @@ def _real_extract(self, url):
if video_id.startswith('so'):
video_id = self._match_id(handle.url)
- api_data = self._parse_json(self._html_search_regex(
- 'data-api-data="([^"]+)"', webpage,
- 'API data', default='{}'), video_id)
+ api_data = traverse_obj(
+ self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id),
+ ('data', 'response', {dict}))
+ if not api_data:
+ raise ExtractorError('Server response data not found')
except ExtractorError as e:
try:
api_data = self._download_json(
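
# The {dict} step in the new server-response parsing above is a type filter:
# traversal only yields the value when isinstance(value, dict), so malformed
# data falls through to the explicit ExtractorError. A standalone illustration:
from yt_dlp.utils.traversal import traverse_obj

good = {'data': {'response': {'video': {'id': 'sm123'}}}}
bad = {'data': {'response': 'not a dict'}}
assert traverse_obj(good, ('data', 'response', {dict})) == {'video': {'id': 'sm123'}}
assert traverse_obj(bad, ('data', 'response', {dict})) is None
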
diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py
index becf052f6..bbf83e531 100644
--- a/yt_dlp/extractor/olympics.py
+++ b/yt_dlp/extractor/olympics.py
@@ -1,9 +1,19 @@
from .common import InfoExtractor
-from ..utils import int_or_none, try_get
+from ..networking.exceptions import HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ parse_iso8601,
+ parse_qs,
+ try_get,
+ update_url,
+ url_or_none,
+)
+from ..utils.traversal import traverse_obj
class OlympicsReplayIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)'
+ _VALID_URL = r'https?://(?:www\.)?olympics\.com/[a-z]{2}/(?:paris-2024/)?(?:replay|videos?|original-series/episode)/(?P<id>[\w-]+)'
_TESTS = [{
'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays',
'info_dict': {
@@ -11,26 +21,105 @@ class OlympicsReplayIE(InfoExtractor):
'ext': 'mp4',
'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020',
'upload_date': '20210801',
- 'timestamp': 1627783200,
+ 'timestamp': 1627797600,
'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3',
- 'uploader': 'International Olympic Committee',
- },
- 'params': {
- 'skip_download': True,
+ 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/nua4o7zwyaznoaejpbk2',
+ 'duration': 7017.0,
},
}, {
- 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp',
- 'only_matching': True,
+ 'url': 'https://olympics.com/en/original-series/episode/b-boys-and-b-girls-take-the-spotlight-breaking-life-road-to-paris-2024',
+ 'info_dict': {
+ 'id': '32633650-c5ee-4280-8b94-fb6defb6a9b5',
+ 'ext': 'mp4',
+ 'title': 'B-girl Nicka - Breaking Life, Road to Paris 2024 | Episode 1',
+ 'upload_date': '20240517',
+ 'timestamp': 1715948200,
+ 'description': 'md5:f63d728a41270ec628f6ac33ce471bb1',
+ 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/a3j96l7j6so3vyfijby1',
+ 'duration': 1321.0,
+ },
+ }, {
+ 'url': 'https://olympics.com/en/paris-2024/videos/men-s-preliminaries-gbr-esp-ned-rsa-hockey-olympic-games-paris-2024',
+ 'info_dict': {
+ 'id': '3d96db23-8eee-4b7c-8ef5-488a0361026c',
+ 'ext': 'mp4',
+ 'title': 'Men\'s Preliminaries GBR-ESP & NED-RSA | Hockey | Olympic Games Paris 2024',
+ 'upload_date': '20240727',
+ 'timestamp': 1722066600,
+ },
+ 'skip': 'Geo-restricted to RU, BR, BT, NP, TM, BD, TL',
+ }, {
+ 'url': 'https://olympics.com/en/paris-2024/videos/dnp-suni-lee-i-have-goals-and-i-have-expectations-for-myself-but-i-also-am-trying-to-give-myself-grace',
+ 'info_dict': {
+ 'id': 'a42f37ab-8a74-41d0-a7d9-af27b7b02a90',
+ 'ext': 'mp4',
+ 'title': 'md5:c7cfbc9918636a98e66400a812e4d407',
+ 'upload_date': '20240729',
+ 'timestamp': 1722288600,
+ },
}]
+ _GEO_BYPASS = False
+
+ def _extract_from_nextjs_data(self, webpage, video_id):
+ data = traverse_obj(self._search_nextjs_data(webpage, video_id, default={}), (
+ 'props', 'pageProps', 'page', 'items',
+ lambda _, v: v['name'] == 'videoPlaylist', 'data', 'currentVideo', {dict}, any))
+ if not data:
+ return None
+
+ geo_countries = traverse_obj(data, ('countries', ..., {str}))
+ if traverse_obj(data, ('geoRestrictedVideo', {bool})):
+ self.raise_geo_restricted(countries=geo_countries)
+
+ is_live = traverse_obj(data, ('streamingStatus', {str})) == 'LIVE'
+ m3u8_url = traverse_obj(data, ('videoUrl', {url_or_none})) or data['streamUrl']
+ tokenized_url = self._tokenize_url(m3u8_url, data['jwtToken'], is_live, video_id)
+
+ try:
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ tokenized_url, video_id, 'mp4', m3u8_id='hls')
+ except ExtractorError as e:
+ if isinstance(e.cause, HTTPError) and 'georestricted' in e.cause.msg:
+ self.raise_geo_restricted(countries=geo_countries)
+ raise
+
+ return {
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'is_live': is_live,
+ **traverse_obj(data, {
+ 'id': ('videoID', {str}),
+ 'title': ('title', {str}),
+ 'timestamp': ('contentDate', {parse_iso8601}),
+ }),
+ }
+
+ def _tokenize_url(self, url, token, is_live, video_id):
+ return self._download_json(
+ 'https://metering.olympics.com/tokengenerator', video_id,
+ 'Downloading tokenized m3u8 url', query={
+ **parse_qs(url),
+ 'url': update_url(url, query=None),
+ 'service-id': 'live' if is_live else 'vod',
+ 'user-auth': token,
+ })['data']['url']
+
+ def _legacy_tokenize_url(self, url, video_id):
+ return self._download_json(
+ 'https://olympics.com/tokenGenerator', video_id,
+ 'Downloading legacy tokenized m3u8 url', query={'url': url})
def _real_extract(self, url):
video_id = self._match_id(url)
-
webpage = self._download_webpage(url, video_id)
+
+ if info := self._extract_from_nextjs_data(webpage, video_id):
+ return info
+
title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage)
- uuid = self._html_search_meta('episode_uid', webpage)
+ video_uuid = self._html_search_meta('episode_uid', webpage)
m3u8_url = self._html_search_meta('video_url', webpage)
- json_ld = self._search_json_ld(webpage, uuid)
+ json_ld = self._search_json_ld(webpage, video_uuid)
thumbnails_list = json_ld.get('image')
if not thumbnails_list:
thumbnails_list = self._html_search_regex(
@@ -48,12 +137,12 @@ def _real_extract(self, url):
'width': width,
'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)),
})
- m3u8_url = self._download_json(
- f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url')
- formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, 'mp4', m3u8_id='hls')
+
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ self._legacy_tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls')
return {
- 'id': uuid,
+ 'id': video_uuid,
'title': title,
'thumbnails': thumbnails,
'formats': formats,
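
# How the _tokenize_url query above is assembled: the m3u8 URL's own query
# parameters are flattened into the token request while the bare URL is passed
# separately. The URL below is invented for illustration.
from yt_dlp.utils import parse_qs, update_url

m3u8_url = 'https://ols.example.com/master.m3u8?abr=1&sig=xyz'
query = {
    **parse_qs(m3u8_url),
    'url': update_url(m3u8_url, query=None),
    'service-id': 'vod',
}
assert query['url'] == 'https://ols.example.com/master.m3u8'
assert query['abr'] == ['1']
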
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index 7d6e8439c..4489d533a 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -420,7 +420,7 @@ def _get_comments(self, post_id):
class PatreonCampaignIE(PatreonBaseIE):
- _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P<campaign_id>\d+))|(?P<vanity>[-\w]+))'
+ _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P<campaign_id>\d+)|(?P<vanity>[-\w]+))'
_TESTS = [{
'url': 'https://www.patreon.com/dissonancepod/',
'info_dict': {
@@ -442,25 +442,44 @@ class PatreonCampaignIE(PatreonBaseIE):
'url': 'https://www.patreon.com/m/4767637/posts',
'info_dict': {
'title': 'Not Just Bikes',
- 'channel_follower_count': int,
'id': '4767637',
'channel_id': '4767637',
'channel_url': 'https://www.patreon.com/notjustbikes',
- 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1',
+ 'description': 'md5:9f4b70051216c4d5c58afe580ffc8d0f',
'age_limit': 0,
'channel': 'Not Just Bikes',
'uploader_url': 'https://www.patreon.com/notjustbikes',
- 'uploader': 'Not Just Bikes',
+ 'uploader': 'Jason',
'uploader_id': '37306634',
'thumbnail': r're:^https?://.*$',
},
'playlist_mincount': 71,
+ }, {
+ 'url': 'https://www.patreon.com/api/campaigns/4243769/posts',
+ 'info_dict': {
+ 'title': 'Second Thought',
+ 'channel_follower_count': int,
+ 'id': '4243769',
+ 'channel_id': '4243769',
+ 'channel_url': 'https://www.patreon.com/secondthought',
+ 'description': 'md5:69c89a3aba43efdb76e85eb023e8de8b',
+ 'age_limit': 0,
+ 'channel': 'Second Thought',
+ 'uploader_url': 'https://www.patreon.com/secondthought',
+ 'uploader': 'JT Chapman',
+ 'uploader_id': '32718287',
+ 'thumbnail': r're:^https?://.*$',
+ },
+ 'playlist_mincount': 201,
}, {
'url': 'https://www.patreon.com/dissonancepod/posts',
'only_matching': True,
}, {
'url': 'https://www.patreon.com/m/5932659',
'only_matching': True,
+ }, {
+ 'url': 'https://www.patreon.com/api/campaigns/4243769',
+ 'only_matching': True,
}]
@classmethod
diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py
index 726fe4142..72e89c31e 100644
--- a/yt_dlp/extractor/picarto.py
+++ b/yt_dlp/extractor/picarto.py
@@ -5,6 +5,7 @@
ExtractorError,
str_or_none,
traverse_obj,
+ update_url,
)
@@ -43,15 +44,16 @@ def _real_extract(self, url):
url
}
}''' % (channel_id, channel_id), # noqa: UP031
- })['data']
+ }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data']
metadata = data['channel']
if metadata.get('online') == 0:
raise ExtractorError('Stream is offline', expected=True)
title = metadata['title']
- cdn_data = self._download_json(
- data['getLoadBalancerUrl']['url'] + '/stream/json_' + metadata['stream_name'] + '.js',
+ cdn_data = self._download_json(''.join((
+ update_url(data['getLoadBalancerUrl']['url'], scheme='https'),
+ '/stream/json_', metadata['stream_name'], '.js')),
channel_id, 'Downloading load balancing info')
formats = []
@@ -99,10 +101,10 @@ class PicartoVodIE(InfoExtractor):
},
'skip': 'The VOD does not exist',
}, {
- 'url': 'https://picarto.tv/ArtofZod/videos/772650',
- 'md5': '00067a0889f1f6869cc512e3e79c521b',
+ 'url': 'https://picarto.tv/ArtofZod/videos/771008',
+ 'md5': 'abef5322f2700d967720c4c6754b2a34',
'info_dict': {
- 'id': '772650',
+ 'id': '771008',
'ext': 'mp4',
'title': 'Art of Zod - Drawing and Painting',
'thumbnail': r're:^https?://.*\.jpg',
@@ -131,7 +133,7 @@ def _real_extract(self, url):
}}
}}
}}''',
- })['data']['video']
+ }, headers={'Accept': '*/*', 'Content-Type': 'application/json'})['data']['video']
file_name = data['file_name']
netloc = urllib.parse.urlparse(data['video_recording_image_url']).netloc
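
# The load-balancer fix above forces the URL to https; update_url replaces a
# single URL component in place. The hostname here is invented.
from yt_dlp.utils import update_url

lb_url = 'http://edge1.picarto.example/stream'
assert update_url(lb_url, scheme='https') == 'https://edge1.picarto.example/stream'
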
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 0c6f0b070..4f8d96407 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -314,23 +314,11 @@ def add_format(f, protocol, is_preview=False):
self.write_debug(f'"{identifier}" is not a requested format, skipping')
continue
- stream = None
- for retry in self.RetryManager(fatal=False):
- try:
- stream = self._call_api(
- format_url, track_id, f'Downloading {identifier} format info JSON',
- query=query, headers=self._HEADERS)
- except ExtractorError as e:
- if isinstance(e.cause, HTTPError) and e.cause.status == 429:
- self.report_warning(
- 'You have reached the API rate limit, which is ~600 requests per '
- '10 minutes. Use the --extractor-retries and --retry-sleep options '
- 'to configure an appropriate retry count and wait time', only_once=True)
- retry.error = e.cause
- else:
- self.report_warning(e.msg)
+ # XXX: if not extract_flat, 429 error must be caught where _extract_info_dict is called
+ stream_url = traverse_obj(self._call_api(
+ format_url, track_id, f'Downloading {identifier} format info JSON',
+ query=query, headers=self._HEADERS), ('url', {url_or_none}))
- stream_url = traverse_obj(stream, ('url', {url_or_none}))
if invalid_url(stream_url):
continue
format_urls.add(stream_url)
@@ -647,7 +635,17 @@ def _real_extract(self, url):
info = self._call_api(
info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
- return self._extract_info_dict(info, full_title, token)
+ for retry in self.RetryManager():
+ try:
+ return self._extract_info_dict(info, full_title, token)
+ except ExtractorError as e:
+ if not isinstance(e.cause, HTTPError) or not e.cause.status == 429:
+ raise
+ self.report_warning(
+ 'You have reached the API rate limit, which is ~600 requests per '
+ '10 minutes. Use the --extractor-retries and --retry-sleep options '
+ 'to configure an appropriate retry count and wait time', only_once=True)
+ retry.error = e.cause
class SoundcloudPlaylistBaseIE(SoundcloudBaseIE):
@@ -873,7 +871,7 @@ class SoundcloudUserPermalinkIE(SoundcloudPagedPlaylistBaseIE):
'id': '30909869',
'title': 'neilcic',
},
- 'playlist_mincount': 23,
+ 'playlist_mincount': 22,
}]
def _real_extract(self, url):
@@ -882,7 +880,7 @@ def _real_extract(self, url):
self._resolv_url(url), user_id, 'Downloading user info', headers=self._HEADERS)
return self._extract_playlist(
- f'{self._API_V2_BASE}stream/users/{user["id"]}', str(user['id']), user.get('username'))
+ f'{self._API_V2_BASE}users/{user["id"]}/tracks', str(user['id']), user.get('username'))
class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
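
# The relocated 429 handling follows yt-dlp's RetryManager protocol: iterate,
# attempt, and assign `retry.error` to schedule another pass. A standalone
# sketch with a plain callback in place of the extractor-level wrapper:
from yt_dlp.utils import RetryManager

def report_retry(err, count, retries):
    print(f'attempt {count} failed: {err}')

for retry in RetryManager(3, report_retry):
    try:
        raise ConnectionError('HTTP Error 429')  # stand-in for the rate-limited call
    except ConnectionError as err:
        retry.error = err
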
diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py
index b4835c5ad..2d6fb3eb4 100644
--- a/yt_dlp/extractor/swearnet.py
+++ b/yt_dlp/extractor/swearnet.py
@@ -1,55 +1,31 @@
-from .common import InfoExtractor
-from ..utils import ExtractorError, int_or_none, traverse_obj
+from .vidyard import VidyardBaseIE
+from ..utils import ExtractorError, int_or_none, make_archive_id
-class SwearnetEpisodeIE(InfoExtractor):
+class SwearnetEpisodeIE(VidyardBaseIE):
_VALID_URL = r'https?://www\.swearnet\.com/shows/(?P<id>[\w-]+)/seasons/(?P<season_num>\d+)/episodes/(?P<episode_num>\d+)'
_TESTS = [{
'url': 'https://www.swearnet.com/shows/gettin-learnt-with-ricky/seasons/1/episodes/1',
'info_dict': {
- 'id': '232819',
+ 'id': 'wicK2EOzjOdxkUXGDIgcPw',
+ 'display_id': '232819',
'ext': 'mp4',
'episode_number': 1,
'episode': 'Episode 1',
'duration': 719,
- 'description': 'md5:c48ef71440ce466284c07085cd7bd761',
+ 'description': r're:Are you drunk and high and craving a grilled cheese sandwich.+',
'season': 'Season 1',
'title': 'Episode 1 - Grilled Cheese Sammich',
'season_number': 1,
- 'thumbnail': 'https://cdn.vidyard.com/thumbnails/232819/_RX04IKIq60a2V6rIRqq_Q_small.jpg',
+ 'thumbnail': 'https://cdn.vidyard.com/thumbnails/custom/0dd74f9b-388a-452e-b570-b407fb64435b_small.jpg',
+ 'tags': ['Getting Learnt with Ricky', 'drunk', 'grilled cheese', 'high'],
+ '_old_archive_ids': ['swearnetepisode 232819'],
},
}]
- def _get_formats_and_subtitle(self, video_source, video_id):
- video_source = video_source or {}
- formats, subtitles = [], {}
- for key, value in video_source.items():
- if key == 'hls':
- for video_hls in value:
- fmts, subs = self._extract_m3u8_formats_and_subtitles(video_hls.get('url'), video_id)
- formats.extend(fmts)
- self._merge_subtitles(subs, target=subtitles)
- else:
- formats.extend({
- 'url': video_mp4.get('url'),
- 'ext': 'mp4',
- } for video_mp4 in value)
-
- return formats, subtitles
-
- def _get_direct_subtitle(self, caption_json):
- subs = {}
- for caption in caption_json:
- subs.setdefault(caption.get('language') or 'und', []).append({
- 'url': caption.get('vttUrl'),
- 'name': caption.get('name'),
- })
-
- return subs
-
def _real_extract(self, url):
- display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num')
- webpage = self._download_webpage(url, display_id)
+ slug, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num')
+ webpage = self._download_webpage(url, slug)
try:
external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid')
@@ -58,22 +34,12 @@ def _real_extract(self, url):
self.raise_login_required()
raise
- json_data = self._download_json(
- f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0]
-
- formats, subtitles = self._get_formats_and_subtitle(json_data['sources'], display_id)
- self._merge_subtitles(self._get_direct_subtitle(json_data.get('captions')), target=subtitles)
+ info = self._process_video_json(self._fetch_video_json(external_id)['chapters'][0], external_id)
+ if info.get('display_id'):
+ info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])]
return {
- 'id': str(json_data['videoId']),
- 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage),
- 'description': (json_data.get('description')
- or self._html_search_meta(['og:description', 'twitter:description'], webpage)),
- 'duration': int_or_none(json_data.get('seconds')),
- 'formats': formats,
- 'subtitles': subtitles,
+ **info,
'season_number': int_or_none(season_number),
'episode_number': int_or_none(episode_number),
- 'thumbnails': [{'url': thumbnail_url}
- for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))],
}
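
# The _old_archive_ids entry above keeps existing --download-archive records
# valid across the switch to Vidyard IDs: make_archive_id builds the lowercased
# '<ie_key> <id>' string the archive file stores.
from yt_dlp.utils import make_archive_id

assert make_archive_id('SwearnetEpisode', '232819') == 'swearnetepisode 232819'
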
diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index c3505b14f..9d823a315 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -23,7 +23,6 @@
mimetype2ext,
parse_qs,
qualities,
- remove_start,
srt_subtitles_timecode,
str_or_none,
traverse_obj,
@@ -254,7 +253,16 @@ def _extract_web_data_and_status(self, url, video_id, fatal=True):
def _get_subtitles(self, aweme_detail, aweme_id, user_name):
# TODO: Extract text positioning info
+
+ EXT_MAP = { # From lowest to highest preference
+ 'creator_caption': 'json',
+ 'srt': 'srt',
+ 'webvtt': 'vtt',
+ }
+ preference = qualities(tuple(EXT_MAP.values()))
+
subtitles = {}
+
# aweme/detail endpoint subs
captions_info = traverse_obj(
aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict)
@@ -278,8 +286,8 @@ def _get_subtitles(self, aweme_detail, aweme_id, user_name):
if not caption.get('url'):
continue
subtitles.setdefault(caption.get('lang') or 'en', []).append({
- 'ext': remove_start(caption.get('caption_format'), 'web'),
'url': caption['url'],
+ 'ext': EXT_MAP.get(caption.get('caption_format')),
})
# webpage subs
if not subtitles:
@@ -288,9 +296,14 @@ def _get_subtitles(self, aweme_detail, aweme_id, user_name):
self._create_url(user_name, aweme_id), aweme_id, fatal=False)
for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', lambda _, v: v['Url'])):
subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({
- 'ext': remove_start(caption.get('Format'), 'web'),
'url': caption['Url'],
+ 'ext': EXT_MAP.get(caption.get('Format')),
})
+
+ # Deprioritize creator_caption json since it can't be embedded or used by media players
+ for lang, subs_list in subtitles.items():
+ subtitles[lang] = sorted(subs_list, key=lambda x: preference(x['ext']))
+
return subtitles
def _parse_url_key(self, url_key):
@@ -1458,9 +1471,11 @@ def _real_extract(self, url):
if webpage:
data = self._get_sigi_state(webpage, uploader or room_id)
- room_id = (traverse_obj(data, ('UserModule', 'users', ..., 'roomId', {str_or_none}), get_all=False)
- or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=None)
- or room_id)
+ room_id = (
+ traverse_obj(data, ((
+ ('LiveRoom', 'liveRoomUserInfo', 'user'),
+ ('UserModule', 'users', ...)), 'roomId', {str}, any))
+ or self._search_regex(r'snssdk\d*://live\?room_id=(\d+)', webpage, 'room ID', default=room_id))
uploader = uploader or traverse_obj(
data, ('LiveRoom', 'liveRoomUserInfo', 'user', 'uniqueId'),
('UserModule', 'users', ..., 'uniqueId'), get_all=False, expected_type=str)
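
# How the subtitle re-ordering above works: qualities() returns a ranking
# function over the listed values (lowest to highest preference), so sorting by
# it puts creator_caption json first and vtt last.
from yt_dlp.utils import qualities

preference = qualities(('json', 'srt', 'vtt'))
subs = [{'ext': 'vtt'}, {'ext': 'json'}, {'ext': 'srt'}]
assert [s['ext'] for s in sorted(subs, key=lambda x: preference(x['ext']))] == ['json', 'srt', 'vtt']
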
diff --git a/yt_dlp/extractor/toggle.py b/yt_dlp/extractor/toggle.py
index de2e03f17..fbef7cc0f 100644
--- a/yt_dlp/extractor/toggle.py
+++ b/yt_dlp/extractor/toggle.py
@@ -28,35 +28,11 @@ class ToggleIE(InfoExtractor):
'skip_download': 'm3u8 download',
},
}, {
- 'note': 'DRM-protected video',
'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413',
- 'info_dict': {
- 'id': '341413',
- 'ext': 'wvm',
- 'title': 'Dug\'s Special Mission',
- 'description': 'md5:e86c6f4458214905c1772398fabc93e0',
- 'upload_date': '20150827',
- 'timestamp': 1440644006,
- },
- 'params': {
- 'skip_download': 'DRM-protected wvm download',
- },
+ 'only_matching': True,
}, {
- # this also tests correct video id extraction
- 'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay',
'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861',
- 'info_dict': {
- 'id': '332861',
- 'ext': 'mp4',
- 'title': '28th SEA Games (5 Show) - Episode 11',
- 'description': 'md5:3cd4f5f56c7c3b1340c50a863f896faa',
- 'upload_date': '20150605',
- 'timestamp': 1433480166,
- },
- 'params': {
- 'skip_download': 'DRM-protected wvm download',
- },
- 'skip': 'm3u8 links are geo-restricted',
+ 'only_matching': True,
}, {
'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331',
'only_matching': True,
diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py
index 52ff230f2..953eb77ed 100644
--- a/yt_dlp/extractor/tv5mondeplus.py
+++ b/yt_dlp/extractor/tv5mondeplus.py
@@ -96,7 +96,7 @@ def _extract_subtitles(data_captions):
def _real_extract(self, url):
display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
+ webpage = self._download_webpage(url, display_id, impersonate=True)
if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
self.raise_geo_restricted(countries=['FR'])
@@ -122,8 +122,9 @@ def process_video_files(v):
if not token:
continue
deferred_json = self._download_json(
- f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id,
- note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False)
+ f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true',
+ display_id, 'Downloading deferred info', fatal=False, impersonate=True,
+ headers={'Authorization': f'Bearer {token}'})
v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none}))
if not v_url:
continue
diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py
index e3e10557c..d702640f3 100644
--- a/yt_dlp/extractor/tva.py
+++ b/yt_dlp/extractor/tva.py
@@ -1,60 +1,29 @@
import functools
import re
+from .brightcove import BrightcoveNewIE
from .common import InfoExtractor
from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none
from ..utils.traversal import traverse_obj
class TVAIE(InfoExtractor):
- _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)'
+ IE_NAME = 'tvaplus'
+ IE_DESC = 'TVA+'
+ _VALID_URL = r'https?://(?:www\.)?tvaplus\.ca/(?:[^/?#]+/)*[\w-]+-(?P<id>\d+)(?:$|[#?])'
_TESTS = [{
- 'url': 'https://videos.tva.ca/details/_5596811470001',
- 'info_dict': {
- 'id': '5596811470001',
- 'ext': 'mp4',
- 'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !',
- 'uploader_id': '5481942443001',
- 'upload_date': '20171003',
- 'timestamp': 1507064617,
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
- 'skip': 'HTTP Error 404: Not Found',
- }, {
- 'url': 'https://video.tva.ca/details/_5596811470001',
- 'only_matching': True,
- }]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s'
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}),
- 'ie_key': 'BrightcoveNew',
- }
-
-
-class QubIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)'
- _TESTS = [{
- 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619',
+ 'url': 'https://www.tvaplus.ca/tva/alerte-amber/saison-1/episode-01-1000036619',
'md5': '949490fd0e7aee11d0543777611fbd53',
'info_dict': {
'id': '6084352463001',
'ext': 'mp4',
- 'title': 'Ép 01. Mon dernier jour',
+ 'title': 'Mon dernier jour',
'uploader_id': '5481942443001',
'upload_date': '20190907',
'timestamp': 1567899756,
'description': 'md5:9c0d7fbb90939420c651fd977df90145',
'thumbnail': r're:https://.+\.jpg',
- 'episode': 'Ép 01. Mon dernier jour',
+ 'episode': 'Mon dernier jour',
'episode_number': 1,
'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'],
'duration': 2625.963,
@@ -64,23 +33,36 @@ class QubIE(InfoExtractor):
'channel': 'TVA',
},
}, {
- 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943',
- 'only_matching': True,
+ 'url': 'https://www.tvaplus.ca/tva/le-baiser-du-barbu/le-baiser-du-barbu-886644190',
+ 'info_dict': {
+ 'id': '6354448043112',
+ 'ext': 'mp4',
+ 'title': 'Le Baiser du barbu',
+ 'uploader_id': '5481942443001',
+ 'upload_date': '20240606',
+ 'timestamp': 1717694023,
+ 'description': 'md5:025b1219086c1cbf4bc27e4e034e8b57',
+ 'thumbnail': r're:https://.+\.jpg',
+ 'episode': 'Le Baiser du barbu',
+ 'tags': ['fullepisode', 'films'],
+ 'duration': 6053.504,
+ 'series': 'Le Baiser du barbu',
+ 'channel': 'TVA',
+ },
}]
- # reference_id also works with old account_id(5481942443001)
- # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s'
+ _BC_URL_TMPL = 'https://players.brightcove.net/5481942443001/default_default/index.html?videoId={}'
def _real_extract(self, url):
entity_id = self._match_id(url)
webpage = self._download_webpage(url, entity_id)
- entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData']
+ entity = self._search_nextjs_data(webpage, entity_id)['props']['pageProps']['staticEntity']
video_id = entity['videoId']
episode = strip_or_none(entity.get('name'))
return {
'_type': 'url_transparent',
- 'url': f'https://videos.tva.ca/details/_{video_id}',
- 'ie_key': TVAIE.ie_key(),
+ 'url': smuggle_url(self._BC_URL_TMPL.format(video_id), {'geo_countries': ['CA']}),
+ 'ie_key': BrightcoveNewIE.ie_key(),
'id': video_id,
'title': episode,
'episode': episode,
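
# The Brightcove hand-off above smuggles geo info into the URL so that
# BrightcoveNewIE can enforce it; smuggle_url/unsmuggle_url round-trip
# JSON-safe data through a URL fragment.
from yt_dlp.utils import smuggle_url, unsmuggle_url

bc_url = smuggle_url(
    'https://players.brightcove.net/5481942443001/default_default/index.html?videoId=6084352463001',
    {'geo_countries': ['CA']})
assert unsmuggle_url(bc_url) == (
    'https://players.brightcove.net/5481942443001/default_default/index.html?videoId=6084352463001',
    {'geo_countries': ['CA']})
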
diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py
index 8105db41c..c13832c6f 100644
--- a/yt_dlp/extractor/tver.py
+++ b/yt_dlp/extractor/tver.py
@@ -10,7 +10,7 @@
class TVerIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature|tokyo2020/video)/)+(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature|tokyo2020/video|olympic/paris2024/video)/)+(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'skip': 'videos are only available for 7 days',
'url': 'https://tver.jp/episodes/ep83nf3w4p',
@@ -23,6 +23,20 @@ class TVerIE(InfoExtractor):
'channel': 'テレビ朝日',
},
'add_ie': ['BrightcoveNew'],
+ }, {
+ 'url': 'https://tver.jp/olympic/paris2024/video/6359578055112/',
+ 'info_dict': {
+ 'id': '6359578055112',
+ 'ext': 'mp4',
+ 'title': '堀米雄斗 金メダルで五輪連覇!「みんなの応援が最後に乗れたカギ」',
+ 'timestamp': 1722279928,
+ 'upload_date': '20240729',
+ 'tags': ['20240729', 'japanese', 'japanmedal', 'paris'],
+ 'uploader_id': '4774017240001',
+ 'thumbnail': r're:https?://[^/?#]+boltdns\.net/[^?#]+/1920x1080/match/image\.jpg',
+ 'duration': 670.571,
+ },
+ 'params': {'skip_download': 'm3u8'},
}, {
'url': 'https://tver.jp/corner/f0103888',
'only_matching': True,
@@ -47,7 +61,15 @@ def _real_initialize(self):
def _real_extract(self, url):
video_id, video_type = self._match_valid_url(url).group('id', 'type')
- if video_type not in {'series', 'episodes'}:
+
+ if video_type == 'olympic/paris2024/video':
+ # Player ID is taken from .content.brightcove.E200.pro.pc.account_id:
+ # https://tver.jp/olympic/paris2024/req/api/hook?q=https%3A%2F%2Folympic-assets.tver.jp%2Fweb-static%2Fjson%2Fconfig.json&d=
+ return self.url_result(smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % ('4774017240001', video_id),
+ {'geo_countries': ['JP']}), 'BrightcoveNew')
+
+ elif video_type not in {'series', 'episodes'}:
webpage = self._download_webpage(url, video_id, note='Resolving to new URL')
video_id = self._match_id(self._search_regex(
(r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'),
diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py
index 1e2d118aa..8b7ec1dd9 100644
--- a/yt_dlp/extractor/unsupported.py
+++ b/yt_dlp/extractor/unsupported.py
@@ -49,6 +49,7 @@ class KnownDRMIE(UnsupportedInfoExtractor):
r'amazon\.(?:\w{2}\.)?\w+/gp/video',
r'music\.amazon\.(?:\w{2}\.)?\w+',
r'(?:watch|front)\.njpwworld\.com',
+ r'qub\.ca/vrai',
)
_TESTS = [{
@@ -149,6 +150,9 @@ class KnownDRMIE(UnsupportedInfoExtractor):
}, {
'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs',
'only_matching': True,
+ }, {
+ 'url': 'https://www.qub.ca/vrai/l-effet-bocuse-d-or/saison-1/l-effet-bocuse-d-or-saison-1-bande-annonce-1098225063',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py
new file mode 100644
index 000000000..20a54b161
--- /dev/null
+++ b/yt_dlp/extractor/vidyard.py
@@ -0,0 +1,426 @@
+import functools
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ extract_attributes,
+ float_or_none,
+ int_or_none,
+ join_nonempty,
+ mimetype2ext,
+ parse_resolution,
+ str_or_none,
+ unescapeHTML,
+ url_or_none,
+)
+from ..utils.traversal import traverse_obj
+
+
+class VidyardBaseIE(InfoExtractor):
+ _HEADERS = {'Referer': 'https://play.vidyard.com/'}
+
+ def _get_formats_and_subtitles(self, sources, video_id):
+ formats, subtitles = [], {}
+
+ def add_hls_fmts_and_subs(m3u8_url):
+ fmts, subs = self._extract_m3u8_formats_and_subtitles(
+ m3u8_url, video_id, 'mp4', m3u8_id='hls', headers=self._HEADERS, fatal=False)
+ formats.extend(fmts)
+ self._merge_subtitles(subs, target=subtitles)
+
+ hls_list = isinstance(sources, dict) and sources.pop('hls', None)
+ if master_m3u8_url := traverse_obj(
+ hls_list, (lambda _, v: v['profile'] == 'auto', 'url', {url_or_none}, any)):
+ add_hls_fmts_and_subs(master_m3u8_url)
+ if not formats: # These are duplicate and unnecessary requests if we got 'auto' hls fmts
+ for variant_m3u8_url in traverse_obj(hls_list, (..., 'url', {url_or_none})):
+ add_hls_fmts_and_subs(variant_m3u8_url)
+
+ for source_type, source_list in traverse_obj(sources, ({dict.items}, ...)):
+ for source in traverse_obj(source_list, lambda _, v: url_or_none(v['url'])):
+ profile = source.get('profile')
+ formats.append({
+ 'url': source['url'],
+ 'ext': mimetype2ext(source.get('mimeType'), default=None),
+ 'format_id': join_nonempty('http', source_type, profile),
+ **parse_resolution(profile),
+ })
+
+ self._remove_duplicate_formats(formats)
+ return formats, subtitles
+
+ def _get_direct_subtitles(self, caption_json):
+ subs = {}
+ for caption in traverse_obj(caption_json, lambda _, v: url_or_none(v['vttUrl'])):
+ subs.setdefault(caption.get('language') or 'und', []).append({
+ 'url': caption['vttUrl'],
+ 'name': caption.get('name'),
+ })
+
+ return subs
+
+ def _fetch_video_json(self, video_id):
+ return self._download_json(
+ f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload']
+
+ def _process_video_json(self, json_data, video_id):
+ formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id)
+ self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles)
+
+ return {
+ **traverse_obj(json_data, {
+ 'id': ('facadeUuid', {str}),
+ 'display_id': ('videoId', {int}, {str_or_none}),
+ 'title': ('name', {str}),
+ 'description': ('description', {str}, {unescapeHTML}, {lambda x: x or None}),
+ 'duration': ((
+ ('milliseconds', {functools.partial(float_or_none, scale=1000)}),
+ ('seconds', {int_or_none})), any),
+ 'thumbnails': ('thumbnailUrls', ('small', 'normal'), {'url': {url_or_none}}),
+ 'tags': ('tags', ..., 'name', {str}),
+ }),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'http_headers': self._HEADERS,
+ }
+
+
+class VidyardIE(VidyardBaseIE):
+ _VALID_URL = [
+ r'https?://[\w-]+(?:\.hubs)?\.vidyard\.com/watch/(?P<id>[\w-]+)',
+ r'https?://(?:embed|share)\.vidyard\.com/share/(?P<id>[\w-]+)',
+ r'https?://play\.vidyard\.com/(?:player/)?(?P<id>[\w-]+)',
+ ]
+ _EMBED_REGEX = [r'