diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 5835a5a8d3..6ee3f75e1a 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -56,10 +56,10 @@ def _extract_embed_urls(cls, url, webpage): yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() raise cls.StopExtraction() - def _extract_video_formats(self, video_id, username): + def _extract_video_formats(self, video_id, url): formats, subtitles = [], {} for video_format in ('hls', 'mp4'): - video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + video_url = urllib.parse.urljoin(url, f'/api/v1/video/upload/{video_id}/src?type={video_format}') if video_format == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) @@ -81,12 +81,17 @@ def _real_extract(self, url): r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) + canonical_url = url + domain = traverse_obj(webpage_info, ('domainInfo', 'customDomain', {str})) + if domain: + canonical_url = urllib.parse.urlparse(url)._replace(netloc=domain).geturl() + post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} elif post_type == 'video': - formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: self.raise_no_formats(f'Page type "{post_type}" is not supported') @@ -99,4 +104,5 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), 'uploader': traverse_obj(webpage_info, ('pub', 'name')), 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + 'webpage_url': canonical_url, }