From f80ba18ee9b45cab392ca753a71b5bf3bdb4bd40 Mon Sep 17 00:00:00 2001 From: "Renan D." Date: Fri, 3 May 2024 19:27:49 -0300 Subject: [PATCH 1/5] [threads] Add extractor --- supportedsites.md | 2 + yt_dlp/extractor/_extractors.py | 4 + yt_dlp/extractor/threads.py | 157 ++++++++++++++++++++++++++++++++ 3 files changed, 163 insertions(+) create mode 100644 yt_dlp/extractor/threads.py diff --git a/supportedsites.md b/supportedsites.md index ba77c0feb0..8de524a1d8 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -1449,6 +1449,8 @@ # Supported sites - **ThisVid** - **ThisVidMember** - **ThisVidPlaylist** + - **Threads** + - **ThreadsIOS**: Threads' iOS `barcelona://` URL - **ThreeSpeak** - **ThreeSpeakUser** - **TikTok** diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42034275b9..c7e1174c37 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1988,6 +1988,10 @@ ThisVidMemberIE, ThisVidPlaylistIE, ) +from .threads import ( + ThreadsIE, + ThreadsIOSIE +) from .threespeak import ( ThreeSpeakIE, ThreeSpeakUserIE, diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py new file mode 100644 index 0000000000..890fd8b976 --- /dev/null +++ b/yt_dlp/extractor/threads.py @@ -0,0 +1,157 @@ +from .common import InfoExtractor +from ..utils import ( + strftime_or_none, + traverse_obj, + remove_end, + strip_or_none +) + + +class ThreadsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?threads\.net/(?P[^/]+)/post/(?P[^/?#&]+)/?(?Pembed.*?)?' + + _TESTS = [{ + 'url': 'https://www.threads.net/@tntsportsbr/post/C6cqebdCfBi', + 'info_dict': { + 'id': 'C6cqebdCfBi', + 'ext': 'mp4', + 'title': 'md5:062673d04195aa2d99b8d7a11798cb9d', + 'description': 'md5:fe0c73f9a892fb92efcc67cc075561b0', + 'uploader': 'TNT Sports Brasil', + 'uploader_id': 'tntsportsbr', + 'uploader_url': 'https://www.threads.net/@tntsportsbr', + 'channel': 'tntsportsbr', + 'channel_url': 'https://www.threads.net/@tntsportsbr', + 'timestamp': 1714613811, + 'upload_date': '20240502', + 'like_count': int, + 'channel_is_verified': bool, + 'thumbnail': r're:^https?://.*\.jpg' + } + }, { + 'url': 'https://www.threads.net/@felipebecari/post/C6cM_yNPHCF', + 'info_dict': { + 'id': 'C6cM_yNPHCF', + 'ext': 'mp4', + 'title': '@felipebecari • Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️', + 'description': 'Sobre o futuro dos dois últimos resgatados: tem muita notícia boa! 🐶❤️', + 'uploader': 'Felipe Becari', + 'uploader_id': 'felipebecari', + 'uploader_url': 'https://www.threads.net/@felipebecari', + 'channel': 'felipebecari', + 'channel_url': 'https://www.threads.net/@felipebecari', + 'timestamp': 1714598318, + 'upload_date': '20240501', + 'like_count': int, + 'channel_is_verified': bool, + 'thumbnail': r're:^https?://.*\.jpg' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + metadata = {} + + # Try getting videos from json + json_data = self._search_regex( + r']+>(.*"code":"%s".*)' % video_id, + webpage, 'main json', fatal=True) + + result = self._search_json( + r'"result":', json_data, + 'result data', video_id, fatal=True) + + edges = traverse_obj(result, ('data', 'data', 'edges')) + + for node in edges: + items = traverse_obj(node, ('node', 'thread_items')) + + for item in items: + post = item.get('post') + + if post and post.get('code') == video_id: + formats = [] + thumbnails = [] + + # Videos + if (post.get('carousel_media') is not None): # Handle multiple videos posts + media_list = post.get('carousel_media') + else: + media_list = [post] + + for media in media_list: + videos = media.get('video_versions') + + for video in videos: + formats.append({ + 'format_id': '%s-%s' % (media.get('pk'), video['type']), # id-type + 'url': video['url'], + 'width': media.get('original_width'), + 'height': media.get('original_height'), + }) + + # Thumbnails + thumbs = traverse_obj(post, ('image_versions2', 'candidates')) + + for thumb in thumbs: + thumbnails.append({ + 'url': thumb['url'], + 'width': thumb['width'], + 'height': thumb['height'], + }) + + # Metadata + metadata.setdefault('uploader_id', traverse_obj(post, ('user', 'username'))) + metadata.setdefault('channel_is_verified', traverse_obj(post, ('user', 'is_verified'))) + metadata.setdefault('uploader_url', 'https://www.threads.net/@%s' % traverse_obj(post, ('user', 'username'))) + metadata.setdefault('timestamp', post.get('taken_at')) + metadata.setdefault('like_count', post.get('like_count')) + + # Try getting metadata + metadata['id'] = video_id + metadata['title'] = strip_or_none(remove_end(self._html_extract_title(webpage), '• Threads')) + metadata['description'] = self._og_search_description(webpage) + + metadata['channel'] = metadata.get('uploader_id') + metadata['channel_url'] = metadata.get('uploader_url') + metadata['uploader'] = self._search_regex(r'(.*?) \(', self._og_search_title(webpage), 'uploader') + metadata['upload_date'] = strftime_or_none(metadata.get('timestamp')) + + return { + **metadata, + 'formats': formats, + 'thumbnails': thumbnails + } + + +class ThreadsIOSIE(InfoExtractor): + IE_DESC = 'IOS barcelona:// URL' + _VALID_URL = r'barcelona://media\?shortcode=(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'barcelona://media?shortcode=C6fDehepo5D', + 'info_dict': { + 'id': 'C6fDehepo5D', + 'ext': 'mp4', + 'title': 'md5:dc92f960981b8b3a33eba9681e9fdfc6', + 'description': 'md5:0c36a7e67e1517459bc0334dba932164', + 'uploader': 'Sa\u0303o Paulo Futebol Clube', + 'uploader_id': 'saopaulofc', + 'uploader_url': 'https://www.threads.net/@saopaulofc', + 'channel': 'saopaulofc', + 'channel_url': 'https://www.threads.net/@saopaulofc', + 'timestamp': 1714694014, + 'upload_date': '20240502', + 'like_count': int, + 'channel_is_verified': bool, + 'thumbnail': r're:^https?://.*\.jpg' + }, + 'add_ie': ['Threads'] + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Threads doesn't care about the user url, it redirects to the right one + # So we use ** instead so that we don't need to find it + return self.url_result(f'http://www.threads.net/**/post/{video_id}', ThreadsIE, video_id) From 32298c6d973bbf960a3e06f469730f09b02580e1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 1 Jun 2024 19:35:29 +0000 Subject: [PATCH 2/5] ruff --- yt_dlp/extractor/threads.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py index 890fd8b976..d20777fc93 100644 --- a/yt_dlp/extractor/threads.py +++ b/yt_dlp/extractor/threads.py @@ -1,10 +1,10 @@ from .common import InfoExtractor from ..utils import ( - strftime_or_none, - traverse_obj, remove_end, - strip_or_none + strftime_or_none, + strip_or_none, ) +from ..utils.traversal import traverse_obj class ThreadsIE(InfoExtractor): From e0eefb2c5a1923d24ec045f24ba83d68c9f4634d Mon Sep 17 00:00:00 2001 From: "Renan D." Date: Fri, 9 Aug 2024 15:30:43 -0300 Subject: [PATCH 3/5] [ie/threads] Fix multi mixed media extraction --- yt_dlp/extractor/threads.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py index 890fd8b976..6179366552 100644 --- a/yt_dlp/extractor/threads.py +++ b/yt_dlp/extractor/threads.py @@ -75,7 +75,7 @@ def _real_extract(self, url): thumbnails = [] # Videos - if (post.get('carousel_media') is not None): # Handle multiple videos posts + if post.get('carousel_media') is not None: # Handle multiple videos posts media_list = post.get('carousel_media') else: media_list = [post] @@ -83,13 +83,14 @@ def _real_extract(self, url): for media in media_list: videos = media.get('video_versions') - for video in videos: - formats.append({ - 'format_id': '%s-%s' % (media.get('pk'), video['type']), # id-type - 'url': video['url'], - 'width': media.get('original_width'), - 'height': media.get('original_height'), - }) + if videos: + for video in videos: + formats.append({ + 'format_id': '%s-%s' % (media.get('pk'), video['type']), # id-type + 'url': video['url'], + 'width': media.get('original_width'), + 'height': media.get('original_height'), + }) # Thumbnails thumbs = traverse_obj(post, ('image_versions2', 'candidates')) From 9989f2ab3b4b09f0c5ca60ef9161de25b622574b Mon Sep 17 00:00:00 2001 From: "Renan D." Date: Fri, 9 Aug 2024 15:35:53 -0300 Subject: [PATCH 4/5] [ie/threads] Lint --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/threads.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index e4503003ae..248d47d863 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2081,7 +2081,7 @@ ) from .threads import ( ThreadsIE, - ThreadsIOSIE + ThreadsIOSIE, ) from .threeqsdn import ThreeQSDNIE from .threespeak import ( diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py index 6179366552..fba0bd4fc4 100644 --- a/yt_dlp/extractor/threads.py +++ b/yt_dlp/extractor/threads.py @@ -1,10 +1,10 @@ from .common import InfoExtractor from ..utils import ( - strftime_or_none, - traverse_obj, remove_end, - strip_or_none + strftime_or_none, + strip_or_none, ) +from ..utils.traversal import traverse_obj class ThreadsIE(InfoExtractor): @@ -26,8 +26,8 @@ class ThreadsIE(InfoExtractor): 'upload_date': '20240502', 'like_count': int, 'channel_is_verified': bool, - 'thumbnail': r're:^https?://.*\.jpg' - } + 'thumbnail': r're:^https?://.*\.jpg', + }, }, { 'url': 'https://www.threads.net/@felipebecari/post/C6cM_yNPHCF', 'info_dict': { @@ -44,8 +44,8 @@ class ThreadsIE(InfoExtractor): 'upload_date': '20240501', 'like_count': int, 'channel_is_verified': bool, - 'thumbnail': r're:^https?://.*\.jpg' - } + 'thumbnail': r're:^https?://.*\.jpg', + }, }] def _real_extract(self, url): @@ -55,7 +55,7 @@ def _real_extract(self, url): # Try getting videos from json json_data = self._search_regex( - r']+>(.*"code":"%s".*)' % video_id, + rf']+>(.*"code":"{video_id}".*)', webpage, 'main json', fatal=True) result = self._search_json( @@ -86,7 +86,7 @@ def _real_extract(self, url): if videos: for video in videos: formats.append({ - 'format_id': '%s-%s' % (media.get('pk'), video['type']), # id-type + 'format_id': '{}-{}'.format(media.get('pk'), video['type']), # id-type 'url': video['url'], 'width': media.get('original_width'), 'height': media.get('original_height'), @@ -105,7 +105,7 @@ def _real_extract(self, url): # Metadata metadata.setdefault('uploader_id', traverse_obj(post, ('user', 'username'))) metadata.setdefault('channel_is_verified', traverse_obj(post, ('user', 'is_verified'))) - metadata.setdefault('uploader_url', 'https://www.threads.net/@%s' % traverse_obj(post, ('user', 'username'))) + metadata.setdefault('uploader_url', 'https://www.threads.net/@{}'.format(traverse_obj(post, ('user', 'username')))) metadata.setdefault('timestamp', post.get('taken_at')) metadata.setdefault('like_count', post.get('like_count')) @@ -122,7 +122,7 @@ def _real_extract(self, url): return { **metadata, 'formats': formats, - 'thumbnails': thumbnails + 'thumbnails': thumbnails, } @@ -145,9 +145,9 @@ class ThreadsIOSIE(InfoExtractor): 'upload_date': '20240502', 'like_count': int, 'channel_is_verified': bool, - 'thumbnail': r're:^https?://.*\.jpg' + 'thumbnail': r're:^https?://.*\.jpg', }, - 'add_ie': ['Threads'] + 'add_ie': ['Threads'], }] def _real_extract(self, url): From fd6ad217b107ff86d0080b1f24cea68fafdf11a6 Mon Sep 17 00:00:00 2001 From: "Renan D." Date: Tue, 13 Aug 2024 02:00:46 -0300 Subject: [PATCH 5/5] [ie/threads] Avoid error when no uploader found --- yt_dlp/extractor/threads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/threads.py b/yt_dlp/extractor/threads.py index fba0bd4fc4..140bc2ca59 100644 --- a/yt_dlp/extractor/threads.py +++ b/yt_dlp/extractor/threads.py @@ -116,7 +116,7 @@ def _real_extract(self, url): metadata['channel'] = metadata.get('uploader_id') metadata['channel_url'] = metadata.get('uploader_url') - metadata['uploader'] = self._search_regex(r'(.*?) \(', self._og_search_title(webpage), 'uploader') + metadata['uploader'] = self._search_regex(r'(.*?) \(', self._og_search_title(webpage), 'uploader', metadata.get('uploader_id')) metadata['upload_date'] = strftime_or_none(metadata.get('timestamp')) return {