From 6d923aab35f186ca5c4c7fce6b7c43145aa192db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Jun 2017 23:21:30 +0700 Subject: [PATCH] [bandcamp:weekly] Improve and extract more metadata (closes #12758) --- youtube_dl/extractor/bandcamp.py | 78 ++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 54fa8634c1..9ddb9af172 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -14,6 +14,7 @@ ExtractorError, float_or_none, int_or_none, + KNOWN_EXTENSIONS, parse_filesize, unescapeHTML, update_url_query, @@ -22,7 +23,7 @@ class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' + _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -156,7 +157,7 @@ def _real_extract(self, url): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -225,7 +226,9 @@ class BandcampAlbumIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if BandcampWeeklyIE.suitable(url) else super(BandcampAlbumIE, cls).suitable(url) + return (False + if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) + else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -258,16 +261,22 @@ def _real_extract(self, url): class BandcampWeeklyIE(InfoExtractor): - IE_NAME = 'Bandcamp:bandcamp_weekly' - _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*&)?show=(?P<id>\d+)(?:$|[&#])' + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ 'url': 'https://bandcamp.com/?show=224', 'md5': 'b00df799c733cf7e0c567ed187dea0fd', 'info_dict': { 'id': '224', 'ext': 'opus', - 'title': 'BC Weekly April 4th 2017: Magic Moments', - 'description': 'Stones Throw\'s Vex Ruffin, plus up and coming singer Salami Rose Joe Louis, in conversation about their fantastic DIY albums.', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', + 'episode_number': 208, + 'episode_id': '224', } }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', @@ -288,32 +297,53 @@ def _real_extract(self, url): # This is desired because any invalid show id redirects to `bandcamp.com` # which happens to expose the latest Bandcamp Weekly episode. 
- video_id = compat_str(show['show_id']) + show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) - def to_format_dictionaries(audio_stream): - dictionaries = [{'format_id': kvp[0], 'url': kvp[1]} for kvp in audio_stream.items()] - known_extensions = ['mp3', 'opus'] - - for dictionary in dictionaries: - for ext in known_extensions: - if ext in dictionary['format_id']: - dictionary['ext'] = ext - break - - return dictionaries - - formats = to_format_dictionaries(show['audio_stream']) + formats = [] + for format_id, format_url in show['audio_stream'].items(): + if not isinstance(format_url, compat_str): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) self._sort_formats(formats) + title = show.get('audio_title') or 'Bandcamp Weekly' + subtitle = show.get('subtitle') + if subtitle: + title += ' - %s' % subtitle + + episode_number = None + seq = blob.get('bcw_seq') + + if seq and isinstance(seq, list): + try: + episode_number = next( + int_or_none(e.get('episode_number')) + for e in seq + if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) + except StopIteration: + pass + return { 'id': video_id, - 'title': show['audio_title'] + ': ' + show['subtitle'], - 'description': show.get('desc'), + 'title': title, + 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), - 'webpage_url': 'https://bandcamp.com/?show=' + video_id, 'is_live': False, 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), + 'episode_number': episode_number, 'episode_id': compat_str(video_id), 'formats': formats }