From 11984c7467184100ca4a61ae939a8c260480f42c Mon Sep 17 00:00:00 2001 From: "Devin J. Pohly" Date: Thu, 12 Mar 2015 15:43:13 -0400 Subject: [PATCH 1/8] [BeatportPro] Add new extractor This extractor is for Beatport's 2-minute, low-quality track previews only. To obtain an entire track, you obviously have to purchase and download it normally through the Beatport store! Possible future improvements: - Playlists for albums or other track-list pages - User login to play from My Beatport, Hold Bin, or Cart --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/beatportpro.py | 101 ++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/beatportpro.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7f9523c2b4..ac765fdb80 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -37,6 +37,7 @@ from .bbccouk import BBCCoUkIE from .beeg import BeegIE from .behindkink import BehindKinkIE +from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py new file mode 100644 index 0000000000..21048b7326 --- /dev/null +++ b/youtube_dl/extractor/beatportpro.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +import re +import json + + +class BeatportProIE(InfoExtractor): + _VALID_URL = r'https?://pro\.beatport\.com/track/.*/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', + 'md5': 'b3c34d8639a2f6a7f734382358478887', + 'info_dict': { + 'id': 5379371, + 'display-id': 'synesthesia-original-mix', + 'ext': 'mp4', + 'title': 'Froxic - Synesthesia (Original Mix)', + }, + }, { + 'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896', + 'md5': 'e44c3025dfa38c6577fbaeb43da43514', 
'info_dict': { + 'id': 3756896, + 'display-id': 'love-and-war-original-mix', + 'ext': 'mp3', + 'title': 'Wolfgang Gartner - Love & War (Original Mix)', + }, + }, { + 'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', + 'md5': 'a1fd8e8046de3950fd039304c186c05f', + 'info_dict': { + 'id': 4991738, + 'display-id': 'birds-original-mix', + 'ext': 'mp4', + 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", + } + }] + + def _real_extract(self, url): + track_id = self._match_id(url) + webpage = self._download_webpage(url, track_id) + + # Extract "Playables" JSON information from the page + playables = self._search_regex(r'window\.Playables = ({.*?});', webpage, + 'playables info', flags=re.DOTALL) + playables = json.loads(playables) + + # Find first track with matching ID (always the first one listed?) + track = next(filter(lambda t: t['id'] == int(track_id), playables['tracks'])) + + # Construct title from artist(s), track name, and mix name + title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] + if track['mix']: + title += ' (' + track['mix'] + ')' + + # Get format information + formats = [] + for ext, info in track['preview'].items(): + if info['url'] is None: + continue + fmt = { + 'url': info['url'], + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } + if ext == 'mp3': + fmt['preference'] = 0 + fmt['acodec'] = 'mp3' + fmt['abr'] = 96 + fmt['asr'] = 44100 + elif ext == 'mp4': + fmt['preference'] = 1 + fmt['acodec'] = 'aac' + fmt['abr'] = 96 + fmt['asr'] = 44100 + formats += [fmt] + formats.sort(key=lambda f: f['preference']) + + # Get album art as thumbnails + imgs = [] + for name, info in track['images'].items(): + if name == 'dynamic' or info['url'] is None: + continue + img = { + 'id': name, + 'url': info['url'], + 'height': info['height'], + 'width': info['width'], + } + imgs += [img] + + return { + 'id': track['id'], + 'display-id': track['slug'], + 'title': title, + 'formats': formats, + 
'thumbnails': imgs, + } From 65c5e044c7ab6d3140d30c98abda07785f2974c6 Mon Sep 17 00:00:00 2001 From: "Devin J. Pohly" Date: Thu, 12 Mar 2015 16:42:55 -0400 Subject: [PATCH 2/8] fix python2 --- youtube_dl/extractor/beatportpro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index 21048b7326..c3c70fb33a 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -48,7 +48,7 @@ def _real_extract(self, url): playables = json.loads(playables) # Find first track with matching ID (always the first one listed?) - track = next(filter(lambda t: t['id'] == int(track_id), playables['tracks'])) + track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) # Construct title from artist(s), track name, and mix name title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] From 1b53778175e43e2bf2cb71885a760d96727ee837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 21:51:49 +0600 Subject: [PATCH 3/8] [beatportpro] Use generic format sort --- youtube_dl/extractor/beatportpro.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index c3c70fb33a..bc201572ec 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -77,7 +77,7 @@ def _real_extract(self, url): fmt['abr'] = 96 fmt['asr'] = 44100 formats += [fmt] - formats.sort(key=lambda f: f['preference']) + self._sort_formats(formats) # Get album art as thumbnails imgs = [] From 517bcca29925548f9b9b121beec1391ef3ecedec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 22:01:15 +0600 Subject: [PATCH 4/8] [beatportpro] Simplify and improve --- youtube_dl/extractor/beatportpro.py | 34 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git 
a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index bc201572ec..69657cbde3 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor - import re import json +from .common import InfoExtractor +from ..utils import int_or_none + class BeatportProIE(InfoExtractor): - _VALID_URL = r'https?://pro\.beatport\.com/track/.*/(?P<id>[0-9]+)' + _VALID_URL = r'https?://pro\.beatport\.com/track/.+/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', 'md5': 'b3c34d8639a2f6a7f734382358478887', @@ -42,20 +43,17 @@ def _real_extract(self, url): track_id = self._match_id(url) webpage = self._download_webpage(url, track_id) - # Extract "Playables" JSON information from the page - playables = self._search_regex(r'window\.Playables = ({.*?});', webpage, - 'playables info', flags=re.DOTALL) + playables = self._search_regex( + r'window\.Playables\s*=\s*({.*?});', webpage, + 'playables info', flags=re.DOTALL) playables = json.loads(playables) - # Find first track with matching ID (always the first one listed?) 
track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) - # Construct title from artist(s), track name, and mix name title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] if track['mix']: title += ' (' + track['mix'] + ')' - # Get format information formats = [] for ext, info in track['preview'].items(): if info['url'] is None: @@ -76,26 +74,26 @@ def _real_extract(self, url): fmt['acodec'] = 'aac' fmt['abr'] = 96 fmt['asr'] = 44100 - formats += [fmt] + formats.append(fmt) self._sort_formats(formats) - # Get album art as thumbnails - imgs = [] + images = [] for name, info in track['images'].items(): - if name == 'dynamic' or info['url'] is None: + image_url = info.get('url') + if name == 'dynamic' or not image_url: continue img = { 'id': name, - 'url': info['url'], - 'height': info['height'], - 'width': info['width'], + 'url': image_url, + 'height': int_or_none(info.get('height')), + 'width': int_or_none(info.get('width')), } - imgs += [img] + images.append(img) return { 'id': track['id'], 'display-id': track['slug'], 'title': title, 'formats': formats, - 'thumbnails': imgs, + 'thumbnails': images, } From ba1d4c04883cafb55e40734776d9d8ba2ef85582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 22:03:58 +0600 Subject: [PATCH 5/8] [beatportpro] Improve display_id --- youtube_dl/extractor/beatportpro.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index 69657cbde3..5c072b131e 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -9,7 +9,7 @@ class BeatportProIE(InfoExtractor): - _VALID_URL = r'https?://pro\.beatport\.com/track/.+/(?P<id>[0-9]+)' + _VALID_URL = r'https?://pro\.beatport\.com/track/(?P<display_id>[^/]+)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', 'md5': 'b3c34d8639a2f6a7f734382358478887', @@ 
-40,8 +40,11 @@ class BeatportProIE(InfoExtractor): }] def _real_extract(self, url): - track_id = self._match_id(url) - webpage = self._download_webpage(url, track_id) + mobj = re.match(self._VALID_URL, url) + track_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) playables = self._search_regex( r'window\.Playables\s*=\s*({.*?});', webpage, @@ -92,7 +95,7 @@ def _real_extract(self, url): return { 'id': track['id'], - 'display-id': track['slug'], + 'display_id': track.get('slug') or display_id, 'title': title, 'formats': formats, 'thumbnails': images, From fcd877013e4a8f654c7778019055b57031492889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 22:11:56 +0600 Subject: [PATCH 6/8] [beatportpro] Simplify --- youtube_dl/extractor/beatportpro.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index 5c072b131e..12a7faa4f7 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -2,9 +2,9 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor +from ..compat import compat_str from ..utils import int_or_none @@ -46,10 +46,11 @@ def _real_extract(self, url): webpage = self._download_webpage(url, display_id) - playables = self._search_regex( - r'window\.Playables\s*=\s*({.*?});', webpage, - 'playables info', flags=re.DOTALL) - playables = json.loads(playables) + playables = self._parse_json( + self._search_regex( + r'window\.Playables\s*=\s*({.+?});', webpage, + 'playables info', flags=re.DOTALL), + track_id) track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) @@ -59,7 +60,7 @@ def _real_extract(self, url): formats = [] for ext, info in track['preview'].items(): - if info['url'] is None: + if not info['url']: continue fmt = { 'url': info['url'], @@ -85,16 +86,16 
@@ def _real_extract(self, url): image_url = info.get('url') if name == 'dynamic' or not image_url: continue - img = { + image = { 'id': name, 'url': image_url, 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width')), } - images.append(img) + images.append(image) return { - 'id': track['id'], + 'id': compat_str(track.get('id')) or track_id, 'display_id': track.get('slug') or display_id, 'title': title, 'formats': formats, From bba3fc7960c8cd6f0752c31c55ada804ba7e2ae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 22:13:50 +0600 Subject: [PATCH 7/8] [beatportpro] Fix tests --- youtube_dl/extractor/beatportpro.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/beatportpro.py b/youtube_dl/extractor/beatportpro.py index 12a7faa4f7..3c7775d3e2 100644 --- a/youtube_dl/extractor/beatportpro.py +++ b/youtube_dl/extractor/beatportpro.py @@ -14,8 +14,8 @@ class BeatportProIE(InfoExtractor): 'url': 'https://pro.beatport.com/track/synesthesia-original-mix/5379371', 'md5': 'b3c34d8639a2f6a7f734382358478887', 'info_dict': { - 'id': 5379371, - 'display-id': 'synesthesia-original-mix', + 'id': '5379371', + 'display_id': 'synesthesia-original-mix', 'ext': 'mp4', 'title': 'Froxic - Synesthesia (Original Mix)', }, @@ -23,8 +23,8 @@ class BeatportProIE(InfoExtractor): 'url': 'https://pro.beatport.com/track/love-and-war-original-mix/3756896', 'md5': 'e44c3025dfa38c6577fbaeb43da43514', 'info_dict': { - 'id': 3756896, - 'display-id': 'love-and-war-original-mix', + 'id': '3756896', + 'display_id': 'love-and-war-original-mix', 'ext': 'mp3', 'title': 'Wolfgang Gartner - Love & War (Original Mix)', }, @@ -32,8 +32,8 @@ class BeatportProIE(InfoExtractor): 'url': 'https://pro.beatport.com/track/birds-original-mix/4991738', 'md5': 'a1fd8e8046de3950fd039304c186c05f', 'info_dict': { - 'id': 4991738, - 'display-id': 'birds-original-mix', + 'id': '4991738', + 'display_id': 
'birds-original-mix', 'ext': 'mp4', 'title': "Tos, Middle Milk, Mumblin' Johnsson - Birds (Original Mix)", } From 28c6411e4959e342ec8bc016eb8ca5dbcbdc5d7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 13 Mar 2015 22:14:51 +0600 Subject: [PATCH 8/8] Credit @djpohly for BeatportPro (#5189) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 421df69a67..c10f03b98e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -114,3 +114,4 @@ Ryan Schmidt Leslie P. Polzer Duncan Keall Alexander Mamay +Devin J. Pohly