diff --git a/AUTHORS b/AUTHORS index bb4d8b4d1a..29ce9e3e47 100644 --- a/AUTHORS +++ b/AUTHORS @@ -96,3 +96,4 @@ Mathias Rav Petr Kutalek Will Glynn Max Reimann +Cédric Luthi diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ab0f76862d..c15786ad7f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -25,6 +25,7 @@ ArteTVDDCIE, ArteTVEmbedIE, ) +from .atresplayer import AtresPlayerIE from .audiomack import AudiomackIE from .auengine import AUEngineIE from .azubu import AzubuIE @@ -169,8 +170,10 @@ from .groupon import GrouponIE from .hark import HarkIE from .heise import HeiseIE +from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE from .hostingbulk import HostingBulkIE from .hotnewhiphop import HotNewHipHopIE @@ -515,6 +518,7 @@ from .xnxx import XNXXIE from .xvideos import XVideosIE from .xtube import XTubeUserIE, XTubeIE +from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, YahooSearchIE, diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py new file mode 100644 index 0000000000..72e83bfc2c --- /dev/null +++ b/youtube_dl/extractor/atresplayer.py @@ -0,0 +1,114 @@ +from __future__ import unicode_literals + +import time +import hmac + +from .common import InfoExtractor +from ..utils import ( + compat_str, + compat_urllib_request, + int_or_none, + float_or_none, + xpath_text, + ExtractorError, +) + + +class AtresPlayerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atresplayer\.com/television/[^/]+/[^/]+/[^/]+/(?P.+?)_\d+\.html' + _TESTS = [ + { + 'url': 'http://www.atresplayer.com/television/programas/el-club-de-la-comedia/temporada-4/capitulo-10-especial-solidario-nochebuena_2014122100174.html', + 'md5': 'efd56753cda1bb64df52a3074f62e38a', + 'info_dict': { + 'id': 'capitulo-10-especial-solidario-nochebuena', + 'ext': 'mp4', + 'title': 'Especial Solidario de Nochebuena', + 'description': 'md5:e2d52ff12214fa937107d21064075bf1', + 'duration': 5527.6, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, + { + 'url': 'http://www.atresplayer.com/television/series/el-secreto-de-puente-viejo/el-chico-de-los-tres-lunares/capitulo-977-29-12-14_2014122400174.html', + 'only_matching': True, + }, + ] + + _USER_AGENT = 'Dalvik/1.6.0 (Linux; U; Android 4.3; GT-I9300 Build/JSS15J' + _MAGIC = 'QWtMLXs414Yo+c#_+Q#K@NN)' + _TIMESTAMP_SHIFT = 30000 + + _TIME_API_URL = 'http://servicios.atresplayer.com/api/admin/time.json' + _URL_VIDEO_TEMPLATE = 'https://servicios.atresplayer.com/api/urlVideo/{1}/{0}/{1}|{2}|{3}.json' + _PLAYER_URL_TEMPLATE = 'https://servicios.atresplayer.com/episode/getplayer.json?episodePk=%s' + _EPISODE_URL_TEMPLATE = 'http://www.atresplayer.com/episodexml/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + episode_id = self._search_regex( + r'episode="([^"]+)"', webpage, 'episode id') + + timestamp = int_or_none(self._download_webpage( + self._TIME_API_URL, + video_id, 'Downloading timestamp', fatal=False), 1000, time.time()) + timestamp_shifted = compat_str(timestamp + self._TIMESTAMP_SHIFT) + token = hmac.new( + self._MAGIC.encode('ascii'), + (episode_id + timestamp_shifted).encode('utf-8') + ).hexdigest() + + formats = [] + for fmt in ['windows', 'android_tablet']: + request = compat_urllib_request.Request( + self._URL_VIDEO_TEMPLATE.format(fmt, episode_id, timestamp_shifted, token)) + request.add_header('Youtubedl-user-agent', self._USER_AGENT) + + fmt_json = self._download_json( + request, video_id, 'Downloading %s video JSON' % fmt) + + result = fmt_json.get('resultDes') + if result.lower() != 'ok': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, result), expected=True) + + for _, video_url in fmt_json['resultObject'].items(): + if video_url.endswith('/Manifest'): + formats.extend(self._extract_f4m_formats(video_url[:-9] + '/manifest.f4m', video_id)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'android', + 'preference': 1, + }) + self._sort_formats(formats) + + player = self._download_json( + self._PLAYER_URL_TEMPLATE % episode_id, + episode_id) + + path_data = player.get('pathData') + + episode = self._download_xml( + self._EPISODE_URL_TEMPLATE % path_data, + video_id, 'Downloading episode XML') + + duration = float_or_none(xpath_text( + episode, './media/asset/info/technical/contentDuration', 'duration')) + + art = episode.find('./media/asset/info/art') + title = xpath_text(art, './name', 'title') + description = xpath_text(art, './description', 'description') + thumbnail = xpath_text(episode, './media/asset/files/background', 'thumbnail') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 1bff005d64..93e8d0de35 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P.+?/(?P[^/]+?)(?:\.cnn(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', @@ -35,6 +35,16 @@ class CNNIE(InfoExtractor): "description": "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", "upload_date": "20130821", } + }, { + 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', + 'md5': 'f14d02ebd264df951feb2400e2c25a1b', + 'info_dict': { + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'ext': 'mp4', + 'title': 'Nashville Ep. 1: Hand crafted skateboards', + 'description': 'md5:e7223a503315c9f150acac52e76de086', + 'upload_date': '20141222', + } }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index c6b813f58e..934da765ee 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -38,7 +38,7 @@ def _real_extract(self, url): canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) full_id = self._search_regex( - r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]', + r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', webpage, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( diff --git a/youtube_dl/extractor/hellporno.py b/youtube_dl/extractor/hellporno.py new file mode 100644 index 0000000000..7a1c75b655 --- /dev/null +++ b/youtube_dl/extractor/hellporno.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + js_to_json, + remove_end, +) + + +class HellPornoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hellporno\.com/videos/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/', + 'md5': '1fee339c610d2049699ef2aa699439f1', + 'info_dict': { + 'id': '149116', + 'display_id': 'dixie-is-posing-with-naked-ass-very-erotic', + 'ext': 'mp4', + 'title': 'Dixie is posing with naked ass very erotic', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = remove_end(self._html_search_regex( + r'<title>([^<]+)', webpage, 'title'), ' - Hell Porno') + + flashvars = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), + display_id, transform_source=js_to_json) + + video_id = flashvars.get('video_id') + thumbnail = flashvars.get('preview_url') + ext = flashvars.get('postfix', '.mp4')[1:] + + formats = [] + for video_url_key in ['video_url', 'video_alt_url']: + video_url = flashvars.get(video_url_key) + if not video_url: + continue + video_text = flashvars.get('%s_text' % video_url_key) + fmt = { + 'url': video_url, + 'ext': ext, + 'format_id': video_text, + } + m = re.search(r'^(?P\d+)[pP]', video_text) + if m: + fmt['height'] = int(m.group('height')) + formats.append(fmt) + self._sort_formats(formats) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'categories': categories, + 'age_limit': 18, + 'formats': formats, + } diff --git a/youtube_dl/extractor/hitbox.py b/youtube_dl/extractor/hitbox.py new file mode 100644 index 0000000000..84bd7c0804 --- /dev/null +++ b/youtube_dl/extractor/hitbox.py @@ -0,0 +1,166 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + parse_iso8601, + float_or_none, + int_or_none, + compat_str, +) + + +class HitboxIE(InfoExtractor): + IE_NAME = 'hitbox' + _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.hitbox.tv/video/203213', + 'info_dict': { + 'id': '203213', + 'title': 'hitbox @ gamescom, Sub Button Hype extended, Giveaway - hitbox News Update with Oxy', + 'alt_title': 'hitboxlive - Aug 9th #6', + 'description': '', + 'ext': 'mp4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 215.1666, + 'resolution': 'HD 720p', + 'uploader': 'hitboxlive', + 'view_count': int, + 'timestamp': 1407576133, + 'upload_date': '20140809', + 'categories': ['Live Show'], + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _extract_metadata(self, url, video_id): + thumb_base = 'https://edge.sf.hitbox.tv' + metadata = self._download_json( + '%s/%s' % (url, video_id), video_id) + + date = 'media_live_since' + media_type = 'livestream' + if metadata.get('media_type') == 'video': + media_type = 'video' + date = 'media_date_added' + + video_meta = metadata.get(media_type, [])[0] + title = video_meta.get('media_status') + alt_title = video_meta.get('media_title') + description = clean_html( + video_meta.get('media_description') or + video_meta.get('media_description_md')) + duration = float_or_none(video_meta.get('media_duration')) + uploader = video_meta.get('media_user_name') + views = int_or_none(video_meta.get('media_views')) + timestamp = parse_iso8601(video_meta.get(date), ' ') + categories = [video_meta.get('category_name')] + thumbs = [ + {'url': thumb_base + video_meta.get('media_thumbnail'), + 'width': 320, + 'height': 180}, + {'url': thumb_base + video_meta.get('media_thumbnail_large'), + 'width': 768, + 'height': 432}, + ] + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'ext': 'mp4', + 'thumbnails': thumbs, + 'duration': duration, + 'uploader': uploader, + 'view_count': views, + 'timestamp': timestamp, + 'categories': categories, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._extract_metadata( + 'https://www.hitbox.tv/api/media/video', + video_id) + + player_config = self._download_json( + 'https://www.hitbox.tv/api/player/config/video/%s' % video_id, + video_id) + + clip = player_config.get('clip') + video_url = clip.get('url') + res = clip.get('bitrates', [])[0].get('label') + + metadata['resolution'] = res + metadata['url'] = video_url + metadata['protocol'] = 'm3u8' + + return metadata + + +class HitboxLiveIE(HitboxIE): + IE_NAME = 'hitbox:live' + _VALID_URL = r'https?://(?:www\.)?hitbox\.tv/(?!video)(?P.+)' + _TEST = { + 'url': 'http://www.hitbox.tv/dimak', + 'info_dict': { + 'id': 'dimak', + 'ext': 'mp4', + 'description': 'md5:c9f80fa4410bc588d7faa40003fc7d0e', + 'timestamp': int, + 'upload_date': compat_str, + 'title': compat_str, + 'uploader': 'Dimak', + }, + 'params': { + # live + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + metadata = self._extract_metadata( + 'https://www.hitbox.tv/api/media/live', + video_id) + + player_config = self._download_json( + 'https://www.hitbox.tv/api/player/config/live/%s' % video_id, + video_id) + + formats = [] + cdns = player_config.get('cdns') + servers = [] + for cdn in cdns: + base_url = cdn.get('netConnectionUrl') + host = re.search('.+\.([^\.]+\.[^\./]+)/.+', base_url).group(1) + if base_url not in servers: + servers.append(base_url) + for stream in cdn.get('bitrates'): + label = stream.get('label') + if label != 'Auto': + formats.append({ + 'url': '%s/%s' % (base_url, stream.get('url')), + 'ext': 'mp4', + 'vbr': stream.get('bitrate'), + 'resolution': label, + 'rtmp_live': True, + 'format_note': host, + 'page_url': url, + 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', + }) + + self._sort_formats(formats) + metadata['formats'] = formats + metadata['is_live'] = True + metadata['title'] = self._live_title(metadata.get('title')) + return metadata diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py new file mode 100644 index 0000000000..5c8f17eb2f --- /dev/null +++ b/youtube_dl/extractor/xxxymovies.py @@ -0,0 +1,81 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) + + +class XXXYMoviesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?xxxymovies\.com/videos/(?P\d+)/(?P[^/]+)' + _TEST = { + 'url': 'http://xxxymovies.com/videos/138669/ecstatic-orgasm-sofcore/', + 'md5': '810b1bdbbffff89dd13bdb369fe7be4b', + 'info_dict': { + 'id': '138669', + 'display_id': 'ecstatic-orgasm-sofcore', + 'ext': 'mp4', + 'title': 'Ecstatic Orgasm Sofcore', + 'duration': 931, + 'categories': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._search_regex( + r"video_url\s*:\s*'([^']+)'", webpage, 'video URL') + + title = self._html_search_regex( + [r'
\s*

([^<]+)

', + r'(.*?)\s*-\s*XXXYMovies\.com'], + webpage, 'title') + + thumbnail = self._search_regex( + r"preview_url\s*:\s*'([^']+)'", + webpage, 'thumbnail', fatal=False) + + categories = self._html_search_meta( + 'keywords', webpage, 'categories', default='').split(',') + + duration = parse_duration(self._search_regex( + r'Duration:\s*(\d+:\d+)', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._html_search_regex( + r'
\s*(\d+)', + webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'>\s*Likes? \((\d+)\)', + webpage, 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'>\s*Dislike \((\d+)\)', + webpage, 'dislike count', fatal=False)) + + age_limit = self._rta_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'categories': categories, + 'duration': duration, + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 550e18733e..3da83e3a84 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -418,6 +418,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'upload_date': '20140605', }, }, + # Age-gate video with encrypted signature + { + 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'info_dict': { + 'id': '6kLq3WMV1nU', + 'ext': 'mp4', + 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', + 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'uploader': 'LloydVEVO', + 'uploader_id': 'LloydVEVO', + 'upload_date': '20110629', + }, + }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) { 'url': '__2ABJjxzNo', @@ -766,11 +779,13 @@ def _real_extract(self, url): age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube + url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') data = compat_urllib_parse.urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage( @@ -968,11 +983,10 @@ def _map_to_format_list(urlmap): elif 's' in url_data: encrypted_sig = url_data['s'][0] - if not age_gate: - jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + embed_webpage if age_gate else video_webpage, 'JS player URL') + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',