Merge remote-tracking branch 'origin/master'

Conflicts:
	youtube_dl/YoutubeDL.py
This commit is contained in:
Philipp Hagemeister 2015-03-09 03:01:28 +01:00
commit dcca581967
13 changed files with 246 additions and 34 deletions

View File

@ -38,6 +38,7 @@
parse_iso8601, parse_iso8601,
read_batch_urls, read_batch_urls,
sanitize_filename, sanitize_filename,
sanitize_path,
shell_quote, shell_quote,
smuggle_url, smuggle_url,
str_to_int, str_to_int,
@ -131,6 +132,37 @@ def test_sanitize_ids(self):
self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI') self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI')
def test_sanitize_path(self):
if sys.platform != 'win32':
return
self.assertEqual(sanitize_path('abc'), 'abc')
self.assertEqual(sanitize_path('abc/def'), 'abc\\def')
self.assertEqual(sanitize_path('abc\\def'), 'abc\\def')
self.assertEqual(sanitize_path('abc|def'), 'abc#def')
self.assertEqual(sanitize_path('<>:"|?*'), '#######')
self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def')
self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def')
self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc')
self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc')
self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc')
self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f')
self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
self.assertEqual(
sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'),
'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s')
self.assertEqual(
sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'),
'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part')
self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#')
self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def')
self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#')
def test_ordered_set(self): def test_ordered_set(self):
self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7]) self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
self.assertEqual(orderedSet([]), []) self.assertEqual(orderedSet([]), [])

View File

@ -61,6 +61,7 @@
render_table, render_table,
SameFileError, SameFileError,
sanitize_filename, sanitize_filename,
sanitize_path,
std_headers, std_headers,
subtitles_filename, subtitles_filename,
takewhile_inclusive, takewhile_inclusive,
@ -562,7 +563,7 @@ def prepare_filename(self, info_dict):
if v is not None) if v is not None)
template_dict = collections.defaultdict(lambda: 'NA', template_dict) template_dict = collections.defaultdict(lambda: 'NA', template_dict)
outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
tmpl = compat_expanduser(outtmpl) tmpl = compat_expanduser(outtmpl)
filename = tmpl % template_dict filename = tmpl % template_dict
# Temporary fix for #4787 # Temporary fix for #4787
@ -1261,7 +1262,7 @@ def process_info(self, info_dict):
return return
try: try:
dn = os.path.dirname(encodeFilename(filename)) dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
if dn and not os.path.exists(dn): if dn and not os.path.exists(dn):
os.makedirs(dn) os.makedirs(dn)
except (OSError, IOError) as err: except (OSError, IOError) as err:

View File

@ -281,7 +281,7 @@ def _parse_bootstrap_node(self, node, base_url):
boot_info = self._get_bootstrap_from_url(bootstrap_url) boot_info = self._get_bootstrap_from_url(bootstrap_url)
else: else:
bootstrap_url = None bootstrap_url = None
bootstrap = base64.b64decode(node.text) bootstrap = base64.b64decode(node.text.encode('ascii'))
boot_info = read_bootstrap_info(bootstrap) boot_info = read_bootstrap_info(bootstrap)
return (boot_info, bootstrap_url) return (boot_info, bootstrap_url)
@ -308,7 +308,7 @@ def real_download(self, filename, info_dict):
live = boot_info['live'] live = boot_info['live']
metadata_node = media.find(_add_ns('metadata')) metadata_node = media.find(_add_ns('metadata'))
if metadata_node is not None: if metadata_node is not None:
metadata = base64.b64decode(metadata_node.text) metadata = base64.b64decode(metadata_node.text.encode('ascii'))
else: else:
metadata = None metadata = None

View File

@ -175,6 +175,7 @@
from .gamespot import GameSpotIE from .gamespot import GameSpotIE
from .gamestar import GameStarIE from .gamestar import GameStarIE
from .gametrailers import GametrailersIE from .gametrailers import GametrailersIE
from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE from .gdcvault import GDCVaultIE
from .generic import GenericIE from .generic import GenericIE
from .giantbomb import GiantBombIE from .giantbomb import GiantBombIE
@ -363,6 +364,7 @@
from .phoenix import PhoenixIE from .phoenix import PhoenixIE
from .photobucket import PhotobucketIE from .photobucket import PhotobucketIE
from .planetaplay import PlanetaPlayIE from .planetaplay import PlanetaPlayIE
from .pladform import PladformIE
from .played import PlayedIE from .played import PlayedIE
from .playfm import PlayFMIE from .playfm import PlayFMIE
from .playvid import PlayvidIE from .playvid import PlayvidIE

View File

@ -2,13 +2,12 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
xpath_text,
float_or_none, float_or_none,
xpath_text,
) )
@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):
'title': 'American Dad - Putting Francine Out of Business', 'title': 'American Dad - Putting Francine Out of Business',
'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
}, },
}, {
'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
'playlist': [
{
'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
'ext': 'flv',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
},
}
],
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
},
}] }]
@staticmethod @staticmethod
@ -80,6 +97,7 @@ def find_collection_containing_video(collections, slug):
for video in collection.get('videos'): for video in collection.get('videos'):
if video.get('slug') == slug: if video.get('slug') == slug:
return collection, video return collection, video
return None, None
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -90,28 +108,30 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, episode_path) webpage = self._download_webpage(url, episode_path)
# Extract the value of `bootstrappedData` from the Javascript in the page. # Extract the value of `bootstrappedData` from the Javascript in the page.
bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path) bootstrapped_data = self._parse_json(self._search_regex(
r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
try:
bootstrappedData = json.loads(bootstrappedDataJS)
except ValueError as ve:
errmsg = '%s: Failed to parse JSON ' % episode_path
raise ExtractorError(errmsg, cause=ve)
# Downloading videos from a /videos/playlist/ URL needs to be handled differently. # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
# NOTE: We are only downloading one video (the current one) not the playlist # NOTE: We are only downloading one video (the current one) not the playlist
if is_playlist: if is_playlist:
collections = bootstrappedData['playlists']['collections'] collections = bootstrapped_data['playlists']['collections']
collection = self.find_collection_by_linkURL(collections, show_path) collection = self.find_collection_by_linkURL(collections, show_path)
video_info = self.find_video_info(collection, episode_path) video_info = self.find_video_info(collection, episode_path)
show_title = video_info['showTitle'] show_title = video_info['showTitle']
segment_ids = [video_info['videoPlaybackID']] segment_ids = [video_info['videoPlaybackID']]
else: else:
collections = bootstrappedData['show']['collections'] collections = bootstrapped_data['show']['collections']
collection, video_info = self.find_collection_containing_video(collections, episode_path) collection, video_info = self.find_collection_containing_video(collections, episode_path)
show = bootstrappedData['show'] # Video wasn't found in the collections, let's try `slugged_video`.
if video_info is None:
if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
video_info = bootstrapped_data['slugged_video']
else:
raise ExtractorError('Unable to find video info')
show = bootstrapped_data['show']
show_title = show['title'] show_title = show['title']
segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]

View File

@ -41,7 +41,7 @@ def _real_extract(self, url):
'tbr': media['bitRate'], 'tbr': media['bitRate'],
'width': media['width'], 'width': media['width'],
'height': media['height'], 'height': media['height'],
} for media in info['media']] } for media in info['media'] if media.get('mediaPurpose') == 'play']
if not formats: if not formats:
formats.append({ formats.append({

View File

@ -0,0 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class GazetaIE(InfoExtractor):
_VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
_TESTS = [{
'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
'info_dict': {
'id': '205566',
'ext': 'mp4',
'title': '«7080 процентов гражданских в Донецке на грани голода»',
'description': 'md5:38617526050bd17b234728e7f9620a71',
'thumbnail': 're:^https?://.*\.jpg',
},
}, {
'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
embed_url = '%s?p=embed' % mobj.group('url')
embed_page = self._download_webpage(
embed_url, display_id, 'Downloading embed page')
video_id = self._search_regex(
r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id')
return self.url_result(
'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')

View File

@ -596,6 +596,19 @@ class GenericIE(InfoExtractor):
'view_count': int, 'view_count': int,
}, },
}, },
# Pladform embed
{
'url': 'http://muz-tv.ru/kinozal/view/7400/',
'info_dict': {
'id': '100183293',
'ext': 'mp4',
'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 694,
'age_limit': 0,
},
},
# RSS feed with enclosure # RSS feed with enclosure
{ {
'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@ -1193,6 +1206,12 @@ def _playlist_from_matches(matches, getter=None, ie=None):
if mobj is not None: if mobj is not None:
return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
# Look for Pladform embeds
mobj = re.search(
r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'Pladform')
def check_video(vurl): def check_video(vurl):
if YoutubeIE.suitable(vurl): if YoutubeIE.suitable(vurl):
return True return True

View File

@ -0,0 +1,90 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
xpath_text,
qualities,
)
class PladformIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
(?:
out\.pladform\.ru/player|
static\.pladform\.ru/player\.swf
)
\?.*\bvideoid=|
video\.pladform\.ru/catalog/video/videoid/
)
(?P<id>\d+)
'''
_TESTS = [{
# http://muz-tv.ru/kinozal/view/7400/
'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293',
'md5': '61f37b575dd27f1bb2e1854777fe31f4',
'info_dict': {
'id': '100183293',
'ext': 'mp4',
'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
'thumbnail': 're:^https?://.*\.jpg$',
'duration': 694,
'age_limit': 0,
},
}, {
'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
'only_matching': True,
}, {
'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_xml(
'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id,
video_id)
if video.tag == 'error':
raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, video.text),
expected=True)
quality = qualities(('ld', 'sd', 'hd'))
formats = [{
'url': src.text,
'format_id': src.get('quality'),
'quality': quality(src.get('quality')),
} for src in video.findall('./src')]
self._sort_formats(formats)
webpage = self._download_webpage(
'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
video_id)
title = self._og_search_title(webpage, fatal=False) or xpath_text(
video, './/title', 'title', fatal=True)
description = self._search_regex(
r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
video, './/cover', 'cover')
duration = int_or_none(xpath_text(video, './/time', 'duration'))
age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'age_limit': age_limit,
'formats': formats,
}

View File

@ -53,10 +53,10 @@ def _real_extract(self, url):
embed = self._download_webpage( embed = self._download_webpage(
embed_url, video_id, 'Downloading embed page') embed_url, video_id, 'Downloading embed page')
encoded_data = self._search_regex( player_data = self._parse_json(self._search_regex(
r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data') r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)
data = self._parse_json( data = self._parse_json(
base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id) base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)
formats = [] formats = []
get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])

View File

@ -358,13 +358,12 @@ def _real_extract(self, url):
'p': random.randint(1000000, 10000000), 'p': random.randint(1000000, 10000000),
'player': 'twitchweb', 'player': 'twitchweb',
'segment_preference': '4', 'segment_preference': '4',
'sig': access_token['sig'], 'sig': access_token['sig'].encode('utf-8'),
'token': access_token['token'], 'token': access_token['token'].encode('utf-8'),
} }
formats = self._extract_m3u8_formats( formats = self._extract_m3u8_formats(
'%s/api/channel/hls/%s.m3u8?%s' '%s/api/channel/hls/%s.m3u8?%s'
% (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')), % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
channel_id, 'mp4') channel_id, 'mp4')
self._prefer_source(formats) self._prefer_source(formats)

View File

@ -41,13 +41,10 @@ def _real_extract(self, url):
duration = float_or_none(self._html_search_regex( duration = float_or_none(self._html_search_regex(
r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
view_count = str_to_int(self._html_search_regex( view_count = str_to_int(self._html_search_regex(
r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex( like_count = str_to_int(self._html_search_regex(
r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
webpage, 'like count', fatal=False)) webpage, 'like count', fatal=False))
comment_count = str_to_int(self._html_search_regex(
r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">',
webpage, 'comment count', fatal=False))
return { return {
'id': video_id, 'id': video_id,
@ -61,5 +58,4 @@ def _real_extract(self, url):
'duration': duration, 'duration': duration,
'view_count': view_count, 'view_count': view_count,
'like_count': like_count, 'like_count': like_count,
'comment_count': comment_count,
} }

View File

@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):
raise raise
# In case of error, try to remove win32 forbidden chars # In case of error, try to remove win32 forbidden chars
alt_filename = os.path.join( alt_filename = sanitize_path(filename)
re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
for path_part in os.path.split(filename)
)
if alt_filename == filename: if alt_filename == filename:
raise raise
else: else:
# An exception here should be caught in the caller # An exception here should be caught in the caller
stream = open(encodeFilename(filename), open_mode) stream = open(encodeFilename(alt_filename), open_mode)
return (stream, alt_filename) return (stream, alt_filename)
@ -311,6 +308,24 @@ def replace_insane(char):
return result return result
def sanitize_path(s):
"""Sanitizes and normalizes path on Windows"""
if sys.platform != 'win32':
return s
drive, _ = os.path.splitdrive(s)
unc, _ = os.path.splitunc(s)
unc_or_drive = unc or drive
norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
if unc_or_drive:
norm_path.pop(0)
sanitized_path = [
re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
for path_part in norm_path]
if unc_or_drive:
sanitized_path.insert(0, unc_or_drive + os.path.sep)
return os.path.join(*sanitized_path)
def orderedSet(iterable): def orderedSet(iterable):
""" Remove all duplicates from the input iterable """ """ Remove all duplicates from the input iterable """
res = [] res = []