Merge remote-tracking branch 'origin/master'

Conflicts: youtube_dl/YoutubeDL.py
2025-02-18 18:30:58 +01:00 · 2015-03-09 03:01:28 +01:00 · 2015-03-09 03:01:28 +01:00 · dcca581967
commit dcca581967
parent d475b3384c dd7831fe94
13 changed files with 246 additions and 34 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -38,6 +38,7 @@
    parse_iso8601,
    read_batch_urls,
    sanitize_filename,
    sanitize_path,
    shell_quote,
    smuggle_url,
    str_to_int,
@ -131,6 +132,37 @@ def test_sanitize_ids(self):
        self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')
        self.assertEqual(sanitize_filename('N0Y__7-UOdI', is_id=True), 'N0Y__7-UOdI')
    def test_sanitize_path(self):
        if sys.platform != 'win32':
            return
        self.assertEqual(sanitize_path('abc'), 'abc')
        self.assertEqual(sanitize_path('abc/def'), 'abc\\def')
        self.assertEqual(sanitize_path('abc\\def'), 'abc\\def')
        self.assertEqual(sanitize_path('abc|def'), 'abc#def')
        self.assertEqual(sanitize_path('<>:"|?*'), '#######')
        self.assertEqual(sanitize_path('C:/abc/def'), 'C:\\abc\\def')
        self.assertEqual(sanitize_path('C?:/abc/def'), 'C##\\abc\\def')
        self.assertEqual(sanitize_path('\\\\?\\UNC\\ComputerName\\abc'), '\\\\?\\UNC\\ComputerName\\abc')
        self.assertEqual(sanitize_path('\\\\?\\UNC/ComputerName/abc'), '\\\\?\\UNC\\ComputerName\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:/abc'), '\\\\?\\C:\\abc')
        self.assertEqual(sanitize_path('\\\\?\\C:\\ab?c\\de:f'), '\\\\?\\C:\\ab#c\\de#f')
        self.assertEqual(sanitize_path('\\\\?\\C:\\abc'), '\\\\?\\C:\\abc')
        self.assertEqual(
            sanitize_path('youtube/%(uploader)s/%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s'),
            'youtube\\%(uploader)s\\%(autonumber)s-%(title)s-%(upload_date)s.%(ext)s')
        self.assertEqual(
            sanitize_path('youtube/TheWreckingYard ./00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part'),
            'youtube\\TheWreckingYard #\\00001-Not bad, Especially for Free! (1987 Yamaha 700)-20141116.mp4.part')
        self.assertEqual(sanitize_path('abc/def...'), 'abc\\def..#')
        self.assertEqual(sanitize_path('abc.../def'), 'abc..#\\def')
        self.assertEqual(sanitize_path('abc.../def...'), 'abc..#\\def..#')
    def test_ordered_set(self):
        self.assertEqual(orderedSet([1, 1, 2, 3, 4, 4, 5, 6, 7, 3, 5]), [1, 2, 3, 4, 5, 6, 7])
        self.assertEqual(orderedSet([]), [])
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -61,6 +61,7 @@
    render_table,
    SameFileError,
    sanitize_filename,
    sanitize_path,
    std_headers,
    subtitles_filename,
    takewhile_inclusive,
@ -562,7 +563,7 @@ def prepare_filename(self, info_dict):
                                 if v is not None)
            template_dict = collections.defaultdict(lambda: 'NA', template_dict)
-            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
            tmpl = compat_expanduser(outtmpl)
            filename = tmpl % template_dict
            # Temporary fix for #4787
@ -1261,7 +1262,7 @@ def process_info(self, info_dict):
            return
        try:
-            dn = os.path.dirname(encodeFilename(filename))
+            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@ -281,7 +281,7 @@ def _parse_bootstrap_node(self, node, base_url):
            boot_info = self._get_bootstrap_from_url(bootstrap_url)
        else:
            bootstrap_url = None
-            bootstrap = base64.b64decode(node.text)
+            bootstrap = base64.b64decode(node.text.encode('ascii'))
            boot_info = read_bootstrap_info(bootstrap)
        return (boot_info, bootstrap_url)
@ -308,7 +308,7 @@ def real_download(self, filename, info_dict):
        live = boot_info['live']
        metadata_node = media.find(_add_ns('metadata'))
        if metadata_node is not None:
-            metadata = base64.b64decode(metadata_node.text)
+            metadata = base64.b64decode(metadata_node.text.encode('ascii'))
        else:
            metadata = None
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -175,6 +175,7 @@
 from .gamespot import GameSpotIE
 from .gamestar import GameStarIE
 from .gametrailers import GametrailersIE
 from .gazeta import GazetaIE
 from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .giantbomb import GiantBombIE
@ -363,6 +364,7 @@
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE
 from .planetaplay import PlanetaPlayIE
 from .pladform import PladformIE
 from .played import PlayedIE
 from .playfm import PlayFMIE
 from .playvid import PlayvidIE
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@ -2,13 +2,12 @@
 from __future__ import unicode_literals
 import re
 import json
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    xpath_text,
    float_or_none,
    xpath_text,
 )
@ -60,6 +59,24 @@ class AdultSwimIE(InfoExtractor):
            'title': 'American Dad - Putting Francine Out of Business',
            'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
        },
    }, {
        'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
        'playlist': [
            {
                'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
                'info_dict': {
                    'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
                    'ext': 'flv',
                    'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
                    'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
                },
            }
        ],
        'info_dict': {
            'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
            'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
            'description': 'Dr. Brule reports live from Wine Country with a special report on wines.  \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
        },
    }]
    @staticmethod
@ -80,6 +97,7 @@ def find_collection_containing_video(collections, slug):
            for video in collection.get('videos'):
                if video.get('slug') == slug:
                    return collection, video
        return None, None
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
@ -90,28 +108,30 @@ def _real_extract(self, url):
        webpage = self._download_webpage(url, episode_path)
        # Extract the value of `bootstrappedData` from the Javascript in the page.
-        bootstrappedDataJS = self._search_regex(r'var bootstrappedData = ({.*});', webpage, episode_path)
+        bootstrapped_data = self._parse_json(self._search_regex(
-
+            r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
        try:
            bootstrappedData = json.loads(bootstrappedDataJS)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % episode_path
            raise ExtractorError(errmsg, cause=ve)
        # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
        # NOTE: We are only downloading one video (the current one) not the playlist
        if is_playlist:
-            collections = bootstrappedData['playlists']['collections']
+            collections = bootstrapped_data['playlists']['collections']
            collection = self.find_collection_by_linkURL(collections, show_path)
            video_info = self.find_video_info(collection, episode_path)
            show_title = video_info['showTitle']
            segment_ids = [video_info['videoPlaybackID']]
        else:
-            collections = bootstrappedData['show']['collections']
+            collections = bootstrapped_data['show']['collections']
            collection, video_info = self.find_collection_containing_video(collections, episode_path)
-            show = bootstrappedData['show']
+            # Video wasn't found in the collections, let's try `slugged_video`.
            if video_info is None:
                if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
                    video_info = bootstrapped_data['slugged_video']
                else:
                    raise ExtractorError('Unable to find video info')
            show = bootstrapped_data['show']
            show_title = show['title']
            segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@ -41,7 +41,7 @@ def _real_extract(self, url):
            'tbr': media['bitRate'],
            'width': media['width'],
            'height': media['height'],
-        } for media in info['media']]
+        } for media in info['media'] if media.get('mediaPurpose') == 'play']
        if not formats:
            formats.append({
--- a/youtube_dl/extractor/gazeta.py
+++ b/youtube_dl/extractor/gazeta.py
@ -0,0 +1,38 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 class GazetaIE(InfoExtractor):
    """Extractor for video pages on gazeta.ru.

    The extractor does not resolve media itself: it downloads the
    ``?p=embed`` variant of the page, reads the player's ``data-id`` from it
    and hands that id over to the EaglePlatform extractor.
    """

    _VALID_URL = r'(?P<url>https?://(?:www\.)?gazeta\.ru/(?:[^/]+/)?video/(?:(?:main|\d{4}/\d{2}/\d{2})/)?(?P<id>[A-Za-z0-9-_.]+)\.s?html)'
    _TESTS = [{
        'url': 'http://www.gazeta.ru/video/main/zadaite_vopros_vladislavu_yurevichu.shtml',
        'md5': 'd49c9bdc6e5a7888f27475dc215ee789',
        'info_dict': {
            'id': '205566',
            'ext': 'mp4',
            'title': '«70–80 процентов гражданских в Донецке на грани голода»',
            'description': 'md5:38617526050bd17b234728e7f9620a71',
            # Raw literal: '\.' is not a valid escape sequence in a plain
            # string and triggers a warning on recent Python 3 versions.
            'thumbnail': r're:^https?://.*\.jpg',
        },
    }, {
        'url': 'http://www.gazeta.ru/lifestyle/video/2015/03/08/master-klass_krasivoi_byt._delaem_vesennii_makiyazh.shtml',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a url_result that delegates ``url`` to EaglePlatform.

        The ``id`` group of _VALID_URL is only used as the display id while
        downloading; the actual video id comes from the embed page.
        """
        mobj = re.match(self._VALID_URL, url)

        display_id = mobj.group('id')
        # The whole matched page URL (group 'url') plus the embed switch.
        embed_url = '%s?p=embed' % mobj.group('url')
        embed_page = self._download_webpage(
            embed_url, display_id, 'Downloading embed page')

        video_id = self._search_regex(
            r'<div[^>]*?class="eagleplayer"[^>]*?data-id="([^"]+)"', embed_page, 'video id')

        return self.url_result(
            'eagleplatform:gazeta.media.eagleplatform.com:%s' % video_id, 'EaglePlatform')
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -596,6 +596,19 @@ class GenericIE(InfoExtractor):
                'view_count': int,
            },
        },
        # Pladform embed
        {
            'url': 'http://muz-tv.ru/kinozal/view/7400/',
            'info_dict': {
                'id': '100183293',
                'ext': 'mp4',
                'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
                'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
                'thumbnail': 're:^https?://.*\.jpg$',
                'duration': 694,
                'age_limit': 0,
            },
        },
        # RSS feed with enclosure
        {
            'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@ -1193,6 +1206,12 @@ def _playlist_from_matches(matches, getter=None, ie=None):
        if mobj is not None:
            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
        # Look for Pladform embeds
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Pladform')
        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True
--- a/youtube_dl/extractor/pladform.py
+++ b/youtube_dl/extractor/pladform.py
@ -0,0 +1,90 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    int_or_none,
    xpath_text,
    qualities,
 )
 class PladformIE(InfoExtractor):
    """Extractor for videos served by pladform.ru.

    Accepts player URLs (``out.pladform.ru/player``,
    ``static.pladform.ru/player.swf``) carrying a ``videoid`` query
    parameter as well as ``video.pladform.ru/catalog/video/videoid/<id>``
    pages.
    """

    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:
                                out\.pladform\.ru/player|
                                static\.pladform\.ru/player\.swf
                            )
                            \?.*\bvideoid=|
                            video\.pladform\.ru/catalog/video/videoid/
                        )
                        (?P<id>\d+)
                    '''
    _TESTS = [{
        # http://muz-tv.ru/kinozal/view/7400/
        'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293',
        'md5': '61f37b575dd27f1bb2e1854777fe31f4',
        'info_dict': {
            'id': '100183293',
            'ext': 'mp4',
            'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
            'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
            # Raw literal: '\.' is not a valid escape sequence in a plain
            # string and triggers a warning on recent Python 3 versions.
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 694,
            'age_limit': 0,
        },
    }, {
        'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0',
        'only_matching': True,
    }, {
        'url': 'http://video.pladform.ru/catalog/video/videoid/100183293/vkcid/0',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video referenced by ``url``.

        Formats, duration and the age value come from the ``getVideo`` XML
        document; title, description and thumbnail are read from the
        catalog web page, with the XML used as fallback for title and
        thumbnail.

        Raises ExtractorError (expected=True) when the XML root element is
        ``error``.
        """
        video_id = self._match_id(url)

        video = self._download_xml(
            'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id,
            video_id)

        # The service reports failures as an <error> root element whose
        # text is the message.
        if video.tag == 'error':
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, video.text),
                expected=True)

        # Ranks the quality labels in the order given: 'ld' < 'sd' < 'hd'.
        quality = qualities(('ld', 'sd', 'hd'))

        formats = [{
            'url': src.text,
            'format_id': src.get('quality'),
            'quality': quality(src.get('quality')),
        } for src in video.findall('./src')]
        self._sort_formats(formats)

        webpage = self._download_webpage(
            'http://video.pladform.ru/catalog/video/videoid/%s' % video_id,
            video_id)

        # Prefer the page's OpenGraph title; the XML <title> is mandatory
        # only when the page does not provide one.
        title = self._og_search_title(webpage, fatal=False) or xpath_text(
            video, './/title', 'title', fatal=True)
        description = self._search_regex(
            r'</h3>\s*<p>([^<]+)</p>', webpage, 'description', fatal=False)
        thumbnail = self._og_search_thumbnail(webpage) or xpath_text(
            video, './/cover', 'cover')

        duration = int_or_none(xpath_text(video, './/time', 'duration'))
        # NOTE(review): the element is named 'age18', which looks like a
        # flag rather than an age in years - confirm against the API before
        # relying on this value as an age limit.
        age_limit = int_or_none(xpath_text(video, './/age18', 'age limit'))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'age_limit': age_limit,
            'formats': formats,
        }
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@ -53,10 +53,10 @@ def _real_extract(self, url):
        embed = self._download_webpage(
            embed_url, video_id, 'Downloading embed page')
-        encoded_data = self._search_regex(
+        player_data = self._parse_json(self._search_regex(
-            r'"preload"\s*:\s*"([^"]+)"', embed, 'encoded data')
+            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)
        data = self._parse_json(
-            base64.b64decode(encoded_data.encode('ascii')).decode('utf-8'), video_id)
+            base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)
        formats = []
        get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p'])
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@ -358,13 +358,12 @@ def _real_extract(self, url):
            'p': random.randint(1000000, 10000000),
            'player': 'twitchweb',
            'segment_preference': '4',
-            'sig': access_token['sig'],
+            'sig': access_token['sig'].encode('utf-8'),
-            'token': access_token['token'],
+            'token': access_token['token'].encode('utf-8'),
        }
        formats = self._extract_m3u8_formats(
            '%s/api/channel/hls/%s.m3u8?%s'
-            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query).encode('utf-8')),
+            % (self._USHER_BASE, channel_id, compat_urllib_parse.urlencode(query)),
            channel_id, 'mp4')
        self._prefer_source(formats)
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@ -41,13 +41,10 @@ def _real_extract(self, url):
        duration = float_or_none(self._html_search_regex(
            r'data-duration="([^"]+)"', webpage, 'duration', fatal=False))
        view_count = str_to_int(self._html_search_regex(
-            r'<span class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
+            r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False))
        like_count = str_to_int(self._html_search_regex(
            r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">',
            webpage, 'like count', fatal=False))
        comment_count = str_to_int(self._html_search_regex(
            r'class="js-comment-count"[^>]+data-count="([\d,\.\s]+)">',
            webpage, 'comment count', fatal=False))
        return {
            'id': video_id,
@ -61,5 +58,4 @@ def _real_extract(self, url):
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'comment_count': comment_count,
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -252,15 +252,12 @@ def sanitize_open(filename, open_mode):
            raise
        # In case of error, try to remove win32 forbidden chars
-        alt_filename = os.path.join(
+        alt_filename = sanitize_path(filename)
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        )
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
-            stream = open(encodeFilename(filename), open_mode)
+            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
@ -311,6 +308,24 @@ def replace_insane(char):
    return result
 def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform ``s`` is returned unchanged.  On Windows the
    part after the drive or UNC prefix is normalized with
    os.path.normpath, and in each of its components the characters matched
    by the pattern below (reserved characters and a trailing dot) are
    replaced with '#'.  The prefix itself is kept verbatim.
    """
    if sys.platform != 'win32':
        return s
    drive, _ = os.path.splitdrive(s)
    # os.path.splitunc was removed in Python 3.7; calling it
    # unconditionally raises AttributeError there.  Keep preferring it
    # where it still exists so behaviour on older interpreters is
    # unchanged, and rely on splitdrive alone otherwise.
    splitunc = getattr(os.path, 'splitunc', None)
    unc = splitunc(s)[0] if splitunc is not None else ''
    unc_or_drive = unc or drive
    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
    if unc_or_drive:
        # Drop the leading piece of the remainder (empty when the remainder
        # starts with a separator); the prefix is re-attached below.
        norm_path.pop(0)
    sanitized_path = [
        # '.' and '..' are meaningful path components that normpath may
        # leave in place; the trailing-dot rule must not turn them into
        # '#' and '.#'.
        path_part if path_part in ('.', '..')
        else re.sub(r'(?:[/<>:"\|\\?\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if unc_or_drive:
        sanitized_path.insert(0, unc_or_drive + os.path.sep)
    return os.path.join(*sanitized_path)
 def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []