def test_read_batch_urls(self):
    # Exercise read_batch_urls on an in-memory text stream. The fixture's
    # first line starts with the characters \xef\xbb\xbf, some lines end in
    # \r\n while others end in \n, and two lines start with '#' and ';'.
    # NOTE(review): u'\xef\xbb\xbf' is the UTF-8 BOM's three bytes spelled as
    # separate code points; a stream decoded as UTF-8 would presumably carry
    # u'\ufeff' instead -- confirm whether that form needs its own case.
    f = io.StringIO(u'''\xef\xbb\xbf foo
        bar\r
        baz
        # More after this line\r
        ; or after this
        bam''')
    # Only the four URL lines are expected back, stripped of the leading
    # marker and surrounding whitespace; the '#' and ';' lines are dropped.
    self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
def read_batch_urls(batch_fd):
    """Read the URLs listed in an open batch file, then close the file.

    Every line of ``batch_fd`` is treated as one candidate URL. Byte lines
    are decoded as UTF-8 (undecodable bytes are replaced), a leading UTF-8
    byte order mark is removed, and surrounding whitespace is stripped.
    Blank lines and lines starting with ``#``, ``;`` or ``]`` are skipped.

    Args:
        batch_fd: An iterable, closable file-like object yielding text or
            byte lines. It is always closed before this function returns.

    Returns:
        A list of text URLs in the order they appear in the file.
    """
    # A UTF-8 BOM reaches us in one of two spellings: as the single code
    # point U+FEFF when the stream was decoded as UTF-8 (including the
    # decode below), or as the three characters \xef\xbb\xbf when its bytes
    # were decoded one-to-one (e.g. as Latin-1). str.strip() removes neither,
    # so both have to be cut off explicitly.
    bom_prefixes = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        # Normalise one raw line to a clean URL, or False if it holds none.
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in bom_prefixes:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        # Comment markers; an empty string falls through and is dropped by
        # the truthiness filter below.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    # closing() guarantees the descriptor is released even if reading fails.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]