diff --git a/README.md b/README.md index 7fded6a336..f46c65dff8 100644 --- a/README.md +++ b/README.md @@ -321,6 +321,15 @@ ## Thumbnail images: --list-thumbnails Simulate and list all available thumbnail formats +## Internet Shortcut Options: + --write-link Write an internet shortcut file, depending on + the current platform (.url/.webloc/.desktop). + The URL may be cached by the OS. + --write-url-link Write a Windows internet shortcut file (.url). + Note that the OS caches the URL based on the file path. + --write-webloc-link Write a macOS internet shortcut file (.webloc) + --write-desktop-link Write a Linux internet shortcut file (.desktop) + ## Verbosity / Simulation Options: -q, --quiet Activate quiet mode --no-warnings Ignore warnings diff --git a/test/parameters.json b/test/parameters.json index 65fd544286..76c2a9ae77 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -35,6 +35,11 @@ "verbose": true, "writedescription": false, "writeinfojson": true, + "writeannotations": false, + "writelink": false, + "writeurllink": false, + "writewebloclink": false, + "writedesktoplink": false, "writesubtitles": false, "allsubtitles": false, "listsubtitles": false, diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a9e6491917..5950dbffca 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -42,6 +42,7 @@ def _make_result(formats, **kwargs): 'title': 'testttitle', 'extractor': 'testex', 'extractor_key': 'TestEx', + 'webpage_url': 'http://example.com/watch?v=shenanigans', } res.update(**kwargs) return res @@ -567,6 +568,7 @@ def s_formats(lang, autocaption=False): 'subtitles': subtitles, 'automatic_captions': auto_captions, 'extractor': 'TEST', + 'webpage_url': 'http://example.com/watch?v=shenanigans', } def get_info(params={}): @@ -730,6 +732,7 @@ def _match_entry(self, info_dict, incomplete): 'playlist_id': '42', 'uploader': "變態妍字幕版 太妍 тест", 'creator': "тест ' 123 ' тест--", + 'webpage_url': 'http://example.com/watch?v=shenanigans', } second = { 'id': '2', @@ -741,6 +744,7 @@ def _match_entry(self, info_dict, incomplete): 'filesize': 5 * 1024, 'playlist_id': '43', 'uploader': "тест 123", + 'webpage_url': 'http://example.com/watch?v=SHENANIGANS', } videos = [first, second] diff --git a/test/test_compat.py b/test/test_compat.py index 8c49a001e5..f66739bd42 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -19,6 +19,8 @@ compat_shlex_split, compat_str, compat_struct_unpack, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, @@ -53,6 +55,27 @@ def test_all_present(self): dir(youtube_dlc.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_quote(self): + self.assertEqual(compat_urllib_parse_quote('abc def'), 'abc%20def') + self.assertEqual(compat_urllib_parse_quote('/~user/abc+def'), '/%7Euser/abc%2Bdef') + self.assertEqual(compat_urllib_parse_quote('/~user/abc+def', safe='/~+'), '/~user/abc+def') + self.assertEqual(compat_urllib_parse_quote(''), '') + self.assertEqual(compat_urllib_parse_quote('%'), '%25') + self.assertEqual(compat_urllib_parse_quote('%', safe='%'), '%') + self.assertEqual(compat_urllib_parse_quote('津波'), '%E6%B4%A5%E6%B3%A2') + self.assertEqual( + compat_urllib_parse_quote(''' +%%a''', safe='<>=":%/ \r\n'), + ''' +%%a''') + self.assertEqual( + compat_urllib_parse_quote('''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%25Things%''', safe='% '), + '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%''') + + def test_compat_urllib_parse_quote_plus(self): + self.assertEqual(compat_urllib_parse_quote_plus('abc def'), 'abc+def') + self.assertEqual(compat_urllib_parse_quote_plus('~/abc def'), '%7E%2Fabc+def') + def test_compat_urllib_parse_unquote(self): self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') diff --git a/test/test_utils.py b/test/test_utils.py index 16ad408317..6562d443af 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -104,6 +104,7 @@ cli_valueless_option, cli_bool_option, parse_codecs, + iri_to_uri, ) from youtube_dlc.compat import ( compat_chr, @@ -1465,6 +1466,32 @@ def test_get_elements_by_attribute(self): self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_iri_to_uri(self): + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b'), + 'https://www.google.com/search?q=foo&ie=utf-8&oe=utf-8&client=firefox-b') # Same + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=Käsesoßenrührlöffel'), # German for cheese sauce stirring spoon + 'https://www.google.com/search?q=K%C3%A4seso%C3%9Fenr%C3%BChrl%C3%B6ffel') + self.assertEqual( + iri_to_uri('https://www.google.com/search?q=lt<+gt>+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#'), + 'https://www.google.com/search?q=lt%3C+gt%3E+eq%3D+amp%26+percent%25+hash%23+colon%3A+tilde~#trash=?&garbage=#') + self.assertEqual( + iri_to_uri('http://правозащита38.рф/category/news/'), + 'http://xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('http://www.правозащита38.рф/category/news/'), + 'http://www.xn--38-6kcaak9aj5chl4a3g.xn--p1ai/category/news/') + self.assertEqual( + iri_to_uri('https://i❤.ws/emojidomain/👍👏🤝💪'), + 'https://xn--i-7iq.ws/emojidomain/%F0%9F%91%8D%F0%9F%91%8F%F0%9F%A4%9D%F0%9F%92%AA') + self.assertEqual( + iri_to_uri('http://日本語.jp/'), + 'http://xn--wgv71a119e.jp/') + self.assertEqual( + iri_to_uri('http://导航.中国/'), + 'http://xn--fet810g.xn--fiqs8s/') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index ee6d749107..97e4f451f1 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -51,6 +51,9 @@ DEFAULT_OUTTMPL, determine_ext, determine_protocol, + DOT_DESKTOP_LINK_TEMPLATE, + DOT_URL_LINK_TEMPLATE, + DOT_WEBLOC_LINK_TEMPLATE, DownloadError, encode_compat_str, encodeFilename, @@ -61,6 +64,7 @@ formatSeconds, GeoRestrictedError, int_or_none, + iri_to_uri, ISO3166Utils, locked_file, make_HTTPS_handler, @@ -84,6 +88,7 @@ std_headers, str_or_none, subtitles_filename, + to_high_limit_path, UnavailableVideoError, url_basename, version_tuple, @@ -187,6 +192,11 @@ class YoutubeDL(object): writeannotations: Write the video annotations to a .annotations.xml file writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files + writelink: Write an internet shortcut file, depending on the + current platform (.url/.webloc/.desktop) + writeurllink: Write a Windows internet shortcut file (.url) + writewebloclink: Write a macOS internet shortcut file (.webloc) + writedesktoplink: Write a Linux internet shortcut file (.desktop) writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video @@ -1984,6 +1994,57 @@ def dl(name, info, subtitle=False): self._write_thumbnails(info_dict, filename) + # Write internet shortcut files + url_link = webloc_link = desktop_link = False + if self.params.get('writelink', False): + if sys.platform == "darwin": # macOS. + webloc_link = True + elif sys.platform.startswith("linux"): + desktop_link = True + else: # if sys.platform in ['win32', 'cygwin']: + url_link = True + if self.params.get('writeurllink', False): + url_link = True + if self.params.get('writewebloclink', False): + webloc_link = True + if self.params.get('writedesktoplink', False): + desktop_link = True + + if url_link or webloc_link or desktop_link: + if 'webpage_url' not in info_dict: + self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information') + return + ascii_url = iri_to_uri(info_dict['webpage_url']) + + def _write_link_file(extension, template, newline, embed_filename): + linkfn = replace_extension(filename, extension, info_dict.get('ext')) + if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(linkfn)): + self.to_screen('[info] Internet shortcut is already present') + else: + try: + self.to_screen('[info] Writing internet shortcut to: ' + linkfn) + with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile: + template_vars = {'url': ascii_url} + if embed_filename: + template_vars['filename'] = linkfn[:-(len(extension) + 1)] + linkfile.write(template % template_vars) + except (OSError, IOError): + self.report_error('Cannot write internet shortcut ' + linkfn) + return False + return True + + if url_link: + if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False): + return + if webloc_link: + if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False): + return + if desktop_link: + if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True): + return + + # Download + must_record_download_archive = False if not self.params.get('skip_download', False): try: if info_dict.get('requested_formats') is not None: diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py index df07016e16..d183016b6e 100644 --- a/youtube_dlc/__init__.py +++ b/youtube_dlc/__init__.py @@ -389,6 +389,10 @@ def parse_retries(retries): 'writeinfojson': opts.writeinfojson, 'writethumbnail': opts.writethumbnail, 'write_all_thumbnails': opts.write_all_thumbnails, + 'writelink': opts.writelink, + 'writeurllink': opts.writeurllink, + 'writewebloclink': opts.writewebloclink, + 'writedesktoplink': opts.writedesktoplink, 'writesubtitles': opts.writesubtitles, 'writeautomaticsub': opts.writeautomaticsub, 'allsubtitles': opts.allsubtitles, diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py index ac889ddd7a..4a69b098fb 100644 --- a/youtube_dlc/compat.py +++ b/youtube_dlc/compat.py @@ -37,15 +37,20 @@ except ImportError: # Python 2 import urllib as compat_urllib_parse +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + try: from urllib.parse import urlparse as compat_urllib_parse_urlparse except ImportError: # Python 2 from urlparse import urlparse as compat_urllib_parse_urlparse try: - import urllib.parse as compat_urlparse + from urllib.parse import urlunparse as compat_urllib_parse_urlunparse except ImportError: # Python 2 - import urlparse as compat_urlparse + from urlparse import urlunparse as compat_urllib_parse_urlunparse try: import urllib.response as compat_urllib_response @@ -2365,6 +2370,20 @@ class compat_HTMLParseError(Exception): except NameError: compat_str = str +try: + from urllib.parse import quote as compat_urllib_parse_quote + from urllib.parse import quote_plus as compat_urllib_parse_quote_plus +except ImportError: # Python 2 + def compat_urllib_parse_quote(string, safe='/'): + return compat_urllib_parse.quote( + string.encode('utf-8'), + str(safe)) + + def compat_urllib_parse_quote_plus(string, safe=''): + return compat_urllib_parse.quote_plus( + string.encode('utf-8'), + str(safe)) + try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote @@ -3033,11 +3052,14 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', + 'compat_urllib_parse_quote', + 'compat_urllib_parse_quote_plus', 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlencode', 'compat_urllib_parse_urlparse', + 'compat_urllib_parse_urlunparse', 'compat_urllib_request', 'compat_urllib_request_DataHandler', 'compat_urllib_response', diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 44eba3e9c7..bd85abd3a8 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -830,7 +830,25 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): action='store_true', dest='list_thumbnails', default=False, help='Simulate and list all available thumbnail formats') - postproc = optparse.OptionGroup(parser, 'Post-processing Options') + link = optparse.OptionGroup(parser, 'Internet Shortcut Options') + link.add_option( + '--write-link', + action='store_true', dest='writelink', default=False, + help='Write an internet shortcut file, depending on the current platform (.url/.webloc/.desktop). The URL may be cached by the OS.') + link.add_option( + '--write-url-link', + action='store_true', dest='writeurllink', default=False, + help='Write a Windows internet shortcut file (.url). Note that the OS caches the URL based on the file path.') + link.add_option( + '--write-webloc-link', + action='store_true', dest='writewebloclink', default=False, + help='Write a macOS internet shortcut file (.webloc)') + link.add_option( + '--write-desktop-link', + action='store_true', dest='writedesktoplink', default=False, + help='Write a Linux internet shortcut file (.desktop)') + + postproc = optparse.OptionGroup(parser, 'Post-Processing Options') postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, @@ -932,6 +950,7 @@ def _comma_separated_values_options_callback(option, opt_str, value, parser): parser.add_option_group(downloader) parser.add_option_group(filesystem) parser.add_option_group(thumbnail) + parser.add_option_group(link) parser.add_option_group(verbosity) parser.add_option_group(workarounds) parser.add_option_group(video_format) diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 68b4ca944f..d814eb2ac8 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -60,6 +60,9 @@ compat_urllib_parse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_urlunparse, + compat_urllib_parse_quote, + compat_urllib_parse_quote_plus, compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, @@ -5714,3 +5717,81 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random_date.month), day_field: str(random_date.day), } + +# Templates for internet shortcut files, which are plain text files. +DOT_URL_LINK_TEMPLATE = ''' +[InternetShortcut] +URL=%(url)s +'''.lstrip() + +DOT_WEBLOC_LINK_TEMPLATE = ''' + + + + +\tURL +\t%(url)s + + +'''.lstrip() + +DOT_DESKTOP_LINK_TEMPLATE = ''' +[Desktop Entry] +Encoding=UTF-8 +Name=%(filename)s +Type=Link +URL=%(url)s +Icon=text-html +'''.lstrip() + + +def iri_to_uri(iri): + """ + Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only). + + The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact. + """ + + iri_parts = compat_urllib_parse_urlparse(iri) + + if '[' in iri_parts.netloc: + raise ValueError('IPv6 URIs are not, yet, supported.') + # Querying `.netloc`, when there's only one bracket, also raises a ValueError. + + # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is. + + net_location = '' + if iri_parts.username: + net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~") + if iri_parts.password is not None: + net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~") + net_location += '@' + + net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames. + # The 'idna' encoding produces ASCII text. + if iri_parts.port is not None and iri_parts.port != 80: + net_location += ':' + str(iri_parts.port) + + return compat_urllib_parse_urlunparse( + (iri_parts.scheme, + net_location, + + compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"), + + # Unsure about the `safe` argument, since this is a legacy way of handling parameters. + compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"), + + # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component. + compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"), + + compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~"))) + + # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes. + + +def to_high_limit_path(path): + if sys.platform in ['win32', 'cygwin']: + # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited. + return r'\\?\ '.rstrip() + os.path.abspath(path) + + return path