From 5c3895fff150871fde273a10c55691403931b4dc Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 27 Mar 2022 10:04:04 +0530 Subject: [PATCH] [outtmpl] Limit changes during sanitization Closes #2761 --- README.md | 1 + test/test_utils.py | 6 ++++-- yt_dlp/YoutubeDL.py | 7 +++++-- yt_dlp/options.py | 2 +- yt_dlp/utils.py | 34 +++++++++++++++++++--------------- 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index a2e2b34c37..c184d23c4f 100644 --- a/README.md +++ b/README.md @@ -144,6 +144,7 @@ ### Differences in default behavior * Some private fields such as filenames are removed by default from the infojson. Use `--no-clean-infojson` or `--compat-options no-clean-infojson` to revert this * When `--embed-subs` and `--write-subs` are used together, the subtitles are written to disk and also embedded in the media file. You can use just `--embed-subs` to embed the subs and automatically delete the separate file. See [#630 (comment)](https://github.com/yt-dlp/yt-dlp/issues/630#issuecomment-893659460) for more info. `--compat-options no-keep-subs` can be used to revert this * `certifi` will be used for SSL root certificates, if installed. If you want to use system certificates (e.g. self-signed), use `--compat-options no-certifi` +* youtube-dl tries to remove some superfluous punctuation from filenames. While this can sometimes be helpful, it is often undesirable. So yt-dlp tries to keep the fields in the filenames as close to their original values as possible. 
You can use `--compat-options filename-sanitization` to revert to youtube-dl's behavior For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options diff --git a/test/test_utils.py b/test/test_utils.py index a7f1b0e946..660ce03bfe 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -160,10 +160,12 @@ def test_sanitize_filename(self): sanitize_filename('New World record at 0:12:34'), 'New World record at 0_12_34') - self.assertEqual(sanitize_filename('--gasdgf'), '_-gasdgf') + self.assertEqual(sanitize_filename('--gasdgf'), '--gasdgf') self.assertEqual(sanitize_filename('--gasdgf', is_id=True), '--gasdgf') - self.assertEqual(sanitize_filename('.gasdgf'), 'gasdgf') + self.assertEqual(sanitize_filename('--gasdgf', is_id=False), '_-gasdgf') + self.assertEqual(sanitize_filename('.gasdgf'), '.gasdgf') self.assertEqual(sanitize_filename('.gasdgf', is_id=True), '.gasdgf') + self.assertEqual(sanitize_filename('.gasdgf', is_id=False), 'gasdgf') forbidden = '"\0\\/' for fc in forbidden: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 478bdaccaf..c2f4f3a955 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -87,6 +87,7 @@ MaxDownloadsReached, merge_headers, network_exceptions, + NO_DEFAULT, number_of_digits, orderedSet, OUTTMPL_TYPES, @@ -1150,8 +1151,10 @@ def get_value(mdict): na = self.params.get('outtmpl_na_placeholder', 'NA') def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): - return sanitize_filename(str(value), restricted=restricted, - is_id=re.search(r'(^|[_.])id(\.|$)', key)) + return sanitize_filename(str(value), restricted=restricted, is_id=( + bool(re.search(r'(^|[_.])id(\.|$)', key)) + if 'filename-sanitization' in self.params.get('compat_opts', []) + else NO_DEFAULT)) sanitizer = sanitize if callable(sanitize) else filename_sanitizer sanitize = bool(sanitize) diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 34a2e1103d..eb306898ab 100644 --- 
a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -338,7 +338,7 @@ def _dict_from_options_callback( action='callback', callback=_set_from_options_callback, callback_kwargs={ 'allowed_values': { - 'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', + 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index e9eaf7b4ed..6854dbb63a 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -705,36 +705,40 @@ def timeconvert(timestr): return timestamp -def sanitize_filename(s, restricted=False, is_id=False): +def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): """Sanitizes a string so it could be used as part of a filename. - If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept - if possible. + @param restricted Use a stricter subset of allowed characters + @param is_id Whether this is an ID that should be kept unchanged if possible. + If unset, yt-dlp's new sanitization rules are in effect """ + if s == '': + return '' + def replace_insane(char): if restricted and char in ACCENT_CHARS: return ACCENT_CHARS[char] elif not restricted and char == '\n': - return ' ' + return '\0 ' elif char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': return '' if restricted else '\'' elif char == ':': - return '_-' if restricted else ' -' + return '\0_\0-' if restricted else '\0 \0-' elif char in '\\/|*<>': - return '_' - if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()): - return '_' - if restricted and ord(char) > 127: - return '_' + return '\0_' + if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127): + return '\0_' return char - if s == '': - return '' - # Handle timestamps - s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) + s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps result = ''.join(map(replace_insane, s)) + if is_id is NO_DEFAULT: + result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars + STRIP_RE = '(?:\0.|[ _-])*' + result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end + result = result.replace('\0', '') or '_' + if not is_id: while '__' in result: result = result.replace('__', '_')