Add option --parse-metadata

* The fields extracted by this can be used in `--output` * Deprecated `--metadata-from-title` :ci skip dl
2025-02-18 18:30:58 +01:00 · 2021-01-26 15:50:20 +05:30 · 2021-01-26 15:50:20 +05:30 · 5bfa486205
commit 5bfa486205
parent 9882064024
8 changed files with 162 additions and 110 deletions
--- a/README.md
+++ b/README.md
@ -610,16 +610,19 @@ ## Post-Processing Options:
    --no-embed-thumbnail             Do not embed thumbnail (default)
    --add-metadata                   Write metadata to the video file
    --no-add-metadata                Do not write metadata (default)
-    --metadata-from-title FORMAT     Parse additional metadata like song title /
+    --parse-metadata FIELD:FORMAT    Parse additional metadata like title/artist
-                                     artist from the video title. The format
+                                     from other fields. Give field name to
-                                     syntax is the same as --output. Regular
+                                     extract data from, and format of the field
-                                     expression with named capture groups may
+                                     separated by a ":". The format syntax is
-                                     also be used. The parsed parameters replace
+                                     the same as --output. Regular expression
-                                     existing values. Example: --metadata-from-
+                                     with named capture groups may also be used.
-                                     title "%(artist)s - %(title)s" matches a
+                                     The parsed parameters replace existing
                                     values. This option can be used multiple
                                     times. Example: --parse-metadata
                                     "title:%(artist)s - %(title)s" matches a
                                     title like "Coldplay - Paradise". Example
-                                     (regex): --metadata-from-title
+                                     (regex): --parse-metadata
-                                     "(?P<artist>.+?) - (?P<title>.+)"
+                                     "description:Artist - (?P<artist>.+?)"
    --xattrs                         Write metadata to the video file's xattrs
                                     (using dublin core and xdg standards)
    --fixup POLICY                   Automatically correct known faults of the
@ -1098,7 +1101,7 @@ # PLUGINS
 Plugins are loaded from `<root-dir>/ytdlp_plugins/<type>/__init__.py`. Currently only `extractor` plugins are supported. Support for `downloader` and `postprocessor` plugins may be added in the future. See [ytdlp_plugins](ytdlp_plugins) for example.
-**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code ((`<root dir>/youtube_dlc/__main__.py`)
+**Note**: `<root-dir>` is the directory of the binary (`<root-dir>/youtube-dlc`), or the root directory of the module if you are running directly from source-code (`<root dir>/youtube_dlc/__main__.py`)
 # MORE
-For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl)
+For FAQ, Developer Instructions etc., see the [original README](https://github.com/ytdl-org/youtube-dl#faq)
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@ -8,10 +8,16 @@
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from youtube_dlc.postprocessor import MetadataFromTitlePP
+from youtube_dlc.postprocessor import MetadataFromFieldPP, MetadataFromTitlePP
 class TestMetadataFromField(unittest.TestCase):
    """Checks the FIELD:FORMAT handling of MetadataFromFieldPP."""

    def test_format_to_regex(self):
        # An output-template style format must be turned into a regex with
        # one named group per %(name)s placeholder
        expected = r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'
        processor = MetadataFromFieldPP(None, ['title:%(title)s - %(artist)s'])
        self.assertEqual(processor._data[0]['regex'], expected)
 class TestMetadataFromTitle(unittest.TestCase):
    def test_format_to_regex(self):
        pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
-        self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
+        self.assertEqual(pp._titleregex, r'(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)')
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@ -375,8 +375,7 @@ class YoutubeDL(object):
    params = None
    _ies = []
-    _pps = []
+    _pps = {'beforedl': [], 'aftermove': [], 'normal': []}
    _pps_end = []
    __prepare_filename_warned = False
    _download_retcode = None
    _num_downloads = None
@ -390,8 +389,7 @@ def __init__(self, params=None, auto_init=True):
            params = {}
        self._ies = []
        self._ies_instances = {}
-        self._pps = []
+        self._pps = {'beforedl': [], 'aftermove': [], 'normal': []}
        self._pps_end = []
        self.__prepare_filename_warned = False
        self._post_hooks = []
        self._progress_hooks = []
@ -494,11 +492,13 @@ def check_deprecated(param, option, suggestion):
            pp_class = get_postprocessor(pp_def_raw['key'])
            pp_def = dict(pp_def_raw)
            del pp_def['key']
-            after_move = pp_def.get('_after_move', False)
+            if 'when' in pp_def:
-            if '_after_move' in pp_def:
+                when = pp_def['when']
-                del pp_def['_after_move']
+                del pp_def['when']
            else:
                when = 'normal'
            pp = pp_class(self, **compat_kwargs(pp_def))
-            self.add_post_processor(pp, after_move=after_move)
+            self.add_post_processor(pp, when=when)
        for ph in self.params.get('post_hooks', []):
            self.add_post_hook(ph)
@ -550,12 +550,9 @@ def add_default_info_extractors(self):
        for ie in gen_extractor_classes():
            self.add_info_extractor(ie)
-    def add_post_processor(self, pp, after_move=False):
+    def add_post_processor(self, pp, when='normal'):
        """Add a PostProcessor object to the end of the chain."""
-        if after_move:
+        self._pps[when].append(pp)
            self._pps_end.append(pp)
        else:
            self._pps.append(pp)
        pp.set_downloader(self)
    def add_post_hook(self, ph):
@ -1948,6 +1945,8 @@ def process_info(self, info_dict):
        self._num_downloads += 1
        info_dict = self.pre_process(info_dict)
        filename = self.prepare_filename(info_dict, warn=True)
        info_dict['_filename'] = full_filename = self.prepare_filepath(filename)
        temp_filename = self.prepare_filepath(filename, 'temp')
@ -2400,41 +2399,45 @@ def filter_requested_info(info_dict):
            (k, v) for k, v in info_dict.items()
            if k not in ['requested_formats', 'requested_subtitles'])
    def run_pp(self, pp, infodict, files_to_move=None):
        """Run one postprocessor and handle the files it reports as obsolete.

        pp            -- object whose run(infodict) returns a
                         (files_to_delete, infodict) pair
        infodict      -- info dict handed to the postprocessor
        files_to_move -- dict of files still waiting to be moved; it is
                         updated in place. A fresh dict is used when omitted.

        Returns a (files_to_move, infodict) pair. If the postprocessor raises
        PostProcessingError, the error is passed to report_error and the
        infodict that was passed in is returned.
        """
        # A literal `{}` default would be a single dict shared by every call;
        # the keepvideo branch below mutates it, so entries would leak from
        # one call into the next.
        if files_to_move is None:
            files_to_move = {}
        files_to_delete = []
        try:
            files_to_delete, infodict = pp.run(infodict)
        except PostProcessingError as e:
            self.report_error(e.msg)
        if not files_to_delete:
            return files_to_move, infodict

        if self.params.get('keepvideo', False):
            # Keep the originals: register them without overriding an
            # entry that already exists
            for f in files_to_delete:
                files_to_move.setdefault(f, '')
        else:
            for old_filename in set(files_to_delete):
                self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
                try:
                    os.remove(encodeFilename(old_filename))
                except (IOError, OSError):
                    self.report_warning('Unable to remove downloaded original file')
                # A deleted file must no longer be scheduled for moving
                files_to_move.pop(old_filename, None)
        return files_to_move, infodict
    def pre_process(self, ie_info):
        """Run the 'beforedl' postprocessors over a copy of ie_info.

        The input dict is shallow-copied first; the info dict produced by
        the last postprocessor is returned.
        """
        current = dict(ie_info)
        for processor in self._pps['beforedl']:
            # Only the updated info dict matters here; the files-to-move
            # part of the result is discarded
            _, current = self.run_pp(processor, current)
        return current
    def post_process(self, filename, ie_info, files_to_move={}):
        """Run all the postprocessors on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
-        def run_pp(pp):
+        for pp in ie_info.get('__postprocessors', []) + self._pps['normal']:
-            files_to_delete = []
+            files_to_move, info = self.run_pp(pp, info, files_to_move)
-            infodict = info
+        info = self.run_pp(MoveFilesAfterDownloadPP(self, files_to_move), info, files_to_move)[1]
-            try:
+        for pp in self._pps['aftermove']:
-                files_to_delete, infodict = pp.run(infodict)
+            files_to_move, info = self.run_pp(pp, info, {})
            except PostProcessingError as e:
                self.report_error(e.msg)
            if not files_to_delete:
                return infodict
            if self.params.get('keepvideo', False):
                for f in files_to_delete:
                    files_to_move.setdefault(f, '')
            else:
                for old_filename in set(files_to_delete):
                    self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
                    try:
                        os.remove(encodeFilename(old_filename))
                    except (IOError, OSError):
                        self.report_warning('Unable to remove downloaded original file')
                    if old_filename in files_to_move:
                        del files_to_move[old_filename]
            return infodict
        for pp in ie_info.get('__postprocessors', []) + self._pps:
            info = run_pp(pp)
        info = run_pp(MoveFilesAfterDownloadPP(self, files_to_move))
        files_to_move = {}
        for pp in self._pps_end:
            info = run_pp(pp)
    def _make_archive_id(self, info_dict):
        video_id = info_dict.get('id')
--- a/youtube_dlc/init.py
+++ b/youtube_dlc/init.py
@ -45,6 +45,7 @@
 from .extractor import gen_extractors, list_extractors
 from .extractor.common import InfoExtractor
 from .extractor.adobepass import MSO_INFO
 from .postprocessor.metadatafromfield import MetadataFromFieldPP
 from .YoutubeDL import YoutubeDL
@ -249,16 +250,25 @@ def parse_retries(retries):
        if re.match(InfoExtractor.FormatSort.regex, f) is None:
            parser.error('invalid format sort string "%s" specified' % f)
    if opts.metafromfield is None:
        opts.metafromfield = []
    if opts.metafromtitle is not None:
        opts.metafromfield.append('title:%s' % opts.metafromtitle)
    for f in opts.metafromfield:
        if re.match(MetadataFromFieldPP.regex, f) is None:
            parser.error('invalid format string "%s" specified for --parse-metadata' % f)
    any_getting = opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration or opts.dumpjson or opts.dump_single_json
    any_printing = opts.print_json
    download_archive_fn = expand_path(opts.download_archive) if opts.download_archive is not None else opts.download_archive
    # PostProcessors
    postprocessors = []
-    if opts.metafromtitle:
+    if opts.metafromfield:
        postprocessors.append({
-            'key': 'MetadataFromTitle',
+            'key': 'MetadataFromField',
-            'titleformat': opts.metafromtitle
+            'formats': opts.metafromfield,
            'when': 'beforedl'
        })
    if opts.extractaudio:
        postprocessors.append({
@ -324,7 +334,7 @@ def parse_retries(retries):
        postprocessors.append({
            'key': 'ExecAfterDownload',
            'exec_cmd': opts.exec_cmd,
-            '_after_move': True
+            'when': 'aftermove'
        })
    _args_compat_warning = 'WARNING: %s given without specifying name. The arguments will be given to all %s\n'
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@ -1078,14 +1078,20 @@ def _dict_from_multiple_values_options_callback(
    postproc.add_option(
        '--metadata-from-title',
        metavar='FORMAT', dest='metafromtitle',
        help=optparse.SUPPRESS_HELP)
    postproc.add_option(
        '--parse-metadata',
        metavar='FIELD:FORMAT', dest='metafromfield', action='append',
        help=(
-            'Parse additional metadata like song title / artist from the video title. '
+            'Parse additional metadata like title/artist from other fields. '
-            'The format syntax is the same as --output. Regular expression with '
+            'Give field name to extract data from, and format of the field seperated by a ":". '
-            'named capture groups may also be used. '
+            'The format syntax is the same as --output. '
            'Regular expression with named capture groups may also be used. '
            'The parsed parameters replace existing values. '
-            'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
+            'This option can be used multiple times. '
            'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
            '"Coldplay - Paradise". '
-            'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'))
+            'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
    postproc.add_option(
        '--xattrs',
        action='store_true', dest='xattrs', default=False,
--- a/youtube_dlc/postprocessor/init.py
+++ b/youtube_dlc/postprocessor/init.py
@ -16,7 +16,8 @@
 )
 from .xattrpp import XAttrMetadataPP
 from .execafterdownload import ExecAfterDownloadPP
-from .metadatafromtitle import MetadataFromTitlePP
+from .metadatafromfield import MetadataFromFieldPP
 from .metadatafromfield import MetadataFromTitlePP
 from .movefilesafterdownload import MoveFilesAfterDownloadPP
 from .sponskrub import SponSkrubPP
@ -39,6 +40,7 @@ def get_postprocessor(key):
    'FFmpegSubtitlesConvertorPP',
    'FFmpegVideoConvertorPP',
    'FFmpegVideoRemuxerPP',
    'MetadataFromFieldPP',
    'MetadataFromTitlePP',
    'MoveFilesAfterDownloadPP',
    'SponSkrubPP',
--- a/youtube_dlc/postprocessor/metadatafromfield.py
+++ b/youtube_dlc/postprocessor/metadatafromfield.py
@ -0,0 +1,66 @@
 from __future__ import unicode_literals
 import re
 from .common import PostProcessor
 from ..compat import compat_str
 class MetadataFromFieldPP(PostProcessor):
    """Parse additional metadata out of existing info-dict fields.

    Every entry of ``formats`` is a ``FIELD:FORMAT`` string. FIELD names the
    info-dict key to read; FORMAT is either an output-template style string
    such as '%(artist)s - %(title)s' or a regular expression with named
    capture groups. Each named group that matches is written back into the
    info dict, replacing any existing value.
    """

    # Shape of a single FIELD:FORMAT specification
    regex = r'(?P<field>\w+):(?P<format>.+)$'

    def __init__(self, downloader, formats):
        """Pre-compute the search regex for every FIELD:FORMAT entry.

        downloader -- forwarded to PostProcessor.__init__
        formats    -- list or tuple of FIELD:FORMAT strings

        The asserts guard against programming errors (wrong container type,
        non-string entry, entry that does not match ``regex``).
        """
        PostProcessor.__init__(self, downloader)
        assert isinstance(formats, (list, tuple))
        self._data = []
        for f in formats:
            assert isinstance(f, compat_str)
            match = re.match(self.regex, f)
            assert match is not None
            self._data.append({
                'field': match.group('field'),
                'format': match.group('format'),
                'regex': self.format_to_regex(match.group('format'))})

    def format_to_regex(self, fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>[^\r\n]+)\ \-\ (?P<artist>[^\r\n]+)'

        A string without any %(name)s placeholder is returned unchanged, so
        it is used as a regular expression as-is.
        """
        if not re.search(r'%\(\w+\)s', fmt):
            return fmt
        lastpos = 0
        regex = ''
        # replace %(..)s with regex group and escape other string parts
        for match in re.finditer(r'%\((\w+)\)s', fmt):
            regex += re.escape(fmt[lastpos:match.start()])
            regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)'
            lastpos = match.end()
        if lastpos < len(fmt):
            regex += re.escape(fmt[lastpos:])
        return regex

    def run(self, info):
        """Apply every configured regex to its field and store the captures.

        Returns ([], info): no files are marked for deletion and ``info`` is
        updated in place with each named group that was captured.
        """
        for dictn in self._data:
            field, regex = dictn['field'], dictn['regex']
            # A key that exists but holds None cannot be searched
            # (re.search would raise TypeError), so it is handled exactly
            # like a missing key.
            value = info.get(field)
            if value is None:
                self.report_warning('Video does not have a %s' % field)
                continue
            self.write_debug('Searching for r"%s" in %s' % (regex, field))
            match = re.search(regex, value)
            if match is None:
                self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format']))
                continue
            for attribute, value in match.groupdict().items():
                info[attribute] = value
                self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA'))
        return [], info
 class MetadataFromTitlePP(MetadataFromFieldPP):  # for backward compatibility
    """Legacy wrapper that parses metadata from the ``title`` field only.

    ``titleformat`` is forwarded to MetadataFromFieldPP as the single
    specification ``title:<titleformat>``; the ``_titleformat`` and
    ``_titleregex`` attributes are set from the original format and the
    regex derived from it.
    """

    def __init__(self, downloader, titleformat):
        title_spec = 'title:%s' % titleformat
        super(MetadataFromTitlePP, self).__init__(downloader, [title_spec])
        self._titleregex = self._data[0]['regex']
        self._titleformat = titleformat
--- a/youtube_dlc/postprocessor/metadatafromtitle.py
+++ b/youtube_dlc/postprocessor/metadatafromtitle.py
@ -1,44 +0,0 @@
 from __future__ import unicode_literals
 import re
 from .common import PostProcessor
 class MetadataFromTitlePP(PostProcessor):
    """Parse metadata such as artist/title out of the video title."""

    def __init__(self, downloader, titleformat):
        """Store the format and the regex used to match titles.

        A format containing %(name)s placeholders is converted with
        format_to_regex; any other string is used as a regex unchanged.
        """
        super(MetadataFromTitlePP, self).__init__(downloader)
        self._titleformat = titleformat
        if re.search(r'%\(\w+\)s', titleformat):
            self._titleregex = self.format_to_regex(titleformat)
        else:
            self._titleregex = titleformat

    def format_to_regex(self, fmt):
        r"""
        Converts a string like
           '%(title)s - %(artist)s'
        to a regex like
           '(?P<title>.+)\ \-\ (?P<artist>.+)'
        """
        pieces = []
        cursor = 0
        # every %(..)s becomes a named group; the text in between is escaped
        for placeholder in re.finditer(r'%\((\w+)\)s', fmt):
            pieces.append(re.escape(fmt[cursor:placeholder.start()]))
            pieces.append('(?P<%s>.+)' % placeholder.group(1))
            cursor = placeholder.end()
        # trailing literal text after the last placeholder (may be empty)
        pieces.append(re.escape(fmt[cursor:]))
        return ''.join(pieces)

    def run(self, info):
        """Match the title against the stored regex and record the captures.

        Returns ([], info); ``info`` is updated in place with every named
        group when the title matches.
        """
        parsed = re.match(self._titleregex, info['title'])
        if parsed is not None:
            for name, captured in parsed.groupdict().items():
                info[name] = captured
                shown = 'NA' if captured is None else captured
                self.to_screen('parsed %s: %s' % (name, shown))
        else:
            self.to_screen('Could not interpret title of video as "%s"' % self._titleformat)
        return [], info