[RadioFrance] fix profile pagination detection

2024-11-23 19:33:59 +01:00 · 2024-10-15 14:44:48 +01:00 · 2024-10-15 14:44:48 +01:00 · e01fab7041
commit e01fab7041
parent 867bf965bb
1 changed files with 12 additions and 18 deletions
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@ -392,7 +392,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
    _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'

    _TESTS = [{
-        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
+        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet',
        'info_dict': {
            'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
            'display_id': 'thomas-pesquet',
@ -422,30 +422,24 @@ def _call_api(self, station, profile_id, cursor):
        webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')

        resp = dict()
-
-        # On profile pages, the data is stored in a javascript array in the final <script>
-        # Each episode is stored as
-        # a[0] = { id: ... }; a[1] = [ id: ... ]; on page 2->
-        # If a page had a thumbnail, the a variable contains image data,
-        # and episode data is stored in b[0]...
        resp['items'] = []
-        podcastindex = 0
-        nextmatch = True
-        while nextmatch:
-            nextmatch = self._search_json(r'\w+\[' + str(podcastindex) + r'\]\s*=\s*', webpage, profile_id,
-                                          profile_id, transform_source=js_to_json, fatal=False, default=None)
-            podcastindex += 1
-            if nextmatch is not None:
-                resp['items'].append(nextmatch)

-        # There is more than one pagination key in the final <script>
-        # We should use pick the pagination object which is within a documents object
+        # get episode data from page
        pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
                                     transform_source=js_to_json)
-        lastPage = traverse_obj(pagedata, ('pagination', 'lastPage'))
+
+        # get thepage data
+        pagekey = pagedata['pagination']
+        hasMorePages = False
+        lastPage = int(self._search_regex(pagekey+'\.lastPage=(\d+);', webpage, profile_id, '0'))
        hasMorePages = cursor < lastPage
        resp['next'] = cursor + 1 if hasMorePages else None

+        # get episode data, note, not all will be A/V, so filter for 'expression'
+        for item in pagedata['items']:
+            if item['model']=='Expression':
+                resp['items'].append(item)
+
        resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
                                             transform_source=js_to_json)
        # If the image data is stored separately rather than in the main content area