From 10fa2471fc11d6b63773b663ef0c431b0ce2bde4 Mon Sep 17 00:00:00 2001 From: FestplattenSchnitzel <45077355+FestplattenSchnitzel@users.noreply.github.com> Date: Thu, 5 May 2022 19:31:54 +0200 Subject: [PATCH] [VideocampusSachsen] Improve extractor (#3604) Authored by: FestplattenSchnitzel --- yt_dlp/extractor/extractors.py | 5 +- yt_dlp/extractor/videocampus_sachsen.py | 159 +++++++++++++++++------- 2 files changed, 117 insertions(+), 47 deletions(-) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 2c09a161ec..6f6862915a 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -1899,10 +1899,7 @@ from .vidbit import VidbitIE from .viddler import ViddlerIE from .videa import VideaIE -from .videocampus_sachsen import ( - VideocampusSachsenIE, - VideocampusSachsenEmbedIE, -) +from .videocampus_sachsen import VideocampusSachsenIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE from .videomore import ( diff --git a/yt_dlp/extractor/videocampus_sachsen.py b/yt_dlp/extractor/videocampus_sachsen.py index fe9e061ae2..906412f08d 100644 --- a/yt_dlp/extractor/videocampus_sachsen.py +++ b/yt_dlp/extractor/videocampus_sachsen.py @@ -1,11 +1,70 @@ +import re + from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ExtractorError class VideocampusSachsenIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?: + IE_NAME = 'Vimp' + _INSTANCES = ( + 'campus.demo.vimp.com', + 'corporate.demo.vimp.com', + 'dancehalldatabase.com', + 'educhannel.hs-gesundheit.de', + 'emedia.ls.haw-hamburg.de', + 'globale-evolution.net', + 'k210039.vimp.mivitec.net', + 'media.cmslegal.com', + 'media.hs-furtwangen.de', + 'media.hwr-berlin.de', + 'mediathek.dkfz.de', + 'mediathek.htw-berlin.de', + 'mediathek.polizei-bw.de', + 'medien.hs-merseburg.de', + 'mportal.europa-uni.de', + 'pacific.demo.vimp.com', + 'slctv.com', + 'tube.isbonline.cn', + 
'univideo.uni-kassel.de', + 'ursula2.genetics.emory.edu', + 'ursulablicklevideoarchiv.com', + 'v.agrarumweltpaedagogik.at', + 'video.eplay-tv.de', + 'video.fh-dortmund.de', + 'video.hs-offenburg.de', + 'video.hs-pforzheim.de', + 'video.hspv.nrw.de', + 'video.irtshdf.fr', + 'video.pareygo.de', + 'video.tu-freiberg.de', + 'videocampus.sachsen.de', + 'videoportal.uni-freiburg.de', + 'videoportal.vm.uni-freiburg.de', + 'videos.duoc.cl', + 'videos.uni-paderborn.de', + 'vimp-bemus.udk-berlin.de', + 'vimp.aekwl.de', + 'vimp.hs-mittweida.de', + 'vimp.oth-regensburg.de', + 'vimp.ph-heidelberg.de', + 'vimp.sma-events.com', + 'vimp.weka-fachmedien.de', + 'webtv.univ-montp3.fr', + 'www.b-tu.de/media', + 'www.bigcitytv.de', + 'www.cad-videos.de', + 'www.fh-bielefeld.de/medienportal', + 'www.orvovideo.com', + 'www.rwe.tv', + 'www.wenglor-media.com', + 'www2.univ-sba.dz', + ) + _VALID_URL = r'''(?x)https?://(?P<host>%s)/(?: m/(?P<tmp_id>[0-9a-f]+)| - (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32}) - )''' + (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})| + media/embed.*(?:\?|&)key=(?P<embed_id>[0-9a-f]{32}&?) 
+ )''' % ('|'.join(map(re.escape, _INSTANCES))) _TESTS = [ { @@ -13,6 +72,7 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': 'e6b9349905c1628631f175712250f2a1', 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7', + 'description': 'Konstruktiver Entwicklungsprozess Vorlesung 7', 'ext': 'mp4', }, }, @@ -21,6 +81,7 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': 'fc99c527e4205b121cb7c74433469262', 'title': 'Was ist selbstgesteuertes Lernen?', + 'description': 'md5:196aa3b0509a526db62f84679522a2f5', 'display_id': 'Was-ist-selbstgesteuertes-Lernen', 'ext': 'mp4', }, @@ -30,43 +91,32 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': '09d4ed029002eb1bdda610f1103dd54c', 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht', + 'description': 'md5:3d379ca3cc17b9da6784d7f58cca4d58', 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht', 'ext': 'mp4', }, }, - ] - - def _real_extract(self, url): - video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id') - webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or '' - - if not tmp_id: - video_id = self._html_search_regex( - r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&', - webpage, 'video_id') - - title = self._html_search_regex( - (r'

<h1>(?P<content>[^<]+)</h1>

', *self._meta_regex('title')), - webpage, 'title', group='content', fatal=False) - - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', - video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'display_id': display_id, - 'formats': formats, - 'subtitles': subtitles - } - - -class VideocampusSachsenEmbedIE(InfoExtractor): - _VALID_URL = r'https?://videocampus.sachsen.de/media/embed\?key=(?P[0-9a-f]+)' - - _TESTS = [ + { + 'url': 'https://www2.univ-sba.dz/video/Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122/0183356e41af7bfb83d7667b20d9b6a3', + 'info_dict': { + 'url': 'https://www2.univ-sba.dz/getMedium/0183356e41af7bfb83d7667b20d9b6a3.mp4', + 'id': '0183356e41af7bfb83d7667b20d9b6a3', + 'title': 'Présentation de la Faculté de droit et des sciences politiques - Journée portes ouvertes 2021/22', + 'description': 'md5:508958bd93e0ca002ac731d94182a54f', + 'display_id': 'Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122', + 'ext': 'mp4', + } + }, + { + 'url': 'https://vimp.weka-fachmedien.de/video/Preisverleihung-Produkte-des-Jahres-2022/c8816f1cc942c12b6cce57c835cffd7c', + 'info_dict': { + 'id': 'c8816f1cc942c12b6cce57c835cffd7c', + 'title': 'Preisverleihung »Produkte des Jahres 2022«', + 'description': 'md5:60c347568ca89aa25b772c4ea564ebd3', + 'display_id': 'Preisverleihung-Produkte-des-Jahres-2022', + 'ext': 'mp4', + }, + }, { 'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262', 'info_dict': { @@ -78,18 +128,41 @@ class VideocampusSachsenEmbedIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) + host, video_id, tmp_id, display_id, embed_id = self._match_valid_url(url).group( + 'host', 'id', 'tmp_id', 'display_id', 
'embed_id') + webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or '' - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r']*title="([^"<]+)"', webpage, 'title', fatal=False) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', - video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + if not video_id: + video_id = embed_id or self._html_search_regex( + rf'src="https?://{host}/media/embed.*(?:\?|&)key=([0-9a-f]+)&?', + webpage, 'video_id') + + if not (display_id or tmp_id): + # Title, description from embedded page's meta wouldn't be correct + title = self._html_search_regex(r']* title="([^"<]+)"', webpage, 'title', fatal=False) + description = None + else: + title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False) + description = self._html_search_meta( + ('og:description', 'twitter:description', 'description'), webpage, default=None) + + formats, subtitles = [], {} + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=True) + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500): + raise + + formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'}) self._sort_formats(formats) return { 'id': video_id, 'title': title, + 'description': description, + 'display_id': display_id, 'formats': formats, - 'subtitles': subtitles, + 'subtitles': subtitles }