standardized the use of unescapeHTML; added clean_html()

2024-12-11 20:41:00 +01:00 · 2012-04-10 16:31:46 +02:00 · 2012-04-10 16:31:46 +02:00 · d6a9615347
commit d6a9615347
parent ceba827e9a
1 changed files with 15 additions and 17 deletions
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
 	return (u'&%s;' % entity)


+def clean_html(html):
+	"""Clean an HTML snippet into a readable string.
+
+	Literal newlines in the markup are turned into spaces, <br> tags become
+	newlines, every remaining tag is removed and HTML entities are passed
+	through htmlentity_transform().
+
+	html -- the HTML snippet to clean
+
+	Returns the cleaned-up text.
+	"""
+	# Newline vs <br />: source newlines become spaces, <br> becomes a line break
+	html = html.replace('\n', ' ')
+	html = re.sub(r'<\s*br\s*/?\s*>', '\n', html)
+	# Strip html tags
+	html = re.sub(r'<.*?>', '', html)
+	# Replace html entities. The entity name may not contain whitespace,
+	# '&' or ';', so a stray ampersand ("a & b &amp; c") cannot swallow
+	# the real entity that follows it.
+	html = re.sub(ur'(?u)&([^&;\s]+);', htmlentity_transform, html)
+	return html
+
+
 def sanitize_title(utitle):
 	"""Sanitizes a video title so it could be used as part of a filename."""
 	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -3343,8 +3355,6 @@ def report_config_download(self, showName):
 		self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3360,11 +3370,11 @@ def _real_extract(self, url):
 			return

 		descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
-		description = htmlParser.unescape(descMatch.group(1))
+		description = unescapeHTML(descMatch.group(1))
 		imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
-		imgUrl = htmlParser.unescape(imgMatch.group(1))
+		imgUrl = unescapeHTML(imgMatch.group(1))
 		playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
-		playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
+		playerUrl = unescapeHTML(playerUrlMatch.group(1))
 		configUrlMatch = re.search('config=(.*)$', playerUrl)
 		configUrl = urllib2.unquote(configUrlMatch.group(1))

@@ -3423,8 +3433,6 @@ def report_extraction(self, video_id):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3495,8 +3503,6 @@ def report_extraction(self, video_id):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3585,8 +3591,6 @@ def report_extraction(self, video_id):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3674,8 +3678,6 @@ def report_extraction(self, video_id):
 		self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

 	def _real_extract(self, url):
-		htmlParser = HTMLParser.HTMLParser()
-
 		mobj = re.match(self._VALID_URL, url)
 		if mobj is None:
 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3909,8 +3911,6 @@ def _real_extract(self, url):
 			except UnavailableVideoError, err:
 				self._downloader.trouble(u'\nERROR: unable to download video')
 		elif mobj.group('course'): # A course page
-			unescapeHTML = HTMLParser.HTMLParser().unescape
-
 			course = mobj.group('course')
 			info = {
 				'id': _simplify_title(course),
@@ -3947,8 +3947,6 @@ def _real_extract(self, url):
 				assert entry['type'] == 'reference'
 				self.extract(entry['url'])
 		else: # Root page
-			unescapeHTML = HTMLParser.HTMLParser().unescape
-
 			info = {
 				'id': 'Stanford OpenClassroom',
 				'type': 'playlist',