[utils] Rework decoding of Content-Encodings

* support nested encodings
* support optional `br` encoding, if brotli package is installed
* support optional `compress` encoding, if ncompress package is installed
* response `Content-Encoding` lists only unprocessed encodings, or is removed entirely
* response `Content-Length` is decoded length (usable for filesize metadata)
* use zlib for both deflate and gzip decompression
* some elements taken from yt-dlp: thx especially coletdjnz
This commit is contained in:
dirkf 2023-07-28 06:03:14 +01:00
parent 87e578c9b8
commit e7926ae9f4
3 changed files with 107 additions and 43 deletions

View File

@@ -461,33 +461,23 @@ class TestHTTP(unittest.TestCase):
sanitized_Request( sanitized_Request(
self._test_url('content-encoding'), self._test_url('content-encoding'),
headers={'ytdl-encoding': encoding})) headers={'ytdl-encoding': encoding}))
self.assertEqual(res.headers.get('Content-Encoding'), encoding) # decoded encodings are removed: only check for valid decompressed data
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>') self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
@unittest.skipUnless(brotli, 'brotli support is not installed') @unittest.skipUnless(brotli, 'brotli support is not installed')
@unittest.expectedFailure
def test_brotli(self): def test_brotli(self):
self.__test_compression('br') self.__test_compression('br')
@unittest.expectedFailure
def test_deflate(self): def test_deflate(self):
self.__test_compression('deflate') self.__test_compression('deflate')
@unittest.expectedFailure
def test_gzip(self): def test_gzip(self):
self.__test_compression('gzip') self.__test_compression('gzip')
@unittest.expectedFailure # not yet implemented
def test_multiple_encodings(self): def test_multiple_encodings(self):
# https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4 # https://www.rfc-editor.org/rfc/rfc9110.html#section-8.4
with FakeYDL() as ydl:
for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'): for pair in ('gzip,deflate', 'deflate, gzip', 'gzip, gzip', 'deflate, deflate'):
res = ydl.urlopen( self.__test_compression(pair)
sanitized_Request(
self._test_url('content-encoding'),
headers={'ytdl-encoding': pair}))
self.assertEqual(res.headers.get('Content-Encoding'), pair)
self.assertEqual(res.read(), b'<html><video src="/vid.mp4" /></html>')
def test_unsupported_encoding(self): def test_unsupported_encoding(self):
# it should return the raw content # it should return the raw content

View File

@@ -3200,6 +3200,18 @@ except AttributeError:
def compat_datetime_timedelta_total_seconds(td): def compat_datetime_timedelta_total_seconds(td):
return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6
# optional decompression packages
# PyPi brotli package implements 'br' Content-Encoding
try:
import brotli as compat_brotli
except ImportError:
compat_brotli = None
# PyPi ncompress package implements 'compress' Content-Encoding
try:
import ncompress as compat_ncompress
except ImportError:
compat_ncompress = None
legacy = [ legacy = [
'compat_HTMLParseError', 'compat_HTMLParseError',
@@ -3234,6 +3246,7 @@ __all__ = [
'compat_Struct', 'compat_Struct',
'compat_base64_b64decode', 'compat_base64_b64decode',
'compat_basestring', 'compat_basestring',
'compat_brotli',
'compat_casefold', 'compat_casefold',
'compat_chr', 'compat_chr',
'compat_collections_abc', 'compat_collections_abc',
@@ -3259,6 +3272,7 @@ __all__ = [
'compat_itertools_zip_longest', 'compat_itertools_zip_longest',
'compat_kwargs', 'compat_kwargs',
'compat_map', 'compat_map',
'compat_ncompress',
'compat_numeric_types', 'compat_numeric_types',
'compat_open', 'compat_open',
'compat_ord', 'compat_ord',

View File

@@ -15,7 +15,6 @@ import email.utils
import email.header import email.header
import errno import errno
import functools import functools
import gzip
import inspect import inspect
import io import io
import itertools import itertools
@@ -42,6 +41,7 @@ from .compat import (
compat_HTMLParseError, compat_HTMLParseError,
compat_HTMLParser, compat_HTMLParser,
compat_basestring, compat_basestring,
compat_brotli as brotli,
compat_casefold, compat_casefold,
compat_chr, compat_chr,
compat_collections_abc, compat_collections_abc,
@@ -55,6 +55,7 @@ from .compat import (
compat_http_client, compat_http_client,
compat_integer_types, compat_integer_types,
compat_kwargs, compat_kwargs,
compat_ncompress as ncompress,
compat_os_name, compat_os_name,
compat_re_Match, compat_re_Match,
compat_re_Pattern, compat_re_Pattern,
@@ -2638,11 +2639,44 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
req) req)
@staticmethod @staticmethod
def deflate(data): def deflate_gz(data):
try: try:
return zlib.decompress(data, -zlib.MAX_WBITS) # format:zlib,gzip + windowsize:32768
return data and zlib.decompress(data, 32 + zlib.MAX_WBITS)
except zlib.error: except zlib.error:
return zlib.decompress(data) # raw zlib * windowsize:32768 (RFC 9110: "non-conformant")
return zlib.decompress(data, -zlib.MAX_WBITS)
@staticmethod
def gzip(data):
from gzip import GzipFile
def _gzip(data):
with io.BytesIO(data) as data_buf:
gz = GzipFile(fileobj=data_buf, mode='rb')
return gz.read()
try:
return _gzip(data)
except IOError as original_ioerror:
# There may be junk at the end of the file
# See http://stackoverflow.com/q/4928560/35070 for details
for i in range(1, 1024):
try:
return _gzip(data[:-i])
except IOError:
continue
else:
raise original_ioerror
@staticmethod
def brotli(data):
return data and brotli.decompress(data)
@staticmethod
def compress(data):
return data and ncompress.decompress(data)
def http_request(self, req): def http_request(self, req):
# According to RFC 3986, URLs can not contain non-ASCII characters, however this is not # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
@@ -2679,33 +2713,59 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
def http_response(self, req, resp): def http_response(self, req, resp):
old_resp = resp old_resp = resp
# gzip
if resp.headers.get('Content-encoding', '') == 'gzip': # Content-Encoding header lists the encodings in order that they were applied [1].
content = resp.read() # To decompress, we simply do the reverse.
gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') # [1]: https://datatracker.ietf.org/doc/html/rfc9110#name-content-encoding
decoded_response = None
decoders = {
'gzip': self.deflate_gz,
'deflate': self.deflate_gz,
}
if brotli:
decoders['br'] = self.brotli
if ncompress:
decoders['compress'] = self.compress
if sys.platform.startswith('java'):
# Jython zlib implementation misses gzip
decoders['gzip'] = self.gzip
def encodings(hdrs):
# A header field that allows multiple values can have multiple instances [2].
# [2]: https://datatracker.ietf.org/doc/html/rfc9110#name-fields
for e in reversed(','.join(hdrs).split(',')):
if e:
yield e.strip()
encodings_left = []
try: try:
uncompressed = io.BytesIO(gz.read()) resp.headers.get_all
except IOError as original_ioerror: hdrs = resp.headers
# There may be junk at the end of the file except AttributeError:
# See http://stackoverflow.com/q/4928560/35070 for details # Py2 has no get_all() method: headers are rfc822.Message
for i in range(1, 1024): from email.message import Message
try: hdrs = Message()
gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') for k, v in resp.headers.items():
uncompressed = io.BytesIO(gz.read()) hdrs[k] = v
except IOError:
decoder, decoded_response = True, None
for encoding in encodings(hdrs.get_all('Content-Encoding', [])):
# "SHOULD consider" x-compress, x-gzip as compress, gzip
decoder = decoder and decoders.get(remove_start(encoding, 'x-'))
if not decoder:
encodings_left.insert(0, encoding)
continue continue
break decoded_response = decoder(decoded_response or resp.read())
else: if decoded_response is not None:
raise original_ioerror resp = compat_urllib_request.addinfourl(
resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) io.BytesIO(decoded_response), old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg resp.msg = old_resp.msg
del resp.headers['Content-encoding'] del resp.headers['Content-Length']
# deflate resp.headers['Content-Length'] = '%d' % len(decoded_response)
if resp.headers.get('Content-encoding', '') == 'deflate': del resp.headers['Content-Encoding']
gz = io.BytesIO(self.deflate(resp.read())) if encodings_left:
resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.headers['Content-Encoding'] = ', '.join(encodings_left)
resp.msg = old_resp.msg
del resp.headers['Content-encoding']
# Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
# https://github.com/ytdl-org/youtube-dl/issues/6457). # https://github.com/ytdl-org/youtube-dl/issues/6457).
if 300 <= resp.code < 400: if 300 <= resp.code < 400: