2012-03-25 03:07:37 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
2014-03-24 01:40:09 +01:00
import calendar
2014-04-04 23:00:51 +02:00
import codecs
2014-02-25 01:43:17 +01:00
import contextlib
2013-12-16 05:04:12 +01:00
import ctypes
2013-08-28 12:57:10 +02:00
import datetime
import email . utils
2013-05-13 09:20:08 +02:00
import errno
2012-03-25 03:07:37 +02:00
import gzip
2014-01-20 11:36:47 +01:00
import itertools
2012-11-28 00:09:17 +01:00
import io
2012-12-20 13:13:24 +01:00
import json
2012-03-25 03:07:37 +02:00
import locale
2013-11-25 03:12:26 +01:00
import math
2012-03-25 03:07:37 +02:00
import os
2013-10-12 13:49:27 +02:00
import pipes
2013-08-28 12:57:10 +02:00
import platform
2012-03-25 03:07:37 +02:00
import re
2013-11-24 06:37:14 +01:00
import ssl
2013-08-28 12:57:10 +02:00
import socket
2014-02-15 16:24:43 +01:00
import struct
2013-12-09 18:29:07 +01:00
import subprocess
2012-03-25 03:07:37 +02:00
import sys
2014-08-21 13:01:13 +02:00
import tempfile
2013-01-03 15:39:55 +01:00
import traceback
2014-03-10 17:31:32 +01:00
import xml . etree . ElementTree
2012-03-25 03:07:37 +02:00
import zlib
2014-11-02 11:23:40 +01:00
from . compat import (
compat_chr ,
compat_getenv ,
compat_html_entities ,
compat_html_parser ,
compat_parse_qs ,
compat_str ,
compat_urllib_error ,
compat_urllib_parse ,
compat_urllib_parse_urlparse ,
compat_urllib_request ,
compat_urlparse ,
)
2014-09-30 17:27:53 +02:00
2013-06-06 14:35:08 +02:00
# The type of a compiled regular expression is not clearly defined
# otherwise, so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
2012-12-30 18:22:36 +01:00
2012-03-25 03:07:37 +02:00
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually exists and works.
        u'TEST'.encode(pref)
    except Exception:
        # A broken locale must not be fatal, but KeyboardInterrupt and
        # SystemExit should still propagate (hence no bare except).
        pref = 'UTF-8'

    return pref
2012-03-25 03:07:37 +02:00
2012-12-20 13:13:24 +01:00
2014-08-21 13:01:13 +02:00
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file fn, replacing it atomically.

    The data is written to a temporary file created next to fn and then
    renamed onto fn. If anything goes wrong the temporary file is removed
    (best effort) and the exception is re-raised.
    """
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        mode_args = {'mode': 'wb'}
    else:
        mode_args = {'mode': 'w', 'encoding': 'utf-8'}

    tmp = tempfile.NamedTemporaryFile(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
        **mode_args)

    try:
        with tmp:
            json.dump(obj, tmp)
        os.rename(tmp.name, fn)
    except:
        # Catch-all on purpose: clean up on any failure, then re-raise.
        try:
            os.remove(tmp.name)
        except OSError:
            pass
        raise
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath[@key=val] below node """
        # key and val are interpolated into the expression, so restrict them
        # to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
2013-10-12 21:34:04 +02:00
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged. Raises KeyError when a prefix
    is missing from ns_map.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
2012-03-25 03:07:37 +02:00
2014-09-13 09:09:55 +02:00
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When nothing matches, returns None, or raises ExtractorError if fatal is
    set; name (default: the xpath itself) is used in the error message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
2012-11-28 00:06:28 +01:00
# backport bugfix: override the pattern the HTML parser module uses to find
# the end of a start tag.
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
2013-09-13 22:05:29 +02:00
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it was given."""

    # This was previously spelled "__init", which is just a name-mangled
    # ordinary method that nobody calls, so self.html was never initialised.
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Remember html and parse it completely in one go."""
        self.html = html
        self.feed(html)
        self.close()
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start position, end position]
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name, since the match
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping input up to the current line.

        Gives up (raises HTMLParseError) after more than 10 errors or once
        the wanted tag has been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        lineno = self.getpos()[0]
        remaining = self.html.split('\n')[lineno:]  # skip one line
        self.rawdata = '\n'.join(remaining)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once all tags of its own name are closed
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Any event following the opening tag marks where its content starts
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded positions, or None if the
        tag was not found or never closed."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
2013-02-01 17:29:50 +01:00
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_skipping_split_script(self, i):
        """Step over a literal "</scr'+'ipt>" instead of parsing it as an end tag."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_skipping_split_script
2012-04-11 00:22:51 +02:00
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the
    passed HTML document (None if there is no such tag)"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag whose given attribute has the
    given value in the passed HTML document (None if not found)"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return attr_parser.get_result()
2012-04-11 00:22:51 +02:00
2013-09-13 22:05:29 +02:00
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute. If several meta tags match, the last one wins.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute found so far (None if none)."""
        return self.result
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name
    attribute, or None if the document has no such tag.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error
        pass
    return meta_parser.get_result()
2012-04-11 00:22:51 +02:00
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Returns None when html is None, mirroring unescapeHTML, so that an
    optional value can be passed straight through.
    """
    if html is None:
        return None

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
2012-04-11 00:22:51 +02:00
2012-03-25 03:07:37 +02:00
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the characters that are forbidden on win32 replaced by '#' in the final
    path component. If that fails too (or nothing would change), the
    exception is raised, like the standard open() function would.

    The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last component is rewritten: the previous code passed a
        # generator to os.path.join (which does not join anything) and would
        # also have mangled the separators of the directory part.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, not the one that just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
2012-03-25 03:07:37 +02:00
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
2012-11-26 23:58:46 +01:00
2012-12-03 15:36:24 +01:00
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters
    (no whitespace, no shell-special characters, ASCII only).
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible: underscores are then neither collapsed nor stripped.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
2012-03-25 03:07:37 +02:00
def orderedSet(iterable):
    """Return a list of the items of iterable without duplicates, keeping
    the order of first occurrence (items need not be hashable)"""
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
2012-03-25 03:07:37 +02:00
2014-03-24 01:40:09 +01:00
2014-08-27 19:11:45 +02:00
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal
    (#65) or hexadecimal (#x41) character references are decoded; anything
    else is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the digits a-f as well ("#x2F" used to be read as
    # "#x2"), and the whole entity has to be a number.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
2012-03-25 03:07:37 +02:00
def unescapeHTML(s):
    """Replace every '&...;' entity in s by the character it stands for.

    None is passed through; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
2012-03-25 03:07:37 +02:00
2014-01-05 03:07:55 +01:00
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by the file APIs of this platform.

    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
2014-05-16 15:47:54 +02:00
def encodeArgument(s):
    """Encode the command line argument s for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
2013-02-21 17:09:39 +01:00
def decodeOption(optval):
    """Return the option value optval as a compat_str.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
2013-01-01 20:27:53 +01:00
2013-05-04 12:02:18 +02:00
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly one minute is '1:00' (not '60') and exactly one hour is
    '1:00:00' (not '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
2013-12-29 15:28:32 +01:00
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler appropriate for the running Python version.

    opts_no_check_certificate disables certificate verification on the
    Python versions where an SSL context is used. Additional keyword
    arguments are passed on to the handler's constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol auto-negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets that
        # authenticate servers. CLIENT_AUTH (used before) builds a server-side
        # context with CERT_NONE, i.e. certificates were never checked.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(review): re-enabling SSLv3 is a deliberate compatibility
        # choice of the original code, but SSLv3 is insecure.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be off before verify_mode may be CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
2013-05-04 12:19:02 +02:00
2013-01-01 20:27:53 +01:00
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prepended to the message; cause is appended to it.
        """
        # Errors raised while handling one of these exceptions are always
        # treated as expected.
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += u' (caused by %r)' % cause
        if not expected:
            msg += u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if
        no traceback was given."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return u''.join(formatted)
2013-01-01 20:27:53 +01:00
2013-10-23 14:38:03 +02:00
class RegexNotFoundError(ExtractorError):
    """Extraction error raised when a regular expression didn't match"""
2012-03-25 03:07:37 +02:00
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
2012-03-25 03:07:37 +02:00
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
2012-03-25 03:07:37 +02:00
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task. The message is available as the
    msg attribute.
    """

    def __init__(self, msg):
        self.msg = msg
2012-03-25 03:07:37 +02:00
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
2012-03-25 03:07:37 +02:00
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
2012-03-25 03:07:37 +02:00
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
2012-03-25 03:07:37 +02:00
2013-08-27 23:15:01 +02:00
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request that does not set them itself, and
    transparently decodes gzipped and deflated responses from web servers.

    To avoid compression for a particular request, the code creating the
    request only has to include the HTTP header "Youtubedl-No-Compression",
    which is removed before the real request is made. Likewise, the header
    "Youtubedl-user-agent" overrides the User-Agent and is then removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        versions whose addinfourl does not accept it as an argument."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        for header, value in std_headers.items():
            if header not in req.headers:
                req.add_header(header, value)

        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if 'Youtubedl-user-agent' in req.headers:
            # Drop the old entry first, then move the override into place
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp

        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
2013-04-27 15:14:20 +02:00
2014-02-06 11:29:46 +01:00
2014-05-17 19:04:02 +02:00
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds (which are dropped) and a 'Z' or a +HH:MM / -HHMM
    UTC offset. None is passed through.
    """
    if date_str is None:
        return None

    timezone = datetime.timedelta()
    m = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m:
        # Cut off the fraction/timezone suffix; strptime cannot parse it
        date_str = date_str[:-len(m.group(0))]
        if m.group('sign'):
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(utc_dt.timetuple())
2013-04-27 15:14:20 +02:00
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed with any of the
    known formats nor as an RFC 2822 date.
    """
    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        # "3rd"/"23rd" was missing from the ordinal suffixes
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # All formats are tried; if several match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
2013-07-12 21:52:59 +02:00
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last dot before the query string; it
    is only accepted if it is purely alphanumeric, otherwise (or when url is
    None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    guess = without_query.rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
2013-07-08 01:13:55 +02:00
2013-07-20 12:48:57 +02:00
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
2013-04-27 14:01:55 +02:00
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    # timedelta takes plural keyword arguments (days, weeks)
    return today + datetime.timedelta(**{unit + 's': amount})
2014-01-02 13:47:28 +01:00
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format; strings in
    any other format are returned unchanged"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
2013-04-27 14:01:55 +02:00
class DateRange(object):
    """Represents a time interval between two dates (both inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound leaves that side of the range open"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a date object or a string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
2013-08-28 12:57:10 +02:00
def platform_name():
    """Return platform.platform() as a compat_str.

    A bytes result is decoded with the preferred encoding first.
    """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
2013-08-28 18:22:28 +02:00
2014-04-07 22:48:13 +02:00
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # C runtime file descriptor -> identifier passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # In-memory streams (e.g. io.StringIO) define fileno() but raise
        # instead of returning a descriptor; treat them as virtual too.
        return False

    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        # Redirected output: let the caller write it the regular way
        return False

    def next_nonbmp_pos(s):
        """Index of the first character above U+FFFF, or len(s) if none"""
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write runs of BMP characters in chunks of at most 1024
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own as two UTF-16 code units.
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
2014-04-07 19:57:42 +02:00
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given, _windows_write_string is tried
    first. Otherwise the text is encoded (dropping unencodable characters)
    for byte streams and for streams exposing a .buffer, and written as-is
    to anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    on_windows = sys.platform == 'win32'
    if on_windows and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
2013-08-28 14:28:55 +02:00
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 and 1-character strings on
    # Python 2.
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(ch) for ch in bs]
2013-08-28 18:22:28 +02:00
2013-08-28 15:59:07 +02:00
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    python2 = isinstance(chr(0), bytes)
    if not python2:
        return bytes(xs)
    return ''.join([chr(value) for value in xs])
2013-10-02 08:41:03 +02:00
2013-10-06 04:27:09 +02:00
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f via LockFileEx; raise OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        flags = 0x2 if exclusive else 0x0
        handle = msvcrt.get_osfhandle(f.fileno())
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on the open file f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
2013-10-06 04:27:09 +02:00
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction. Entering the context takes the lock
    (shared for mode 'r', exclusive for 'a' and 'w'); leaving it unlocks and
    closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is unrelated to
            # IOError on Python 2; catch both so the handle never leaks.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
2013-10-12 13:49:27 +02:00
2014-09-30 17:27:53 +02:00
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2013-10-12 13:49:27 +02:00
def shell_quote(args):
    """Quote every argument for the shell and join them with spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    """
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(a)) for a in args])
2013-10-15 12:05:13 +02:00
2013-10-18 00:46:35 +02:00
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ended the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
2013-10-15 12:05:13 +02:00
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a URL-encoded fragment.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
2014-01-07 05:34:14 +01:00
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    serialized = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
2013-11-25 03:12:26 +01:00
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as u'1.50KiB'.

    bytes may be a number, a numeric string, or None (which gives u'N/A').
    Values below one byte are reported in B and values of 1024**9 and above
    in YiB, the largest available suffix.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: fractions of a byte give
        # a negative exponent, huge values one past the end.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
2013-12-06 13:36:36 +01:00
2013-12-09 18:29:07 +01:00
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field printed by `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or print nothing usable.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
2013-12-09 19:39:41 +01:00
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not recognized """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
2013-12-10 21:03:53 +01:00
2014-01-20 22:11:34 +01:00
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start a named (amp, lt, gt, apos, quot) or
    numeric character reference are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
2013-12-16 05:04:12 +01:00
def setproctitle(title):
    """Set the process name to title via prctl (option 15) from libc.so.6.

    Does nothing if libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates len + 1 bytes, so
    # the name handed to prctl is always NUL-terminated. A buffer sized
    # exactly len(title_bytes) would carry no terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
2013-12-16 13:56:13 +01:00
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
2013-12-17 04:13:36 +01:00
2014-08-22 18:40:26 +02:00
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks it."""
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. the empty
    # string, not s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
2013-12-17 04:13:36 +01:00
def url_basename(url):
    """Return the last segment of the URL's path component."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    segments = trimmed_path.split(u'/')
    return segments[-1]
2013-12-20 17:05:28 +01:00
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() reports HEAD"""

    def get_method(self):
        """Return the HTTP method name for this request."""
        return 'HEAD'
2013-12-25 15:18:40 +01:00
2014-07-21 12:02:44 +02:00
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default for None or the empty string.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). The converted value is multiplied
    by invscale and floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
2014-08-10 13:04:45 +02:00
2014-08-10 11:00:14 +02:00
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
2014-07-21 12:02:44 +02:00
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are stripped before conversion. None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
2013-12-26 13:49:44 +01:00
2014-07-21 12:02:44 +02:00
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
2014-03-28 23:06:34 +01:00
2013-12-26 13:49:44 +01:00
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min 5 sec' or '12.5s' into
    seconds.

    Returns None for None or for text that does not match. The result is an
    int, or a float when a fractional part is present.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    total = int(m.group('secs'))
    for group, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            total += int(m.group(group)) * seconds_per_unit
    # The 'ms' group is the fractional part including its leading dot
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
2014-01-03 12:52:27 +01:00
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's real extension
    ('a.mp4', 'temp' -> 'a.temp.mp4')."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
2014-01-07 06:23:41 +01:00
def check_executable(exe, args=[]):
    """ Run exe with args and report whether it could be started.

    Returns exe if the process was launched and False if launching raised
    OSError. args can be a list of arguments for a short output (like
    -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
2014-01-20 11:36:47 +01:00
2014-11-02 10:50:30 +01:00
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    exe is run with args and version_re is searched in the first line of
    its combined stdout/stderr. The regex's first group is returned, or
    unrecognized if nothing matches.
    """
    # In the default version_re the hyphen is listed first in the character
    # class so that it is a literal '-'; between '_' and 'a' it would form
    # a range and cut versions such as '1.2-3' short.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
2014-01-20 11:36:47 +01:00
class PagedList(object):
    """Base class for lists whose items are fetched one page at a time.

    Subclasses implement getslice(start=0, end=None).
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list that keeps requesting pages until a short page shows up"""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with that page's items
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list; end=None means up to
        the last item."""
        if end is not None and end <= start:
            # Empty range; the slicing arithmetic below would wrap around
            return []
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offsets inside this page, if start/end fall into it
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known beforehand"""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with that page's items
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list; end=None means up to
        the last item."""
        if end is not None and end <= start:
            # Empty range; a negative count below would slice from the end
            return []
        res = []
        start_page = start // self._pagesize
        # Never request pages beyond the advertised page count
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading items to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2014-02-09 17:56:10 +01:00
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (eight hex digits) in s by
    the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed)
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
2014-02-15 16:24:43 +01:00
2014-09-13 15:59:16 +02:00
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that quote() is told to leave unescaped
    keep_unquoted = "%/;:@&=+$,!~*'()?#[]"
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, keep_unquoted)
2014-09-13 15:59:16 +02:00
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are escaped; the other
    components are kept as parsed.
    """
    parts = compat_urllib_parse_urlparse(url)
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
2014-02-15 16:24:43 +01:00
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _struct_spec(spec):
        """Encode a text format spec to ASCII; pass anything else through."""
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format spec"""
        return struct.pack(_struct_spec(spec), *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format spec"""
        return struct.unpack(_struct_spec(spec), *args)
else:
    # struct copes with text specs here; use it directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
2014-02-25 01:43:17 +01:00
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded as UTF-8 if it is not text already, freed from a
    leading byte order mark and stripped. Empty lines and lines starting
    with '#', ';' or ']' are dropped.
    """
    # A UTF-8 BOM shows up as U+FEFF when the input was decoded as UTF-8,
    # or as the three characters below when its bytes were read as Latin-1.
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
2014-03-07 15:25:33 +01:00
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return the result as
    ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
2014-03-10 17:31:32 +01:00
2014-08-25 18:03:01 +02:00
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        """Fallback where Element.iter is missing: findall('.//*') on n"""
        return n.findall('.//*')
2014-03-10 17:31:32 +01:00
def parse_xml(s):
    """Parse the XML text s and return its root element.

    Doctype declarations are ignored. On Python 2, byte-string node texts
    are decoded from UTF-8.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is None or isinstance(node.text, compat_str):
                continue
            node.text = node.text.decode('utf-8')
    return tree
2014-03-18 14:27:42 +01:00
2014-03-21 00:59:51 +01:00
# Age limit in years associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit described by s as an int, or None.

    s is either a one- or two-digit age with an optional trailing '+'
    ('18+') or one of the US_RATINGS labels.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
2014-10-03 14:37:25 +02:00
2014-03-24 23:21:20 +01:00
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback( ... );' and return its payload.

    Text that does not have this shape is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
2014-04-21 07:12:02 +02:00
2014-08-22 02:33:29 +02:00
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it parses as JSON.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and a trailing comma before ']' is removed.
    """
    # Rewrites applied inside single-quoted strings; the backslash pair is
    # matched as one unit so that its second backslash cannot be taken for
    # the start of an escaped quote.
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # Literals and double-quoted strings are valid JSON already
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
2014-04-21 07:12:02 +02:00
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not listed. """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
2014-04-30 10:02:03 +02:00
DEFAULT_OUTTMPL = ' %(title)s - %(id)s . %(ext)s '
2014-05-16 12:03:59 +02:00
2014-09-15 15:10:24 +02:00
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Returns s unchanged if it has at most length characters (or is None).
    Otherwise s is cut so that the result, including the trailing '...',
    is exactly length characters long. If length is too small to hold the
    ellipsis, s is simply truncated. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # A negative slice bound would leave the result longer than length
        return s[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
2014-10-26 16:46:34 +01:00
def version_tuple(v):
    """Split a dotted version string into a list of ints
    ('1.2.3' -> [1, 2, 3])."""
    return list(map(int, v.split('.')))
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether the dotted version string is lower than limit.

    If version is empty or either string cannot be parsed as integers, the
    answer is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback