From 38b3adb46040139acfb5a36ba60d3816a74c91ca Mon Sep 17 00:00:00 2001 From: df Date: Mon, 25 Oct 2021 12:32:25 +0100 Subject: [PATCH 1/2] Shorten proposed file name on create if too long --- test/test_utils.py | 21 +++++++++++ youtube_dl/compat.py | 5 +++ youtube_dl/utils.py | 83 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 101 insertions(+), 8 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e1e..2258a356b05 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -63,6 +63,7 @@ pkcs1pad, read_batch_urls, sanitize_filename, + sanitize_open, sanitize_path, sanitize_url, expand_path, @@ -118,6 +119,16 @@ class TestUtil(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.tearDown() + + @classmethod + def tearDown(cls): + for tf in os.listdir('.'): + if os.path.splitext(tf)[1] == '.test': + os.remove(tf) + def test_timeconvert(self): self.assertTrue(timeconvert('') is None) self.assertTrue(timeconvert('bougrg') is None) @@ -231,6 +242,16 @@ def test_sanitize_path(self): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + def test_sanitize_open(self): + long_name = " I'm a lumberjack ".join(['I sleep all night and I work all day %d' % n for n in range(50)]) + result = sanitize_open( + '%s%s.test' % ('.\\' if sys.platform == 'win32' else './', long_name, ), + open_mode='w') + result[0].close() + self.assertEqual( + result[1][2:] if result[1].startswith('./') else result[1], + "I sleep all night and I work all day 0 I'm a lumberjack I sleep all night and I work[...] night and I work all day 48 I'm a lumberjack I sleep all night and I work all day 49.test") + def test_sanitize_url(self): self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e45c454b26..48697cf35e1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2997,6 +2997,10 @@ def resf(tpl, *args, **kwargs): def compat_ctypes_WINFUNCTYPE(*args, **kwargs): return ctypes.WINFUNCTYPE(*args, **kwargs) +try: + import reprlib as compat_reprlib +except ImportError: + import repr as compat_reprlib __all__ = [ 'compat_HTMLParseError', @@ -3032,6 +3036,7 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs): 'compat_parse_qs', 'compat_print', 'compat_realpath', + 'compat_reprlib', 'compat_setenv', 'compat_shlex_quote', 'compat_shlex_split', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e722eed58de..0680fc96640 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import ast import base64 import binascii import calendar @@ -53,6 +54,7 @@ compat_kwargs, compat_os_name, compat_parse_qs, + compat_reprlib, compat_shlex_quote, compat_str, compat_struct_pack, @@ -2036,6 +2038,43 @@ def clean_html(html): return html.strip() +def eviscerate(text, width, placeholder=' [...]'): + """Shorten the text to width by replacing text + from the middle of text with placeholder. + """ + + r = compat_reprlib.Repr() + r.ellipsis = '...' + r.maxstring = width - len(placeholder) + len(r.ellipsis) + len(r.repr('')) + r.maxother = r.maxstring + + t = r.repr(text) + # u'xx...xx'/'xx...xx' -> xx[...]xx + return ast.literal_eval(t).replace(r.ellipsis, placeholder) + + +def reduce_filename(path, reduction=0.5, min_length=20, ellipsis='[...]'): + """Try to reduce the filename by a specified reduction factor + + Arguments: + path -- the path name to reduce + reduction -- factor by which to reduce its filename component + ellipsis -- placeholder for removed text + + Returns path name with reduced filename, or None + """ + + fname = os.path.split(path) + fname = list(fname[:1] + os.path.splitext(fname[1])) + fname[1] = fname[1].replace(ellipsis, ' ') + flen = len(fname[1]) + if flen < min_length: + # give up + return None + fname[1] = eviscerate(fname[1], int(1 + reduction * flen), placeholder=ellipsis) + return os.path.join(fname[0], ''.join(fname[1:])) + + def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -2046,26 +2085,54 @@ def sanitize_open(filename, open_mode): It returns the tuple (stream, definitive_file_name). """ + def openfile(filename, open_mode): + stream = open(encodeFilename(filename), open_mode) + return (stream, filename) + try: if filename == '-': if sys.platform == 'win32': import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - stream = open(encodeFilename(filename), open_mode) - return (stream, filename) + return openfile(filename, open_mode) except (IOError, OSError) as err: if err.errno in (errno.EACCES,): raise - # In case of error, try to remove win32 forbidden chars - alt_filename = sanitize_path(filename) - if alt_filename == filename: + if 'w' not in open_mode or '+' in open_mode: + # only mung filename when creating the file raise + + org_err = err + + # In case of error, try to remove win32 forbidden chars + if err.errno in (errno.EINVAL, ): + alt_filename = sanitize_path(filename) + if alt_filename != filename: + try: + return openfile(alt_filename, open_mode) + except (IOError, OSError) as new_err: + err = new_err else: - # An exception here should be caught in the caller - stream = open(encodeFilename(alt_filename), open_mode) - return (stream, alt_filename) + alt_filename = filename + + # Windows: an over-long file name can be detected by the CreateFile() + # API, and then get EINVAL, or by the filesystem, and then perhaps + # ENAMETOOLONG + # POSIX: ENAMETOOLONG in general + while err.errno in (errno.ENAMETOOLONG, errno.EINVAL, ): + alt_filename = reduce_filename(alt_filename) + if not alt_filename: + break + try: + return openfile(alt_filename, open_mode) + except (IOError, OSError) as new_err: + err = new_err + + # Reduction didn't help; give up and report what initially went wrong + # This exception should be caught in the caller + raise org_err def timeconvert(timestr): From a0159ad89ac876a57eca83430dd091baf330f807 Mon Sep 17 00:00:00 2001 From: df Date: Mon, 25 Oct 2021 13:41:13 +0100 Subject: [PATCH 2/2] Avoid finding explicit unicode literals in comments, etc --- test/test_unicode_literals.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_unicode_literals.py b/test/test_unicode_literals.py index 6c1b7ec915c..bc543e8b717 100644 --- a/test/test_unicode_literals.py +++ b/test/test_unicode_literals.py @@ -51,7 +51,9 @@ def test_all_files(self): r'(?:(?:#.*?|\s*)\n)*from __future__ import (?:[a-z_]+,\s*)*unicode_literals', 'unicode_literals import missing in %s' % fn) - m = re.search(r'(?<=\s)u[\'"](?!\)|,|$)', code) + # match explicit unicode literal on a line not starting with #|'|" + # and preceded by a space or = + m = re.search(r'(?m)(?:^\s*?[^#\s"\'].*?)(?<=\s|=)u[\'"](?!\)|,|$)', code) if m is not None: self.assertTrue( m is None,