From 7c3df9c23f25e0a516ac2856962a1d13538ef8e6 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Sun, 18 Aug 2024 19:39:43 +0200 Subject: [PATCH] feat: report file name of file that chardet fails to read resolves #3519 Tested and it works now, reporting the file name: ``` codespell --write-changes -i3 -C 5 -H -f -e --count -s --builtin clear,rare,names Failed to decode file ./pep_sphinx_extensions/tests/pep_lint/test_pep_number.py using detected encoding Windows-1254. Traceback (most recent call last): File "/Users/corneliusromer/micromamba/envs/codespell/bin/codespell", line 8, in <module> sys.exit(_script_main()) ^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1103, in _script_main return main(*sys.argv[1:]) ^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1300, in main bad_count += parse_file( ^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 945, in parse_file lines, encoding = file_opener.open(filename) ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 232, in open return self.open_with_chardet(filename) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 246, in open_with_chardet lines = self.get_lines(f) ^^^^^^^^^^^^^^^^^ File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 303, in get_lines lines = f.readlines() ^^^^^^^^^^^^^ File "/Users/corneliusromer/micromamba/envs/codespell/lib/python3.12/encodings/cp1254.py", line 23, in decode return codecs.charmap_decode(input,self.errors,decoding_table)[0] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1349: character maps to <undefined> ``` --- codespell_lib/_codespell.py | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git 
a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index c7cc63bcfe..32bd61ad69 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -227,12 +227,12 @@ def init_chardet(self) -> None: self.encdetector = UniversalDetector() - def open(self, filename: str) -> Tuple[List[str], str]: + def open(self, filename: str) -> Tuple[List[str], str | None]: if self.use_chardet: return self.open_with_chardet(filename) return self.open_with_internal(filename) - def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: + def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]: self.encdetector.reset() with open(filename, "rb") as fb: for line in fb: @@ -241,26 +241,30 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]: break self.encdetector.close() encoding = self.encdetector.result["encoding"] - + if not encoding: + print( + f"WARNING: Chardet failed to detect encoding for file {filename}.", + file=sys.stderr, + ) try: - f = open(filename, encoding=encoding, newline="") + with open(filename, encoding=encoding, newline="") as f: + lines = self.get_lines(f) except UnicodeDecodeError: - print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr) + error_msg = ( + f"Failed to decode file {filename} using detected " + f"encoding {encoding}." + ) + print(error_msg, file=sys.stderr) raise except LookupError: - print( - f"ERROR: Don't know how to handle encoding {encoding}: {filename}", - file=sys.stderr, - ) + error_msg = f"Unknown encoding {encoding} detected for file {filename}." 
+ print(error_msg, file=sys.stderr) raise - else: - lines = self.get_lines(f) - f.close() - return lines, f.encoding + return lines, encoding def open_with_internal(self, filename: str) -> Tuple[List[str], str]: - encoding = None + encoding: str first_try = True for encoding in ("utf-8", "iso-8859-1"): if first_try: @@ -887,10 +891,10 @@ def parse_file( bad_count = 0 lines = None changed = False + encoding: str | None = "utf-8" if filename == "-": f = sys.stdin - encoding = "utf-8" lines = f.readlines() else: if options.check_filenames: