Skip to content

Commit

Permalink
feat: report file name of file that chardet fails to read
Browse files: browse the repository at this point in the history
resolves codespell-project#3519

Tested locally: when decoding fails, the name of the offending file is now reported ahead of the traceback:

```
codespell --write-changes -i3 -C 5 -H -f -e --count -s --builtin clear,rare,names
Failed to decode file ./pep_sphinx_extensions/tests/pep_lint/test_pep_number.py using detected encoding Windows-1254.
Traceback (most recent call last):
  File "/Users/corneliusromer/micromamba/envs/codespell/bin/codespell", line 8, in <module>
    sys.exit(_script_main())
             ^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1103, in _script_main
    return main(*sys.argv[1:])
           ^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 1300, in main
    bad_count += parse_file(
                 ^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 945, in parse_file
    lines, encoding = file_opener.open(filename)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 232, in open
    return self.open_with_chardet(filename)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 246, in open_with_chardet
    lines = self.get_lines(f)
            ^^^^^^^^^^^^^^^^^
  File "/Users/corneliusromer/code/codespell/codespell_lib/_codespell.py", line 303, in get_lines
    lines = f.readlines()
            ^^^^^^^^^^^^^
  File "/Users/corneliusromer/micromamba/envs/codespell/lib/python3.12/encodings/cp1254.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 1349: character maps to <undefined>
```
  • Loading branch information
corneliusroemer committed Aug 18, 2024
1 parent a2de580 commit 7c3df9c
Showing 1 changed file with 19 additions and 15 deletions.
34 changes: 19 additions & 15 deletions codespell_lib/_codespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,12 +227,12 @@ def init_chardet(self) -> None:

self.encdetector = UniversalDetector()

def open(self, filename: str) -> Tuple[List[str], str]:
def open(self, filename: str) -> Tuple[List[str], str | None]:
if self.use_chardet:
return self.open_with_chardet(filename)
return self.open_with_internal(filename)

def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
def open_with_chardet(self, filename: str) -> Tuple[List[str], str | None]:
self.encdetector.reset()
with open(filename, "rb") as fb:
for line in fb:
Expand All @@ -241,26 +241,30 @@ def open_with_chardet(self, filename: str) -> Tuple[List[str], str]:
break
self.encdetector.close()
encoding = self.encdetector.result["encoding"]

if not encoding:
print(
f"WARNING: Chardet failed to detect encoding for file {filename}.",
file=sys.stderr,
)
try:
f = open(filename, encoding=encoding, newline="")
with open(filename, encoding=encoding, newline="") as f:
lines = self.get_lines(f)
except UnicodeDecodeError:
print(f"ERROR: Could not detect encoding: {filename}", file=sys.stderr)
error_msg = (
f"Failed to decode file {filename} using detected "
f"encoding {encoding}."
)
print(error_msg, file=sys.stderr)
raise
except LookupError:
print(
f"ERROR: Don't know how to handle encoding {encoding}: {filename}",
file=sys.stderr,
)
error_msg = f"Unknown encoding {encoding} detected for file {filename}."
print(error_msg, file=sys.stderr)
raise
else:
lines = self.get_lines(f)
f.close()

return lines, f.encoding
return lines, encoding

def open_with_internal(self, filename: str) -> Tuple[List[str], str]:
encoding = None
encoding: str
first_try = True
for encoding in ("utf-8", "iso-8859-1"):
if first_try:
Expand Down Expand Up @@ -887,10 +891,10 @@ def parse_file(
bad_count = 0
lines = None
changed = False
encoding: str | None = "utf-8"

if filename == "-":
f = sys.stdin
encoding = "utf-8"
lines = f.readlines()
else:
if options.check_filenames:
Expand Down

0 comments on commit 7c3df9c

Please sign in to comment.