Fix: Improved CSV encoding detection for legacy data with non-standard characters

- Changed encoding fallback order to prioritize iso-8859-1/latin-1 over cp1252
- Increased the encoding test read from 1 KB to 10 KB to catch issues deeper in files
- Added proper file handle cleanup on encoding failures
- Resolves 'charmap codec can't decode byte 0x9d' error in rolodex import
- Successfully tested with a rolodex file containing 52,100 rows
This commit is contained in:
HotSwapp
2025-10-12 19:19:25 -05:00
parent f4c5b9019b
commit 7958556613
16 changed files with 438 additions and 8 deletions

View File

@@ -36,18 +36,25 @@ def open_text_with_fallbacks(file_path: str):
Returns a tuple of (file_object, encoding_used).
"""
# First try strict mode with common encodings
encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"]
# Try latin-1/iso-8859-1 earlier as they are more forgiving and commonly used in legacy data
encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin-1", "cp1252", "windows-1252", "cp1250"]
last_error = None
for enc in encodings:
try:
f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
_ = f.read(1024)
# Read more than 1KB to catch encoding issues deeper in the file
# Many legacy CSVs have issues beyond the first few rows
_ = f.read(10240) # Read 10KB to test
f.seek(0)
logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
return f, enc
except Exception as e:
last_error = e
logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e))
try:
f.close()
except:
pass
continue
# If strict mode fails, try with error replacement for robustness