Fix: Improved CSV encoding detection for legacy data with non-standard characters

- Changed encoding fallback order to prioritize iso-8859-1/latin-1 over cp1252
- Increased the encoding test read from 1 KB to 10 KB to catch issues deeper in files
- Added proper file-handle cleanup when an encoding attempt fails
- Resolves the "'charmap' codec can't decode byte 0x9d" error in the rolodex import
- Tested successfully against a rolodex file containing 52,100 rows
This commit is contained in:
@@ -36,18 +36,25 @@ def open_text_with_fallbacks(file_path: str):
|
||||
Returns a tuple of (file_object, encoding_used).
|
||||
"""
|
||||
# First try strict mode with common encodings
|
||||
encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"]
|
||||
# Try latin-1/iso-8859-1 earlier as they are more forgiving and commonly used in legacy data
|
||||
encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin-1", "cp1252", "windows-1252", "cp1250"]
|
||||
last_error = None
|
||||
for enc in encodings:
|
||||
try:
|
||||
f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
|
||||
_ = f.read(1024)
|
||||
# Read more than 1KB to catch encoding issues deeper in the file
|
||||
# Many legacy CSVs have issues beyond the first few rows
|
||||
_ = f.read(10240) # Read 10KB to test
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
|
||||
return f, enc
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e))
|
||||
try:
|
||||
f.close()
|
||||
except:
|
||||
pass
|
||||
continue
|
||||
|
||||
# If strict mode fails, try with error replacement for robustness
|
||||
|
||||
Reference in New Issue
Block a user