diff --git a/app/import_legacy.py b/app/import_legacy.py index ddec34a..4292e92 100644 --- a/app/import_legacy.py +++ b/app/import_legacy.py @@ -32,9 +32,10 @@ BATCH_SIZE = 500 def open_text_with_fallbacks(file_path: str): """ Open a text file trying multiple encodings commonly seen in legacy CSVs. - + Returns a tuple of (file_object, encoding_used). """ + # First try strict mode with common encodings encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"] last_error = None for enc in encodings: @@ -48,7 +49,29 @@ def open_text_with_fallbacks(file_path: str): last_error = e logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e)) continue - + + # If strict mode fails, try with error replacement for robustness + logger.warning("strict_encoding_failed", file=file_path, trying_with_replace=True) + try: + # Try UTF-8 with error replacement first (most common case) + f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='') + _ = f.read(1024) + f.seek(0) + logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace") + return f, "utf-8-replace" + except Exception as e: + logger.warning("utf8_replace_failed", file=file_path, error=str(e)) + + # Final fallback: use latin-1 with replace (handles any byte sequence) + try: + f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='') + _ = f.read(1024) + f.seek(0) + logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace") + return f, "latin-1-replace" + except Exception as e: + last_error = e + error_msg = f"Unable to open file '{file_path}' with any supported encodings" if last_error: error_msg += f". Last error: {str(last_error)}"