Final enhancement: Increase encoding detection read size to 50KB

- Increased read size from 20KB to 50KB in both main.py and import_legacy.py - This ensures problematic bytes at position 3738+ are caught during encoding detection - Provides maximum robustness for legacy CSV files with deeply embedded encoding issues - Maintains all previous improvements including fallback mechanisms
2025-10-13 21:44:17 -05:00
parent 05b9d38c61
commit 9b2ce0d28f
2 changed files with 6 additions and 6 deletions
--- a/app/import_legacy.py
+++ b/app/import_legacy.py
@@ -44,7 +44,7 @@ def open_text_with_fallbacks(file_path: str):
            f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
            # Read more than 1KB to catch encoding issues deeper in the file
            # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(20480)  # Read 20KB to test (increased from 10KB)
+            _ = f.read(51200)  # Read 50KB to test (increased from 20KB)
            f.seek(0)
            logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
            return f, enc
@@ -62,7 +62,7 @@ def open_text_with_fallbacks(file_path: str):
    try:
        # Try UTF-8 with error replacement first (most common case)
        f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
-        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
+        _ = f.read(51200)  # Read 50KB to catch encoding issues deeper in the file (increased from 20KB)
        f.seek(0)
        logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
        return f, "utf-8-replace"
@@ -72,7 +72,7 @@ def open_text_with_fallbacks(file_path: str):
    # Final fallback: use latin-1 with replace (handles any byte sequence)
    try:
        f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
-        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
+        _ = f.read(51200)  # Read 50KB to catch encoding issues deeper in the file (increased from 20KB)
        f.seek(0)
        logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
        return f, "latin-1-replace"
--- a/app/main.py
+++ b/app/main.py
@@ -79,7 +79,7 @@ def open_text_with_fallbacks(file_path: str):
            f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
            # Read more than 1KB to catch encoding issues deeper in the file
            # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(20480)  # Read 20KB to test (increased from 10KB)
+            _ = f.read(51200)  # Read 50KB to test (increased from 20KB)
            f.seek(0)
            logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
            return f, enc
@@ -97,7 +97,7 @@ def open_text_with_fallbacks(file_path: str):
    try:
        # Try UTF-8 with error replacement first (most common case)
        f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
-        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
+        _ = f.read(51200)  # Read 50KB to catch encoding issues deeper in the file (increased from 20KB)
        f.seek(0)
        logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
        return f, "utf-8-replace"
@@ -107,7 +107,7 @@ def open_text_with_fallbacks(file_path: str):
    # Final fallback: use latin-1 with replace (handles any byte sequence)
    try:
        f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
-        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
+        _ = f.read(51200)  # Read 50KB to catch encoding issues deeper in the file (increased from 20KB)
        f.seek(0)
        logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
        return f, "latin-1-replace"