diff --git a/app/import_legacy.py b/app/import_legacy.py
index 84d903c..3f0ccd9 100644
--- a/app/import_legacy.py
+++ b/app/import_legacy.py
@@ -44,7 +44,7 @@ def open_text_with_fallbacks(file_path: str):
             f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
             # Read more than 1KB to catch encoding issues deeper in the file
             # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(10240)  # Read 10KB to test
+            _ = f.read(20480)  # Read 20KB to test (increased from 10KB)
             f.seek(0)
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
@@ -62,7 +62,7 @@ def open_text_with_fallbacks(file_path: str):
     try:
         # Try UTF-8 with error replacement first (most common case)
         f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
-        _ = f.read(10240)  # Read 10KB to catch encoding issues deeper in the file
+        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
         f.seek(0)
         logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
         return f, "utf-8-replace"
@@ -72,7 +72,7 @@ def open_text_with_fallbacks(file_path: str):
     # Final fallback: use latin-1 with replace (handles any byte sequence)
     try:
         f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
-        _ = f.read(10240)  # Read 10KB to catch encoding issues deeper in the file
+        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
         f.seek(0)
         logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
         return f, "latin-1-replace"
diff --git a/app/main.py b/app/main.py
index d14ee51..1efbf79 100644
--- a/app/main.py
+++ b/app/main.py
@@ -79,7 +79,7 @@ def open_text_with_fallbacks(file_path: str):
             f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
             # Read more than 1KB to catch encoding issues deeper in the file
             # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(10240)  # Read 10KB to test
+            _ = f.read(20480)  # Read 20KB to test (increased from 10KB)
             f.seek(0)
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
@@ -97,7 +97,7 @@ def open_text_with_fallbacks(file_path: str):
     try:
         # Try UTF-8 with error replacement first (most common case)
         f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
-        _ = f.read(10240)  # Read 10KB to catch encoding issues deeper in the file
+        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
         f.seek(0)
         logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
         return f, "utf-8-replace"
@@ -107,7 +107,7 @@ def open_text_with_fallbacks(file_path: str):
     # Final fallback: use latin-1 with replace (handles any byte sequence)
     try:
         f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
-        _ = f.read(10240)  # Read 10KB to catch encoding issues deeper in the file
+        _ = f.read(20480)  # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
         f.seek(0)
         logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
         return f, "latin-1-replace"
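Context for reviewers: the diff touches only the size of the probe read (10 KiB to 20 KiB), but the surrounding `open_text_with_fallbacks` pattern, duplicated in `app/import_legacy.py` and `app/main.py`, is what makes that number matter. Below is a minimal, self-contained sketch of that pattern, not the project's actual function: the `PROBE_BYTES` constant name, the `CANDIDATE_ENCODINGS` list, the helper name, and the use of the standard `logging` module are assumptions; the diff only shows that each attempt reads a fixed-size probe, seeks back to offset 0, and returns the open handle plus an encoding label.

```python
import logging

logger = logging.getLogger(__name__)

# Assumed constant name; the real code inlines the literal 20480 (20 KiB).
PROBE_BYTES = 20480

# Assumed candidate list; the encodings tried strictly are not shown in the diff.
CANDIDATE_ENCODINGS = ("utf-8-sig", "utf-8", "cp1252")


def open_text_with_fallbacks_sketch(file_path: str):
    """Open a legacy CSV as text, probing PROBE_BYTES to validate the encoding.

    Returns (file_object, encoding_label) with the file rewound to offset 0.
    """
    # Pass 1: try each candidate encoding with strict error handling, so a bad
    # byte sequence anywhere in the first PROBE_BYTES raises instead of being hidden.
    for enc in CANDIDATE_ENCODINGS:
        f = open(file_path, "r", encoding=enc, errors="strict", newline="")
        try:
            _ = f.read(PROBE_BYTES)  # decode errors deeper than the first rows surface here
        except UnicodeDecodeError:
            f.close()
            continue
        f.seek(0)  # rewind so the caller sees the whole file
        logger.info("csv encoding selected: %s for %s", enc, file_path)
        return f, enc

    # Pass 2: UTF-8 with errors='replace' (undecodable bytes become U+FFFD).
    try:
        f = open(file_path, "r", encoding="utf-8", errors="replace", newline="")
        _ = f.read(PROBE_BYTES)
        f.seek(0)
        logger.info("csv opened with utf-8 + replace: %s", file_path)
        return f, "utf-8-replace"
    except OSError:
        pass

    # Pass 3: latin-1 with errors='replace' maps every byte to a code point,
    # so this final fallback cannot fail on encoding grounds.
    f = open(file_path, "r", encoding="latin-1", errors="replace", newline="")
    _ = f.read(PROBE_BYTES)
    f.seek(0)
    logger.info("csv opened with latin-1 + replace: %s", file_path)
    return f, "latin-1-replace"
```

The 20 KiB probe is a trade-off: it catches encoding problems that only appear a few thousand rows in, at the cost of one extra buffered read per attempted encoding before the file is rewound and handed to the CSV reader.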