Enhance UTF-8 encoding fix for CSV imports
- Increased the encoding-detection read size from 10KB to 20KB in both main.py and import_legacy.py
- This helps ensure problematic bytes deeper in files (like position 3738) are caught during encoding detection
- Maintains backwards compatibility with properly encoded UTF-8 files
- Provides more robust handling of legacy CSV files with mixed encodings
This commit is contained in:
@@ -44,7 +44,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
|
||||
# Read more than 1KB to catch encoding issues deeper in the file
|
||||
# Many legacy CSVs have issues beyond the first few rows
|
||||
_ = f.read(10240) # Read 10KB to test
|
||||
_ = f.read(20480) # Read 20KB to test (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
|
||||
return f, enc
|
||||
@@ -62,7 +62,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
try:
|
||||
# Try UTF-8 with error replacement first (most common case)
|
||||
f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
|
||||
_ = f.read(10240) # Read 10KB to catch encoding issues deeper in the file
|
||||
_ = f.read(20480) # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
|
||||
return f, "utf-8-replace"
|
||||
@@ -72,7 +72,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
# Final fallback: use latin-1 with replace (handles any byte sequence)
|
||||
try:
|
||||
f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
|
||||
_ = f.read(10240) # Read 10KB to catch encoding issues deeper in the file
|
||||
_ = f.read(20480) # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
|
||||
return f, "latin-1-replace"
|
||||
|
||||
@@ -79,7 +79,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
|
||||
# Read more than 1KB to catch encoding issues deeper in the file
|
||||
# Many legacy CSVs have issues beyond the first few rows
|
||||
_ = f.read(10240) # Read 10KB to test
|
||||
_ = f.read(20480) # Read 20KB to test (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
|
||||
return f, enc
|
||||
@@ -97,7 +97,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
try:
|
||||
# Try UTF-8 with error replacement first (most common case)
|
||||
f = open(file_path, 'r', encoding='utf-8', errors='replace', newline='')
|
||||
_ = f.read(10240) # Read 10KB to catch encoding issues deeper in the file
|
||||
_ = f.read(20480) # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_with_replace", file=file_path, encoding="utf-8-replace")
|
||||
return f, "utf-8-replace"
|
||||
@@ -107,7 +107,7 @@ def open_text_with_fallbacks(file_path: str):
|
||||
# Final fallback: use latin-1 with replace (handles any byte sequence)
|
||||
try:
|
||||
f = open(file_path, 'r', encoding='latin-1', errors='replace', newline='')
|
||||
_ = f.read(10240) # Read 10KB to catch encoding issues deeper in the file
|
||||
_ = f.read(20480) # Read 20KB to catch encoding issues deeper in the file (increased from 10KB)
|
||||
f.seek(0)
|
||||
logger.info("csv_open_encoding_fallback", file=file_path, encoding="latin-1-replace")
|
||||
return f, "latin-1-replace"
|
||||
|
||||
Reference in New Issue
Block a user