Expand encoding fallback to handle more legacy CSV encodings

- Added windows-1252, cp1250, and iso-8859-1 to the encoding fallback list
- Enhanced error logging in open_text_with_fallbacks function
- Improved error messages to show all attempted encodings
- Added warning logs for each encoding attempt that fails

This should resolve 'charmap' codec errors and other decoding failures when reading legacy CSV files that were saved in Windows code pages or ISO-8859 encodings.
This commit is contained in:
HotSwapp
2025-10-07 22:25:34 -05:00
parent 09ef56fc1d
commit fdcff9fbb2

View File

@@ -65,11 +65,11 @@ def open_text_with_fallbacks(file_path: str):
"""
Open a text file trying multiple encodings commonly seen in legacy CSVs.
Attempts in order: utf-8, utf-8-sig, cp1252, latin-1.
Attempts in order: utf-8, utf-8-sig, cp1252, windows-1252, cp1250, iso-8859-1, latin-1.
Returns a tuple of (file_object, encoding_used). Caller is responsible to close file.
"""
encodings = ["utf-8", "utf-8-sig", "cp1252", "latin-1"]
encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"]
last_error = None
for enc in encodings:
try:
@@ -81,8 +81,13 @@ def open_text_with_fallbacks(file_path: str):
return f, enc
except Exception as e:
last_error = e
logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e))
continue
raise last_error if last_error else RuntimeError("Unable to open file with known encodings")
error_msg = f"Unable to open file '{file_path}' with any of the supported encodings: {', '.join(encodings)}"
if last_error:
error_msg += f". Last error: {str(last_error)}"
raise RuntimeError(error_msg)
# Configure Jinja2 templates