Fix: Improved CSV encoding detection for legacy data with non-standard characters
- Changed encoding fallback order to prioritize iso-8859-1/latin-1 over cp1252
- Increased the encoding test read from 1KB to 10KB to catch issues deeper in files
- Added proper file handle cleanup on encoding failures
- Resolves "charmap codec can't decode byte 0x9d" error in rolodex import
- Tested successfully with a rolodex file containing 52,100 rows
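
Background sketch (not part of the commit): byte 0x9d, the one named in the fixed error, has no assigned character in cp1252, while iso-8859-1/latin-1 decode every possible byte value, which is why moving them earlier in the fallback order avoids the crash. The sample bytes below are invented:

    # 0x9d is undefined in cp1252 but maps to a control character in
    # iso-8859-1/latin-1, which decode all 256 byte values.
    sample = b"ACME CORP \x9d LEGACY EXPORT"  # made-up legacy CSV fragment

    for enc in ("utf-8", "cp1252", "iso-8859-1"):
        try:
            print(enc, "->", sample.decode(enc))
        except UnicodeDecodeError as exc:
            print(enc, "-> failed:", exc)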
2 app/__init__.py (new file)
@@ -0,0 +1,2 @@
+# Make app a package for reliable imports in tests and runtime
+
BIN app/__pycache__/__init__.cpython-313.pyc (new file)
Binary file not shown.
BIN app/__pycache__/import_legacy.cpython-313.pyc (new file)
Binary file not shown.
app/import_legacy.py
@@ -36,18 +36,25 @@ def open_text_with_fallbacks(file_path: str):

     Returns a tuple of (file_object, encoding_used).
     """
-    # First try strict mode with common encodings
-    encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"]
+    # Try latin-1/iso-8859-1 earlier as they are more forgiving and commonly used in legacy data
+    encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin-1", "cp1252", "windows-1252", "cp1250"]
     last_error = None
     for enc in encodings:
         try:
             f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
-            _ = f.read(1024)
+            # Read more than 1KB to catch encoding issues deeper in the file
+            # Many legacy CSVs have issues beyond the first few rows
+            _ = f.read(10240)  # Read 10KB to test
             f.seek(0)
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
         except Exception as e:
             last_error = e
             logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e))
             try:
                 f.close()
             except:
                 pass
             continue

     # If strict mode fails, try with error replacement for robustness
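
The trailing context line above mentions an error-replacement fallback that this hunk does not show. A plausible sketch of that pattern, assumed rather than taken from the commit:

    # Hypothetical last-resort opener: decode permissively, substituting
    # undecodable bytes with U+FFFD so the import proceeds instead of aborting.
    def open_text_permissive(file_path: str):
        f = open(file_path, "r", encoding="utf-8", errors="replace", newline="")
        return f, "utf-8+replace"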
143 app/main.py
@@ -67,23 +67,29 @@ def open_text_with_fallbacks(file_path: str):
     """
     Open a text file trying multiple encodings commonly seen in legacy CSVs.

-    Attempts in order: utf-8, utf-8-sig, cp1252, windows-1252, cp1250, iso-8859-1, latin-1.
+    Attempts in order: utf-8, utf-8-sig, iso-8859-1, latin-1, cp1252, windows-1252, cp1250.
+    Prioritizes latin-1/iso-8859-1 as they handle legacy data better than cp1252.

     Returns a tuple of (file_object, encoding_used). Caller is responsible to close file.
     """
-    encodings = ["utf-8", "utf-8-sig", "cp1252", "windows-1252", "cp1250", "iso-8859-1", "latin-1"]
+    encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin-1", "cp1252", "windows-1252", "cp1250"]
     last_error = None
     for enc in encodings:
         try:
             f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
-            # Try reading a tiny chunk to force decoding errors early
-            _ = f.read(1024)
+            # Read more than 1KB to catch encoding issues deeper in the file
+            # Many legacy CSVs have issues beyond the first few rows
+            _ = f.read(10240)  # Read 10KB to test
             f.seek(0)
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
         except Exception as e:
             last_error = e
             logger.warning("encoding_fallback_failed", file=file_path, encoding=enc, error=str(e))
             try:
                 f.close()
             except:
                 pass
             continue

     error_msg = f"Unable to open file '{file_path}' with any of the supported encodings: {', '.join(encodings)}"
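
Since the docstring leaves closing to the caller, a minimal usage sketch (the import path and CSV filename are assumptions):

    import csv

    from app.main import open_text_with_fallbacks  # assumed import path

    f, enc = open_text_with_fallbacks("ROLODEX.csv")  # hypothetical file
    try:
        for row in csv.DictReader(f):
            pass  # process row
    finally:
        f.close()  # caller-owned handle, per the docstring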
@@ -250,6 +256,19 @@ VALID_IMPORT_TYPES: List[str] = [
 ]


+# Centralized import order for auto-import after upload
+# Reference tables first, then core tables, then specialized tables
+IMPORT_ORDER: List[str] = [
+    # Reference tables
+    'trnstype', 'trnslkup', 'footers', 'filestat', 'employee', 'gruplkup', 'filetype', 'fvarlkup', 'rvarlkup',
+    # Core tables
+    'rolodex', 'phone', 'rolex_v', 'files', 'files_r', 'files_v', 'filenots', 'ledger', 'deposits', 'payments',
+    # Specialized tables
+    'planinfo', 'qdros', 'pensions', 'pension_marriage', 'pension_death', 'pension_schedule', 'pension_separate', 'pension_results',
+]
+ORDER_INDEX: Dict[str, int] = {t: i for i, t in enumerate(IMPORT_ORDER)}
+
+
 def get_import_type_from_filename(filename: str) -> str:
     """
     Determine import type based on filename pattern for legacy CSV files.
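
A self-contained sketch of how ORDER_INDEX drives the upload ordering used by the auto-import helper below, with an abbreviated order and invented filenames; unknown types fall to the end via the large default, mirroring the diff's sort key:

    IMPORT_ORDER = ["trnstype", "rolodex", "phone", "planinfo"]  # abbreviated
    ORDER_INDEX = {t: i for i, t in enumerate(IMPORT_ORDER)}

    uploads = [
        {"import_type": "planinfo", "filename": "PLANINFO.csv"},
        {"import_type": "rolodex", "filename": "ROLODEX.csv"},
        {"import_type": "trnstype", "filename": "TRNSTYPE.csv"},
    ]
    uploads.sort(key=lambda x: (ORDER_INDEX.get(x["import_type"], 1_000_000),
                                x["filename"]))
    print([u["import_type"] for u in uploads])
    # ['trnstype', 'rolodex', 'planinfo']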
@@ -1066,6 +1085,105 @@ def process_csv_import(db: Session, import_type: str, file_path: str) -> Dict[st
     return import_func(db, file_path)


+# ---------------------------------
+# Auto-import helper after upload
+# ---------------------------------
+def run_auto_import_for_upload(db: Session, uploaded_items: List[Dict[str, Any]]) -> Dict[str, Any]:
+    """
+    Run auto-import for the files just uploaded, following IMPORT_ORDER.
+
+    Stops after the first file that reports any row errors. Unknown types are
+    skipped. Logs each file via ImportLog.
+    """
+    # Filter out unknowns; keep metadata
+    known_items: List[Dict[str, Any]] = [
+        item for item in uploaded_items if item.get("import_type") in ORDER_INDEX
+    ]
+
+    # Sort by import order, then by filename for stability
+    known_items.sort(key=lambda x: (ORDER_INDEX.get(x.get("import_type"), 1_000_000), x.get("filename", "")))
+
+    files_summary: List[Dict[str, Any]] = []
+    stopped = False
+    stopped_on: Optional[str] = None
+
+    for item in known_items:
+        import_type = item["import_type"]
+        file_path = item["file_path"]
+        stored_filename = item["stored_filename"]
+
+        # Create import log
+        import_log = ImportLog(
+            import_type=import_type,
+            file_name=stored_filename,
+            file_path=file_path,
+            status="running",
+        )
+        db.add(import_log)
+        db.commit()
+
+        try:
+            result = process_csv_import(db, import_type, file_path)
+
+            import_log.status = "completed" if not result.get("errors") else "failed"
+            import_log.total_rows = result.get("total_rows", 0)
+            import_log.success_count = result.get("success", 0)
+            import_log.error_count = len(result.get("errors", []))
+            import_log.error_details = json.dumps(result.get("errors", []))
+            import_log.completed_at = datetime.now()
+            db.commit()
+
+            files_summary.append({
+                "filename": item.get("filename"),
+                "stored_filename": stored_filename,
+                "import_type": import_type,
+                "status": "success" if result.get("success", 0) > 0 and not result.get("errors") else "error",
+                "total_rows": result.get("total_rows", 0),
+                "success_count": result.get("success", 0),
+                "error_count": len(result.get("errors", [])),
+                "errors": (result.get("errors", [])[:10] if result.get("errors") else []),
+            })
+
+            if result.get("errors"):
+                stopped = True
+                stopped_on = stored_filename
+                break
+
+        except Exception as e:
+            import_log.status = "failed"
+            import_log.error_details = json.dumps([str(e)])
+            import_log.completed_at = datetime.now()
+            db.commit()
+
+            files_summary.append({
+                "filename": item.get("filename"),
+                "stored_filename": stored_filename,
+                "import_type": import_type,
+                "status": "error",
+                "total_rows": 0,
+                "success_count": 0,
+                "error_count": 1,
+                "errors": [str(e)][:10],
+            })
+
+            stopped = True
+            stopped_on = stored_filename
+            break
+
+    # Build skipped notes for unknowns
+    skipped_unknowns = [
+        {"filename": item.get("filename"), "stored_filename": item.get("stored_filename")}
+        for item in uploaded_items
+        if item.get("import_type") not in ORDER_INDEX
+    ]
+
+    return {
+        "files": files_summary,
+        "stopped": stopped,
+        "stopped_on": stopped_on,
+        "skipped_unknowns": skipped_unknowns,
+    }
+
+
 # ------------------------------
 # Ledger CRUD and helpers
 # ------------------------------
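
For reference, the shape of the summary dict this helper returns, illustrated with the 52,100-row rolodex file from the commit message (filenames and the clean result are otherwise illustrative):

    example_summary = {
        "files": [
            {
                "filename": "ROLODEX.csv",         # hypothetical upload name
                "stored_filename": "ROLODEX.csv",  # hypothetical stored name
                "import_type": "rolodex",
                "status": "success",
                "total_rows": 52100,
                "success_count": 52100,
                "error_count": 0,
                "errors": [],
            },
        ],
        "stopped": False,
        "stopped_on": None,
        "skipped_unknowns": [],
    }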
@@ -1635,6 +1753,7 @@ async def dashboard(
 async def admin_upload_files(
     request: Request,
     files: List[UploadFile] = File(...),
+    auto_import: bool = Form(True),
     db: Session = Depends(get_db)
 ):
     """
@@ -1700,13 +1819,29 @@ async def admin_upload_files(
         uploaded_count=len(results),
         error_count=len(errors),
         username=user.username,
+        auto_import=auto_import,
     )

+    auto_import_results: Dict[str, Any] | None = None
+    if auto_import and results:
+        try:
+            auto_import_results = run_auto_import_for_upload(db, results)
+            logger.info(
+                "admin_upload_auto_import",
+                processed_files=len(auto_import_results.get("files", [])),
+                stopped=auto_import_results.get("stopped", False),
+                stopped_on=auto_import_results.get("stopped_on"),
+                username=user.username,
+            )
+        except Exception as e:
+            logger.error("admin_upload_auto_import_failed", error=str(e), username=user.username)
+
     return templates.TemplateResponse("admin.html", {
         "request": request,
         "user": user,
         "upload_results": results,
         "upload_errors": errors,
+        "auto_import_results": auto_import_results,
         "show_upload_results": True
     })
@@ -526,3 +526,5 @@ def sync_all(db: Session, clear_existing: bool = False) -> Dict[str, Any]:

     return results
+
+
@@ -53,6 +53,14 @@
                 TRNSTYPE, TRNSLKUP, FOOTERS, FILESTAT, EMPLOYEE, GRUPLKUP, FILETYPE, and all related tables (*.csv)
             </div>
         </div>
+        <div class="form-check mb-3">
+            <input class="form-check-input" type="checkbox" id="auto_import" name="auto_import" checked>
+            <label class="form-check-label" for="auto_import">
+                <strong>Auto-import after upload (follows Import Order Guide)</strong>
+                <br>
+                <small class="text-muted">Will stop on the first file that reports any row errors.</small>
+            </label>
+        </div>
         <button type="submit" class="btn btn-primary">
             <i class="bi bi-cloud-upload me-2"></i>Upload Files
         </button>
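
A note on the checkbox wiring, sketched under standard HTML and FastAPI form behavior (the route below is hypothetical, not from this commit): browsers submit a checked box as auto_import=on and omit the field entirely when unchecked, so the handler's Form(True) default is what applies for a cleared box.

    # Minimal standalone sketch of the coercion, not code from this commit.
    from fastapi import FastAPI, Form

    demo_app = FastAPI()

    @demo_app.post("/demo-upload")  # hypothetical route
    async def demo(auto_import: bool = Form(True)):
        # "on" coerces to True; an absent field falls back to the default.
        return {"auto_import": auto_import}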
@@ -117,6 +125,80 @@
     </div>
     {% endif %}

+    <!-- Auto Import Results -->
+    {% if auto_import_results %}
+    <div class="card mb-4">
+        <div class="card-header bg-info text-white">
+            <h5 class="mb-0">
+                <i class="bi bi-lightning-charge me-2"></i>Auto Import Results
+            </h5>
+        </div>
+        <div class="card-body">
+            {% if auto_import_results.stopped %}
+            <div class="alert alert-warning">
+                <i class="bi bi-exclamation-triangle me-2"></i>
+                Stopped after {{ auto_import_results.files|length }} file(s) due to errors in <code>{{ auto_import_results.stopped_on }}</code>.
+            </div>
+            {% endif %}
+
+            <div class="table-responsive">
+                <table class="table table-sm table-bordered">
+                    <thead>
+                        <tr>
+                            <th>Filename</th>
+                            <th>Type</th>
+                            <th>Status</th>
+                            <th>Total</th>
+                            <th>Success</th>
+                            <th>Errors</th>
+                            <th>Error Details</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                        {% for item in auto_import_results.files %}
+                        <tr>
+                            <td>{{ item.filename }}</td>
+                            <td><span class="badge bg-secondary">{{ item.import_type }}</span></td>
+                            <td>
+                                {% if item.status == 'success' %}
+                                <span class="badge bg-success">Completed</span>
+                                {% else %}
+                                <span class="badge bg-danger">Failed</span>
+                                {% endif %}
+                            </td>
+                            <td>{{ item.total_rows }}</td>
+                            <td class="text-success">{{ item.success_count }}</td>
+                            <td class="text-danger">{{ item.error_count }}</td>
+                            <td>
+                                {% if item.errors %}
+                                <details>
+                                    <summary class="text-danger">View Errors ({{ item.errors|length }})</summary>
+                                    <ul class="mt-2 mb-0">
+                                        {% for err in item.errors %}
+                                        <li><small>{{ err }}</small></li>
+                                        {% endfor %}
+                                    </ul>
+                                </details>
+                                {% else %}
+                                <span class="text-muted">None</span>
+                                {% endif %}
+                            </td>
+                        </tr>
+                        {% endfor %}
+                    </tbody>
+                </table>
+            </div>
+
+            {% if auto_import_results.skipped_unknowns and auto_import_results.skipped_unknowns|length > 0 %}
+            <div class="alert alert-info mt-3">
+                <i class="bi bi-info-circle me-2"></i>
+                {{ auto_import_results.skipped_unknowns|length }} unknown file(s) were skipped. Map them in the Data Import section.
+            </div>
+            {% endif %}
+        </div>
+    </div>
+    {% endif %}
+
     <!-- Upload Errors -->
     {% if upload_errors %}
     <div class="card mb-4">