fixed sort time
@@ -41,11 +41,27 @@ def open_text_with_fallbacks(file_path: str):
     last_error = None
     for enc in encodings:
         try:
-            f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
-            # Read more than 1KB to catch encoding issues deeper in the file
-            # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(51200) # Read 50KB to test (increased from 20KB)
-            f.seek(0)
+            # First open in strict mode just for a quick sanity check on the first
+            # chunk of the file. We do *not* keep this handle because a later
+            # unexpected character could still trigger a UnicodeDecodeError when
+            # the CSV iterator continues reading. After the quick check we
+            # immediately close the handle and reopen with `errors="replace"`
+            # which guarantees that *any* undecodable bytes that appear further
+            # down will be replaced with the official Unicode replacement
+            # character (U+FFFD) instead of raising an exception and aborting the
+            # import. This keeps the import pipeline resilient while still
+            # letting us log the originally detected encoding for auditing.
+
+            test_f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
+            # Read 50 KB from the start of the file – enough to catch the vast
+            # majority of encoding problems without loading the entire file into
+            # memory.
+            _ = test_f.read(51200)
+            test_f.close()
+
+            # Re-open for the real CSV processing pass using a forgiving error
+            # strategy.
+            f = open(file_path, 'r', encoding=enc, errors='replace', newline='')
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
         except Exception as e:
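
The comment block above spells out the strategy: probe a 50 KB sample with strict decoding, then hand the CSV layer a handle opened with errors="replace" so stray bytes deeper in the file cannot abort the import. A minimal standalone sketch of that probe-then-reopen pattern, for trying it outside the importer (the candidate encodings tuple and the exception handling here are assumptions, not the module's actual values):

    from typing import Optional, TextIO, Tuple

    def probe_then_reopen(path: str,
                          encodings: Tuple[str, ...] = ("utf-8", "cp1252", "latin-1"),
                          ) -> Tuple[TextIO, str]:
        last_error: Optional[Exception] = None
        for enc in encodings:
            try:
                # Strict probe: any undecodable byte in the first 50 KB raises here.
                with open(path, "r", encoding=enc, errors="strict", newline="") as probe:
                    probe.read(51200)
                # Forgiving pass: later bad bytes become U+FFFD instead of aborting.
                return open(path, "r", encoding=enc, errors="replace", newline=""), enc
            except (UnicodeDecodeError, LookupError) as exc:
                last_error = exc
        raise last_error or UnicodeError("no candidate encoding could decode the file")
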
@@ -124,10 +140,25 @@ def parse_decimal(value: str) -> Optional[Decimal]:
 
 
 def clean_string(value: str) -> Optional[str]:
-    """Clean string value, return None if blank."""
-    if not value or not value.strip():
+    """Return a sanitized string or None if blank/only junk.
+
+    • Strips leading/trailing whitespace
+    • Removes Unicode replacement characters (� / U+FFFD) introduced by our
+      liberal decoder
+    • Removes ASCII control characters (0x00-0x1F, 0x7F)
+    """
+    if not value:
         return None
-    return value.strip()
+
+    # Remove replacement chars created by errors="replace" decoding
+    cleaned = value.replace("�", "").replace("\uFFFD", "")
+
+    # Strip out remaining control chars
+    cleaned = "".join(ch for ch in cleaned if ch >= " " and ch != "\x7f")
+
+    cleaned = cleaned.strip()
+
+    return cleaned or None
 
 
 # ============================================================================
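
A rough illustration of the sanitizer's contract, assuming the clean_string defined above is importable (ad-hoc examples, not tests from the repository):

    assert clean_string("  ACME Pension Plan \ufffd ") == "ACME Pension Plan"
    assert clean_string("\x00\ufffd \t") is None   # nothing but junk left -> None
    assert clean_string("") is None                # blank input -> None
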
@@ -1522,15 +1553,51 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
     f, encoding = open_text_with_fallbacks(file_path)
     reader = csv.DictReader(f)
 
-    batch = []
+    # Fetch once to avoid many round-trips
+    existing_ids: set[str] = {
+        pid for (pid,) in db.query(PlanInfo.plan_id).all()
+    }
+
+    batch: list[PlanInfo] = []
+    updating: list[PlanInfo] = []
 
     for row_num, row in enumerate(reader, start=2):
         result['total_rows'] += 1
 
         try:
             plan_id = clean_string(row.get('Plan_Id'))
+            # Skip rows where plan_id is missing or clearly corrupted (contains replacement character)
             if not plan_id:
+                # Record as warning so user can review later
+                result['errors'].append(
+                    f"Row {row_num}: skipped due to invalid plan_id '{plan_id}'"
+                )
                 continue
+
+            if plan_id in existing_ids:
+                # Update existing record in place (UPSERT)
+                rec: PlanInfo = db.query(PlanInfo).filter_by(plan_id=plan_id).first()
+                if rec:
+                    rec.plan_name = clean_string(row.get('Plan_Name'))
+                    rec.plan_type = clean_string(row.get('Plan_Type'))
+                    rec.empl_id_no = clean_string(row.get('Empl_Id_No'))
+                    rec.plan_no = clean_string(row.get('Plan_No'))
+                    rec.nra = clean_string(row.get('NRA'))
+                    rec.era = clean_string(row.get('ERA'))
+                    rec.errf = clean_string(row.get('ERRF'))
+                    rec.colas = clean_string(row.get('COLAS'))
+                    rec.divided_by = clean_string(row.get('Divided_By'))
+                    rec.drafted = clean_string(row.get('Drafted'))
+                    rec.benefit_c = clean_string(row.get('Benefit_C'))
+                    rec.qdro_c = clean_string(row.get('QDRO_C'))
+                    rec.rev = clean_string(row.get('^REV'))
+                    rec.pa = clean_string(row.get('^PA'))
+                    rec.form_name = clean_string(row.get('Form_Name'))
+                    rec.drafted_on = parse_date(row.get('Drafted_On'))
+                    rec.memo = clean_string(row.get('Memo'))
+                    updating.append(rec)
+                continue
+
             record = PlanInfo(
                 plan_id=plan_id,
                 plan_name=clean_string(row.get('Plan_Name')),
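
The update branch above still runs one SELECT per existing plan through db.query(PlanInfo).filter_by(plan_id=plan_id).first(). A hedged sketch of a variant that stretches the same "fetch once" idea to cache whole ORM objects instead of just their ids; this is not what the commit does, it trades memory for round-trips, and it assumes plan_id is unique:

    # Cache attached PlanInfo rows keyed by plan_id so the update branch becomes
    # a dict lookup rather than a per-row query.
    existing: dict[str, PlanInfo] = {
        rec.plan_id: rec for rec in db.query(PlanInfo).all()
    }

    rec = existing.get(plan_id)
    if rec is not None:
        rec.plan_name = clean_string(row.get('Plan_Name'))
        # ... remaining columns exactly as in the loop above ...
        updating.append(rec)
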
@@ -1552,6 +1619,9 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
                 memo=clean_string(row.get('Memo'))
             )
             batch.append(record)
+
+            # Track to prevent duplicates within same import
+            existing_ids.add(plan_id)
 
             if len(batch) >= BATCH_SIZE:
                 db.bulk_save_objects(batch)
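
The flush added above follows the usual size-triggered batching shape: accumulate rows, write them out once the list reaches BATCH_SIZE, then start a fresh batch. A generic sketch with a stand-in sink (BATCH_SIZE is defined elsewhere in the module; 500 is only an assumed value for illustration):

    BATCH_SIZE = 500  # assumed value; the importer defines its own constant

    def maybe_flush(batch: list, sink) -> None:
        """Write out and clear the batch once it reaches BATCH_SIZE."""
        if len(batch) >= BATCH_SIZE:
            sink(batch)   # in the importer: db.bulk_save_objects followed by db.commit
            batch.clear()
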
@@ -1562,6 +1632,10 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
         except Exception as e:
             result['errors'].append(f"Row {row_num}: {str(e)}")
 
+    # First flush updates if any
+    if updating:
+        db.commit()
+
     if batch:
         db.bulk_save_objects(batch)
         db.commit()
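
On the closing flush: the records collected in updating were loaded through the session, so they are already attached and their attribute changes are persisted by the bare db.commit(); only the brand-new PlanInfo objects in batch need db.bulk_save_objects(). An annotated restatement of those final lines (the comments are explanatory, not part of the commit):

    if updating:
        db.commit()                    # flushes in-place edits to attached records
    if batch:
        db.bulk_save_objects(batch)    # inserts the new rows in one pass
        db.commit()
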