fixed sort time

This commit is contained in:
HotSwapp
2025-10-14 07:56:13 -05:00
parent 9b2ce0d28f
commit 65e4995a5b
26 changed files with 99601 additions and 28 deletions

View File

@@ -41,11 +41,27 @@ def open_text_with_fallbacks(file_path: str):
last_error = None
for enc in encodings:
try:
f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
# Read more than 1KB to catch encoding issues deeper in the file
# Many legacy CSVs have issues beyond the first few rows
_ = f.read(51200) # Read 50KB to test (increased from 20KB)
f.seek(0)
# First open in strict mode just for a quick sanity check on the first
# chunk of the file. We do *not* keep this handle because a later
# unexpected character could still trigger a UnicodeDecodeError when
# the CSV iterator continues reading. After the quick check we
# immediately close the handle and reopen with `errors="replace"`
# which guarantees that *any* undecodable bytes that appear further
# down will be replaced with the official Unicode replacement
# character (U+FFFD) instead of raising an exception and aborting the
# import. This keeps the import pipeline resilient while still
# letting us log the originally detected encoding for auditing.
test_f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
# Read 50 KB from the start of the file enough to catch the vast
# majority of encoding problems without loading the entire file into
# memory.
_ = test_f.read(51200)
test_f.close()
# Re-open for the real CSV processing pass using a forgiving error
# strategy.
f = open(file_path, 'r', encoding=enc, errors='replace', newline='')
logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
return f, enc
except Exception as e:
@@ -124,10 +140,25 @@ def parse_decimal(value: str) -> Optional[Decimal]:
def clean_string(value: str) -> Optional[str]:
    """Return a sanitized string, or None if the input is blank/only junk.

    Sanitization steps, in order:
      • Removes Unicode replacement characters (U+FFFD) introduced by our
        liberal ``errors="replace"`` decoder
      • Removes ASCII control characters (0x00-0x1F and DEL 0x7F)
      • Strips leading/trailing whitespace

    Returns None when nothing printable remains.
    """
    if not value:
        return None
    # Drop replacement chars created by errors="replace" decoding.
    # (The original had a garbled no-op replace("", "") here where the
    # literal U+FFFD character was lost in transit — "\ufffd" covers it.)
    cleaned = value.replace("\ufffd", "")
    # Strip remaining control characters: everything below space (0x20)
    # plus DEL (0x7F). Note this also removes tabs/newlines, matching the
    # documented contract.
    cleaned = "".join(ch for ch in cleaned if ch >= " " and ch != "\x7f")
    cleaned = cleaned.strip()
    # Empty after cleaning means the cell held only junk.
    return cleaned or None
# ============================================================================
@@ -1522,15 +1553,51 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
f, encoding = open_text_with_fallbacks(file_path)
reader = csv.DictReader(f)
batch = []
# Fetch once to avoid many round-trips
existing_ids: set[str] = {
pid for (pid,) in db.query(PlanInfo.plan_id).all()
}
batch: list[PlanInfo] = []
updating: list[PlanInfo] = []
for row_num, row in enumerate(reader, start=2):
result['total_rows'] += 1
try:
plan_id = clean_string(row.get('Plan_Id'))
# Skip rows where plan_id is missing or clearly corrupted (contains replacement character)
if not plan_id:
# Record as warning so user can review later
result['errors'].append(
f"Row {row_num}: skipped due to invalid plan_id '{plan_id}'"
)
continue
if plan_id in existing_ids:
# Update existing record in place (UPSERT)
rec: PlanInfo = db.query(PlanInfo).filter_by(plan_id=plan_id).first()
if rec:
rec.plan_name = clean_string(row.get('Plan_Name'))
rec.plan_type = clean_string(row.get('Plan_Type'))
rec.empl_id_no = clean_string(row.get('Empl_Id_No'))
rec.plan_no = clean_string(row.get('Plan_No'))
rec.nra = clean_string(row.get('NRA'))
rec.era = clean_string(row.get('ERA'))
rec.errf = clean_string(row.get('ERRF'))
rec.colas = clean_string(row.get('COLAS'))
rec.divided_by = clean_string(row.get('Divided_By'))
rec.drafted = clean_string(row.get('Drafted'))
rec.benefit_c = clean_string(row.get('Benefit_C'))
rec.qdro_c = clean_string(row.get('QDRO_C'))
rec.rev = clean_string(row.get('^REV'))
rec.pa = clean_string(row.get('^PA'))
rec.form_name = clean_string(row.get('Form_Name'))
rec.drafted_on = parse_date(row.get('Drafted_On'))
rec.memo = clean_string(row.get('Memo'))
updating.append(rec)
continue
record = PlanInfo(
plan_id=plan_id,
plan_name=clean_string(row.get('Plan_Name')),
@@ -1552,6 +1619,9 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
memo=clean_string(row.get('Memo'))
)
batch.append(record)
# Track to prevent duplicates within same import
existing_ids.add(plan_id)
if len(batch) >= BATCH_SIZE:
db.bulk_save_objects(batch)
@@ -1562,6 +1632,10 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
except Exception as e:
result['errors'].append(f"Row {row_num}: {str(e)}")
# First flush updates if any
if updating:
db.commit()
if batch:
db.bulk_save_objects(batch)
db.commit()