fixed sort time
@@ -41,11 +41,27 @@ def open_text_with_fallbacks(file_path: str):
     last_error = None
     for enc in encodings:
         try:
-            f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
-            # Read more than 1KB to catch encoding issues deeper in the file
-            # Many legacy CSVs have issues beyond the first few rows
-            _ = f.read(51200) # Read 50KB to test (increased from 20KB)
-            f.seek(0)
+            # First open in strict mode just for a quick sanity check on the first
+            # chunk of the file. We do *not* keep this handle because a later
+            # unexpected character could still trigger a UnicodeDecodeError when
+            # the CSV iterator continues reading. After the quick check we
+            # immediately close the handle and reopen with `errors="replace"`
+            # which guarantees that *any* undecodable bytes that appear further
+            # down will be replaced with the official Unicode replacement
+            # character (U+FFFD) instead of raising an exception and aborting the
+            # import. This keeps the import pipeline resilient while still
+            # letting us log the originally detected encoding for auditing.
+
+            test_f = open(file_path, 'r', encoding=enc, errors='strict', newline='')
+            # Read 50 KB from the start of the file – enough to catch the vast
+            # majority of encoding problems without loading the entire file into
+            # memory.
+            _ = test_f.read(51200)
+            test_f.close()
+
+            # Re-open for the real CSV processing pass using a forgiving error
+            # strategy.
+            f = open(file_path, 'r', encoding=enc, errors='replace', newline='')
             logger.info("csv_open_encoding_selected", file=file_path, encoding=enc)
             return f, enc
         except Exception as e:
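
The comment block above spells out the strategy: probe a 50 KB sample with strict decoding, then hand the CSV layer a handle opened with errors="replace" so stray bytes deeper in the file cannot abort the import. A minimal standalone sketch of that probe-then-reopen pattern, for trying it outside the importer (the candidate encodings tuple and the exception handling here are assumptions, not the module's actual values):

    from typing import Optional, TextIO, Tuple

    def probe_then_reopen(path: str,
                          encodings: Tuple[str, ...] = ("utf-8", "cp1252", "latin-1"),
                          ) -> Tuple[TextIO, str]:
        last_error: Optional[Exception] = None
        for enc in encodings:
            try:
                # Strict probe: any undecodable byte in the first 50 KB raises here.
                with open(path, "r", encoding=enc, errors="strict", newline="") as probe:
                    probe.read(51200)
                # Forgiving pass: later bad bytes become U+FFFD instead of aborting.
                return open(path, "r", encoding=enc, errors="replace", newline=""), enc
            except (UnicodeDecodeError, LookupError) as exc:
                last_error = exc
        raise last_error or UnicodeError("no candidate encoding could decode the file")
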
@@ -124,10 +140,25 @@ def parse_decimal(value: str) -> Optional[Decimal]:
 
 
 def clean_string(value: str) -> Optional[str]:
-    """Clean string value, return None if blank."""
-    if not value or not value.strip():
+    """Return a sanitized string or None if blank/only junk.
+
+    • Strips leading/trailing whitespace
+    • Removes Unicode replacement characters (� / U+FFFD) introduced by our
+      liberal decoder
+    • Removes ASCII control characters (0x00-0x1F, 0x7F)
+    """
+    if not value:
         return None
-    return value.strip()
+
+    # Remove replacement chars created by errors="replace" decoding
+    cleaned = value.replace("�", "").replace("\uFFFD", "")
+
+    # Strip out remaining control chars
+    cleaned = "".join(ch for ch in cleaned if ch >= " " and ch != "\x7f")
+
+    cleaned = cleaned.strip()
+
+    return cleaned or None
 
 
 # ============================================================================
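
A rough illustration of the sanitizer's contract, assuming the clean_string defined above is importable (ad-hoc examples, not tests from the repository):

    assert clean_string("  ACME Pension Plan \ufffd ") == "ACME Pension Plan"
    assert clean_string("\x00\ufffd \t") is None   # nothing but junk left -> None
    assert clean_string("") is None                # blank input -> None
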
@@ -1522,15 +1553,51 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
     f, encoding = open_text_with_fallbacks(file_path)
     reader = csv.DictReader(f)
 
-    batch = []
+    # Fetch once to avoid many round-trips
+    existing_ids: set[str] = {
+        pid for (pid,) in db.query(PlanInfo.plan_id).all()
+    }
+
+    batch: list[PlanInfo] = []
+    updating: list[PlanInfo] = []
 
     for row_num, row in enumerate(reader, start=2):
         result['total_rows'] += 1
 
         try:
             plan_id = clean_string(row.get('Plan_Id'))
+            # Skip rows where plan_id is missing or clearly corrupted (contains replacement character)
             if not plan_id:
+                # Record as warning so user can review later
+                result['errors'].append(
+                    f"Row {row_num}: skipped due to invalid plan_id '{plan_id}'"
+                )
                 continue
+
+            if plan_id in existing_ids:
+                # Update existing record in place (UPSERT)
+                rec: PlanInfo = db.query(PlanInfo).filter_by(plan_id=plan_id).first()
+                if rec:
+                    rec.plan_name = clean_string(row.get('Plan_Name'))
+                    rec.plan_type = clean_string(row.get('Plan_Type'))
+                    rec.empl_id_no = clean_string(row.get('Empl_Id_No'))
+                    rec.plan_no = clean_string(row.get('Plan_No'))
+                    rec.nra = clean_string(row.get('NRA'))
+                    rec.era = clean_string(row.get('ERA'))
+                    rec.errf = clean_string(row.get('ERRF'))
+                    rec.colas = clean_string(row.get('COLAS'))
+                    rec.divided_by = clean_string(row.get('Divided_By'))
+                    rec.drafted = clean_string(row.get('Drafted'))
+                    rec.benefit_c = clean_string(row.get('Benefit_C'))
+                    rec.qdro_c = clean_string(row.get('QDRO_C'))
+                    rec.rev = clean_string(row.get('^REV'))
+                    rec.pa = clean_string(row.get('^PA'))
+                    rec.form_name = clean_string(row.get('Form_Name'))
+                    rec.drafted_on = parse_date(row.get('Drafted_On'))
+                    rec.memo = clean_string(row.get('Memo'))
+                    updating.append(rec)
+                continue
+
             record = PlanInfo(
                 plan_id=plan_id,
                 plan_name=clean_string(row.get('Plan_Name')),
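
The update branch above still runs one SELECT per existing plan through db.query(PlanInfo).filter_by(plan_id=plan_id).first(). A hedged sketch of a variant that stretches the same "fetch once" idea to cache whole ORM objects instead of just their ids; this is not what the commit does, it trades memory for round-trips, and it assumes plan_id is unique:

    # Cache attached PlanInfo rows keyed by plan_id so the update branch becomes
    # a dict lookup rather than a per-row query.
    existing: dict[str, PlanInfo] = {
        rec.plan_id: rec for rec in db.query(PlanInfo).all()
    }

    rec = existing.get(plan_id)
    if rec is not None:
        rec.plan_name = clean_string(row.get('Plan_Name'))
        # ... remaining columns exactly as in the loop above ...
        updating.append(rec)
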
@@ -1552,6 +1619,9 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
                 memo=clean_string(row.get('Memo'))
             )
             batch.append(record)
+
+            # Track to prevent duplicates within same import
+            existing_ids.add(plan_id)
 
             if len(batch) >= BATCH_SIZE:
                 db.bulk_save_objects(batch)
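
The flush added above follows the usual size-triggered batching shape: accumulate rows, write them out once the list reaches BATCH_SIZE, then start a fresh batch. A generic sketch with a stand-in sink (BATCH_SIZE is defined elsewhere in the module; 500 is only an assumed value for illustration):

    BATCH_SIZE = 500  # assumed value; the importer defines its own constant

    def maybe_flush(batch: list, sink) -> None:
        """Write out and clear the batch once it reaches BATCH_SIZE."""
        if len(batch) >= BATCH_SIZE:
            sink(batch)   # in the importer: db.bulk_save_objects followed by db.commit
            batch.clear()
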
@@ -1562,6 +1632,10 @@ def import_planinfo(db: Session, file_path: str) -> Dict[str, Any]:
         except Exception as e:
             result['errors'].append(f"Row {row_num}: {str(e)}")
 
+    # First flush updates if any
+    if updating:
+        db.commit()
+
     if batch:
         db.bulk_save_objects(batch)
         db.commit()
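
On the closing flush: the records collected in updating were loaded through the session, so they are already attached and their attribute changes are persisted by the bare db.commit(); only the brand-new PlanInfo objects in batch need db.bulk_save_objects(). An annotated restatement of those final lines (the comments are explanatory, not part of the commit):

    if updating:
        db.commit()                    # flushes in-place edits to attached records
    if batch:
        db.bulk_save_objects(batch)    # inserts the new rows in one pass
        db.commit()
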