""" Generic CSV Importer - handles any CSV structure dynamically """ import csv import io import logging import re from typing import Dict, Any, List, Optional from datetime import datetime from sqlalchemy import text, Column, String, Integer, Text, MetaData, Table, create_engine, Date from sqlalchemy.orm import Session from sqlalchemy.exc import SQLAlchemyError from .base import BaseCSVImporter, ImportResult logger = logging.getLogger(__name__) class GenericCSVImporter(BaseCSVImporter): """Generic importer that can handle any CSV structure by creating tables dynamically""" def __init__(self, db_session: Session, table_name: str, import_id: str = None): # Set table name first, before calling super().__init__() # because BaseCSVImporter.__init__ calls self.table_name self._table_name = table_name.lower() self.dynamic_table = None self.csv_headers = [] super().__init__(db_session, import_id) @property def table_name(self) -> str: return self._table_name @property def required_fields(self) -> List[str]: """No required fields for generic import""" return [] @property def field_mapping(self) -> Dict[str, str]: """Dynamic mapping based on CSV headers""" if self.csv_headers: mapping = {} for header in self.csv_headers: safe_name = self._make_safe_name(header) # Handle 'id' column renaming for conflict avoidance if safe_name.lower() == 'id': safe_name = 'csv_id' mapping[header] = safe_name return mapping return {} def create_model_instance(self, row_data: Dict[str, Any]) -> Dict[str, Any]: """For generic import, just return the processed row data""" return row_data def create_dynamic_table(self, headers: List[str]) -> Table: """Create a table dynamically based on CSV headers""" try: # Create metadata metadata = MetaData() # Clean table name safe_table_name = self._make_safe_name(self.table_name) # Check if table already exists BEFORE creating the Table object from sqlalchemy import inspect inspector = inspect(self.db_session.bind) existing_tables = inspector.get_table_names() if safe_table_name in existing_tables: logger.info(f"Table '{safe_table_name}' already exists, using unique table name") # Instead of trying to drop, create a new table with timestamp suffix import time timestamp = str(int(time.time())) safe_table_name = f"{safe_table_name}_{timestamp}" logger.info(f"Creating new table with unique name: '{safe_table_name}'") else: logger.info(f"Creating new table: '{safe_table_name}'") # Create columns dynamically columns = [Column('id', Integer, primary_key=True, autoincrement=True)] for header in headers: if header and header.strip(): safe_column_name = self._make_safe_name(header.strip()) # Skip if this would create a duplicate 'id' column if safe_column_name.lower() == 'id': # Rename the CSV column to avoid conflict with auto-generated id safe_column_name = 'csv_id' columns.append(Column(safe_column_name, Text)) # Create table with the final table name table = Table(safe_table_name, metadata, *columns) # Store the actual table name for use in data insertion self.actual_table_name = safe_table_name self._table_name = safe_table_name # Update the stored table name to use the timestamped version logger.info(f"Using table name for data insertion: '{safe_table_name}'") # Create the table in the database with retry logic for locks max_retries = 3 retry_delay = 1.0 for attempt in range(max_retries): try: # Use explicit transaction to avoid deadlocks self.db_session.begin() metadata.create_all(self.db_session.bind) self.db_session.commit() logger.info(f"Created dynamic table '{safe_table_name}' 
with {len(columns)} columns") return table except Exception as create_error: self.db_session.rollback() if "database is locked" in str(create_error).lower() and attempt < max_retries - 1: import time logger.warning(f"Database locked, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})") time.sleep(retry_delay) retry_delay *= 2 # Exponential backoff continue elif "already present" in str(create_error).lower(): # Table was created by another process, reflect it logger.info(f"Table '{safe_table_name}' created by another process, reflecting existing table") try: metadata.reflect(bind=self.db_session.bind, only=[safe_table_name]) return metadata.tables[safe_table_name] except Exception: # If reflection fails, re-raise original error raise create_error else: # Re-raise if not a recoverable error raise create_error except Exception as e: logger.error(f"Error creating dynamic table: {e}") raise def _make_safe_name(self, name: str) -> str: """Make a database-safe name from any string""" import re # Remove special characters and replace with underscore safe_name = re.sub(r'[^a-zA-Z0-9_]', '_', name) # Remove multiple underscores safe_name = re.sub(r'_+', '_', safe_name) # Remove trailing underscore safe_name = safe_name.strip('_') # Ensure it's not empty if not safe_name: safe_name = 'unnamed_column' # Special handling for purely numeric names or names starting with numbers if safe_name.isdigit() or (safe_name and safe_name[0].isdigit()): safe_name = f'col_{safe_name}' # Ensure it starts with a letter or underscore (final check) elif safe_name and not (safe_name[0].isalpha() or safe_name[0] == '_'): safe_name = 'col_' + safe_name return safe_name.lower() def _parse_date_value(self, value: str) -> Optional[str]: """Try to parse a date value and return it in ISO format""" if not value or value.strip() == '': return None value = value.strip() # Common date formats to try date_formats = [ '%m/%d/%Y', # MM/DD/YYYY '%m/%d/%y', # MM/DD/YY '%Y-%m-%d', # YYYY-MM-DD '%d/%m/%Y', # DD/MM/YYYY '%d-%m-%Y', # DD-MM-YYYY '%Y/%m/%d', # YYYY/MM/DD ] for fmt in date_formats: try: parsed_date = datetime.strptime(value, fmt) return parsed_date.strftime('%Y-%m-%d') # Return in ISO format except ValueError: continue # If no format matches, return the original value return value def process_csv_content(self, csv_content: str, encoding: str = "utf-8") -> ImportResult: """Override the main processing method to handle dynamic table creation""" try: # Preprocess CSV content to handle common issues # Remove trailing empty lines and normalize line endings lines = csv_content.strip().splitlines() # Remove empty lines that might cause parsing issues non_empty_lines = [line for line in lines if line.strip()] if not non_empty_lines: result = ImportResult() result.add_error("CSV file is empty or contains only empty lines") return result # Reconstruct CSV content with clean line endings cleaned_csv_content = '\n'.join(non_empty_lines) # Parse CSV and get headers with flexible parsing # Handle various CSV format issues including embedded newlines csv_file = io.StringIO(cleaned_csv_content) # Try with different CSV dialect configurations headers = None parsing_strategies = [ # Strategy 1: Standard CSV parsing lambda f: csv.DictReader(f), # Strategy 2: Handle newlines in fields with strict quoting lambda f: csv.DictReader(f, skipinitialspace=True, quoting=csv.QUOTE_MINIMAL, strict=False), # Strategy 3: More flexible quoting lambda f: csv.DictReader(f, quoting=csv.QUOTE_ALL, strict=False), # Strategy 4: Excel dialect lambda 
    def process_csv_content(self, csv_content: str, encoding: str = "utf-8") -> ImportResult:
        """Override the main processing method to handle dynamic table creation."""
        try:
            # Preprocess the CSV content: normalize line endings and drop
            # empty lines that can cause parsing issues.
            lines = csv_content.strip().splitlines()
            non_empty_lines = [line for line in lines if line.strip()]
            if not non_empty_lines:
                result = ImportResult()
                result.add_error("CSV file is empty or contains only empty lines")
                return result
            cleaned_csv_content = '\n'.join(non_empty_lines)

            # Try progressively more permissive dialect configurations until
            # one yields headers; the winning strategy is reused for the data
            # pass so both passes stay consistent.
            parsing_strategies = [
                # Strategy 1: standard CSV parsing
                lambda f: csv.DictReader(f),
                # Strategy 2: minimal quoting, tolerant of leading whitespace
                lambda f: csv.DictReader(f, skipinitialspace=True, quoting=csv.QUOTE_MINIMAL, strict=False),
                # Strategy 3: quote-everything style
                lambda f: csv.DictReader(f, quoting=csv.QUOTE_ALL, strict=False),
                # Strategy 4: Excel dialect
                lambda f: csv.DictReader(f, dialect='excel'),
                # Strategy 5: Unix dialect
                lambda f: csv.DictReader(f, dialect='unix'),
                # Strategy 6: very permissive - no quoting, backslash escapes
                lambda f: csv.DictReader(f, quoting=csv.QUOTE_NONE, escapechar='\\', strict=False),
            ]

            csv_file = io.StringIO(cleaned_csv_content)
            headers = None
            successful_strategy = None
            for i, strategy in enumerate(parsing_strategies):
                try:
                    csv_file.seek(0)
                    csv_reader = strategy(csv_file)
                    headers = csv_reader.fieldnames
                    if headers:
                        successful_strategy = strategy
                        logger.debug(f"CSV parsing successful with strategy {i + 1}")
                        break
                except (csv.Error, UnicodeDecodeError) as e:
                    logger.debug(f"CSV parsing strategy {i + 1} failed: {e}")
                    continue

            if not headers:
                result = ImportResult()
                result.add_error("No headers found in CSV file")
                return result

            # Store the headers and create the dynamic table.
            self.csv_headers = [h.strip() for h in headers if h and h.strip()]
            if not self.csv_headers:
                result = ImportResult()
                result.add_error("No valid headers found in CSV file")
                return result
            self.dynamic_table = self.create_dynamic_table(self.csv_headers)

            # Re-read the data with the strategy that already succeeded on
            # this exact content, instead of probing all strategies again.
            csv_file = io.StringIO(cleaned_csv_content)
            csv_reader = successful_strategy(csv_file)

            imported_count = 0
            error_count = 0
            total_count = 0

            # A file with headers only (no data rows) is a successful, empty import.
            rows = list(csv_reader)
            if not rows:
                logger.info(
                    f"CSV file for table '{self.table_name}' contains headers only, "
                    "no data rows to import"
                )
                self.result.success = True
                self.result.total_rows = 0
                self.result.imported_rows = 0
                self.result.error_rows = 0
                self.result.add_warning("File contains headers only, no data rows found")
                return self.result
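            # Illustrative shape of the statement built for each row below
            # (SQLite "INSERT OR IGNORE" syntax; the table and column names
            # here are hypothetical):
            #
            #   INSERT OR IGNORE INTO orders_1712345678
            #       (order_id, order_date, csv_id) VALUES (:param0, :param1, :param2)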
            # Process all rows in a single transaction. The session autobegins
            # on first use, so no explicit begin() is needed (and calling it
            # with a transaction already active raises on SQLAlchemy 1.4+).
            try:
                for row_num, row in enumerate(rows, start=2):  # row 1 is the header
                    total_count += 1
                    try:
                        # Map each header to its sanitized column name and clean the value.
                        row_data = {}
                        for header in self.csv_headers:
                            safe_column_name = self._make_safe_name(header)
                            # Keep the 'id' -> 'csv_id' renaming consistent with the table.
                            if safe_column_name.lower() == 'id':
                                safe_column_name = 'csv_id'
                            raw_value = row.get(header)
                            value = raw_value.strip() if raw_value else None
                            # Convert empty strings to None for better database handling.
                            if value == '':
                                value = None
                            elif value and ('date' in header.lower() or 'time' in header.lower()):
                                # Normalize date-like values for format consistency.
                                value = self._parse_date_value(value)
                            row_data[safe_column_name] = value

                        # Use the actual table name (which may carry a timestamp
                        # suffix) rather than dynamic_table.name.
                        table_name = getattr(self, 'actual_table_name', self.dynamic_table.name)
                        logger.debug(
                            f"Inserting into table: '{table_name}' "
                            f"(dynamic: '{self.dynamic_table.name}')"
                        )

                        columns = list(row_data.keys())
                        values = list(row_data.values())
                        placeholders = ', '.join(f':param{i}' for i in range(len(values)))
                        column_names = ', '.join(columns)
                        # Parameter dictionary for SQLAlchemy's bound parameters.
                        params = {f'param{i}': value for i, value in enumerate(values)}

                        # INSERT OR IGNORE (SQLite syntax) skips duplicates
                        # gracefully. Interpolating identifiers is safe here
                        # because _make_safe_name already sanitized them.
                        ignore_sql = (
                            f"INSERT OR IGNORE INTO {table_name} "
                            f"({column_names}) VALUES ({placeholders})"
                        )
                        result = self.db_session.execute(text(ignore_sql), params)

                        # rowcount == 0 means the row was ignored as a duplicate.
                        if result.rowcount == 0:
                            logger.debug(f"Row {row_num}: Skipped duplicate record")
                        else:
                            logger.debug(f"Row {row_num}: Inserted successfully")
                            imported_count += 1
                    except Exception as e:
                        error_count += 1
                        error_msg = str(e)
                        # Translate common database errors into friendlier messages.
                        if "NOT NULL constraint failed" in error_msg:
                            self.result.add_error(f"Row {row_num}: Missing required value in column")
                        elif "UNIQUE constraint failed" in error_msg:
                            self.result.add_error(f"Row {row_num}: Duplicate value detected")
                        elif "no such column" in error_msg:
                            self.result.add_error(f"Row {row_num}: Column structure mismatch")
                        else:
                            self.result.add_error(f"Row {row_num}: {error_msg}")
                        logger.warning(f"Error importing row {row_num}: {e}")
                        continue

                # Commit all changes at once.
                self.db_session.commit()
            except Exception as transaction_error:
                self.db_session.rollback()
                logger.error(f"Transaction failed, rolled back: {transaction_error}")
                self.result.add_error(f"Transaction failed: {str(transaction_error)}")
                # The rollback discarded every insert in this batch.
                imported_count = 0

            # Update the result.
            self.result.success = imported_count > 0
            self.result.total_rows = total_count
            self.result.imported_rows = imported_count
            self.result.error_rows = error_count
            if imported_count > 0:
                logger.info(f"Successfully imported {imported_count} rows into {self.table_name}")
            return self.result
        except Exception as e:
            logger.error(f"Error during CSV import: {e}")
            self.result.add_error(f"Import failed: {str(e)}")
            return self.result
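
# Usage sketch (illustrative only; assumes a configured engine and the
# BaseCSVImporter/ImportResult interface imported from .base, with `csv_text`
# standing in for the uploaded file's contents):
#
#     from sqlalchemy import create_engine
#     from sqlalchemy.orm import Session
#
#     engine = create_engine("sqlite:///imports.db")
#     with Session(engine) as session:
#         importer = GenericCSVImporter(session, table_name="orders")
#         result = importer.process_csv_content(csv_text)
#         logger.info(f"imported={result.imported_rows}, errors={result.error_rows}")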