377 lines
17 KiB
Python
377 lines
17 KiB
Python
"""
|
|
Generic CSV Importer - handles any CSV structure dynamically
|
|
"""
|
|
import csv
|
|
import io
|
|
import logging
|
|
import re
|
|
from typing import Dict, Any, List, Optional
|
|
from datetime import datetime
|
|
from sqlalchemy import text, Column, String, Integer, Text, MetaData, Table, create_engine, Date
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy.exc import SQLAlchemyError
|
|
|
|
from .base import BaseCSVImporter, ImportResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class GenericCSVImporter(BaseCSVImporter):
    """Generic importer that can handle any CSV structure by creating tables dynamically."""

    def __init__(self, db_session: Session, table_name: str, import_id: Optional[str] = None):
        """Initialize the importer.

        Args:
            db_session: Active SQLAlchemy session used for table creation and inserts.
            table_name: Desired target table name. It is lower-cased here and may
                later gain a timestamp suffix in create_dynamic_table() if it
                collides with an existing table.
            import_id: Optional identifier forwarded to BaseCSVImporter.
        """
        # Set the table name first, before calling super().__init__(),
        # because BaseCSVImporter.__init__ accesses self.table_name.
        self._table_name = table_name.lower()
        # SQLAlchemy Table object, populated by create_dynamic_table().
        self.dynamic_table = None
        # Raw header strings from the parsed CSV; drives field_mapping.
        self.csv_headers: List[str] = []
        super().__init__(db_session, import_id)
|
|
|
    @property
    def table_name(self) -> str:
        """Current target table name (may gain a timestamp suffix during import)."""
        return self._table_name
|
|
|
|
    @property
    def required_fields(self) -> List[str]:
        """No required fields for generic import -- every CSV column is optional."""
        return []
|
|
|
|
@property
|
|
def field_mapping(self) -> Dict[str, str]:
|
|
"""Dynamic mapping based on CSV headers"""
|
|
if self.csv_headers:
|
|
mapping = {}
|
|
for header in self.csv_headers:
|
|
safe_name = self._make_safe_name(header)
|
|
# Handle 'id' column renaming for conflict avoidance
|
|
if safe_name.lower() == 'id':
|
|
safe_name = 'csv_id'
|
|
mapping[header] = safe_name
|
|
return mapping
|
|
return {}
|
|
|
|
    def create_model_instance(self, row_data: Dict[str, Any]) -> Dict[str, Any]:
        """For generic import, just return the processed row data unchanged.

        There is no ORM model for a dynamically created table, so the raw
        dict is passed straight through to the insert path.
        """
        return row_data
|
|
|
|
    def create_dynamic_table(self, headers: List[str]) -> Table:
        """Create a table dynamically based on CSV headers.

        All CSV-derived columns are created as TEXT, plus an auto-increment
        integer ``id`` primary key. If a table with the sanitized name already
        exists, a timestamp-suffixed name is used instead of dropping it.
        Updates ``self.actual_table_name`` and ``self._table_name`` to the
        final name, and retries creation on SQLite "database is locked"
        errors with exponential backoff.

        Raises:
            Exception: re-raises any non-recoverable creation error.
        """
        try:
            # Create metadata
            metadata = MetaData()

            # Clean table name
            safe_table_name = self._make_safe_name(self.table_name)

            # Check if table already exists BEFORE creating the Table object
            from sqlalchemy import inspect
            inspector = inspect(self.db_session.bind)
            existing_tables = inspector.get_table_names()

            if safe_table_name in existing_tables:
                logger.info(f"Table '{safe_table_name}' already exists, using unique table name")
                # Instead of trying to drop, create a new table with timestamp suffix.
                # NOTE(review): second-resolution timestamps can still collide under
                # concurrent imports of the same table name -- confirm acceptable.
                import time
                timestamp = str(int(time.time()))
                safe_table_name = f"{safe_table_name}_{timestamp}"
                logger.info(f"Creating new table with unique name: '{safe_table_name}'")
            else:
                logger.info(f"Creating new table: '{safe_table_name}'")

            # Create columns dynamically: synthetic integer PK first, then one
            # TEXT column per non-blank CSV header.
            columns = [Column('id', Integer, primary_key=True, autoincrement=True)]

            for header in headers:
                if header and header.strip():
                    safe_column_name = self._make_safe_name(header.strip())
                    # Skip if this would create a duplicate 'id' column
                    if safe_column_name.lower() == 'id':
                        # Rename the CSV column to avoid conflict with auto-generated id
                        safe_column_name = 'csv_id'
                    columns.append(Column(safe_column_name, Text))

            # Create table with the final table name
            table = Table(safe_table_name, metadata, *columns)

            # Store the actual table name for use in data insertion
            self.actual_table_name = safe_table_name
            self._table_name = safe_table_name  # Update the stored table name to use the timestamped version
            logger.info(f"Using table name for data insertion: '{safe_table_name}'")

            # Create the table in the database with retry logic for locks
            max_retries = 3
            retry_delay = 1.0

            for attempt in range(max_retries):
                try:
                    # Use explicit transaction to avoid deadlocks.
                    # NOTE(review): Session.begin() raises InvalidRequestError if a
                    # transaction is already active -- confirm callers hand in a
                    # clean session here.
                    self.db_session.begin()
                    metadata.create_all(self.db_session.bind)
                    self.db_session.commit()

                    logger.info(f"Created dynamic table '{safe_table_name}' with {len(columns)} columns")
                    return table

                except Exception as create_error:
                    self.db_session.rollback()

                    # SQLite lock contention: back off and retry.
                    if "database is locked" in str(create_error).lower() and attempt < max_retries - 1:
                        import time
                        logger.warning(f"Database locked, retrying in {retry_delay}s (attempt {attempt + 1}/{max_retries})")
                        time.sleep(retry_delay)
                        retry_delay *= 2  # Exponential backoff
                        continue
                    elif "already present" in str(create_error).lower():
                        # Table was created by another process, reflect it
                        logger.info(f"Table '{safe_table_name}' created by another process, reflecting existing table")
                        try:
                            metadata.reflect(bind=self.db_session.bind, only=[safe_table_name])
                            return metadata.tables[safe_table_name]
                        except Exception:
                            # If reflection fails, re-raise original error
                            raise create_error
                    else:
                        # Re-raise if not a recoverable error
                        raise create_error

        except Exception as e:
            logger.error(f"Error creating dynamic table: {e}")
            raise
|
|
|
|
def _make_safe_name(self, name: str) -> str:
|
|
"""Make a database-safe name from any string"""
|
|
import re
|
|
# Remove special characters and replace with underscore
|
|
safe_name = re.sub(r'[^a-zA-Z0-9_]', '_', name)
|
|
# Remove multiple underscores
|
|
safe_name = re.sub(r'_+', '_', safe_name)
|
|
# Remove trailing underscore
|
|
safe_name = safe_name.strip('_')
|
|
# Ensure it's not empty
|
|
if not safe_name:
|
|
safe_name = 'unnamed_column'
|
|
# Special handling for purely numeric names or names starting with numbers
|
|
if safe_name.isdigit() or (safe_name and safe_name[0].isdigit()):
|
|
safe_name = f'col_{safe_name}'
|
|
# Ensure it starts with a letter or underscore (final check)
|
|
elif safe_name and not (safe_name[0].isalpha() or safe_name[0] == '_'):
|
|
safe_name = 'col_' + safe_name
|
|
return safe_name.lower()
|
|
|
|
def _parse_date_value(self, value: str) -> Optional[str]:
|
|
"""Try to parse a date value and return it in ISO format"""
|
|
if not value or value.strip() == '':
|
|
return None
|
|
|
|
value = value.strip()
|
|
|
|
# Common date formats to try
|
|
date_formats = [
|
|
'%m/%d/%Y', # MM/DD/YYYY
|
|
'%m/%d/%y', # MM/DD/YY
|
|
'%Y-%m-%d', # YYYY-MM-DD
|
|
'%d/%m/%Y', # DD/MM/YYYY
|
|
'%d-%m-%Y', # DD-MM-YYYY
|
|
'%Y/%m/%d', # YYYY/MM/DD
|
|
]
|
|
|
|
for fmt in date_formats:
|
|
try:
|
|
parsed_date = datetime.strptime(value, fmt)
|
|
return parsed_date.strftime('%Y-%m-%d') # Return in ISO format
|
|
except ValueError:
|
|
continue
|
|
|
|
# If no format matches, return the original value
|
|
return value
|
|
|
|
    def process_csv_content(self, csv_content: str, encoding: str = "utf-8") -> ImportResult:
        """Override the main processing method to handle dynamic table creation.

        Pipeline: strip blank lines -> detect headers by trying several CSV
        parsing strategies -> create the dynamic table -> re-parse and insert
        every row in one transaction using SQLite ``INSERT OR IGNORE``.

        Args:
            csv_content: Full decoded CSV text.
            encoding: Declared encoding; not used directly here since the
                content is already a str (kept for interface compatibility).

        Returns:
            The shared ``self.result`` ImportResult with counts and messages.
        """
        try:
            # Preprocess CSV content to handle common issues:
            # remove trailing empty lines and normalize line endings.
            lines = csv_content.strip().splitlines()
            # Remove empty lines that might cause parsing issues
            non_empty_lines = [line for line in lines if line.strip()]
            if not non_empty_lines:
                result = ImportResult()
                result.add_error("CSV file is empty or contains only empty lines")
                return result

            # Reconstruct CSV content with clean line endings.
            # NOTE(review): this also strips blank lines INSIDE quoted
            # multi-line fields -- confirm that is acceptable for this data.
            cleaned_csv_content = '\n'.join(non_empty_lines)

            # Parse CSV and get headers with flexible parsing,
            # handling various CSV format issues including embedded newlines.
            csv_file = io.StringIO(cleaned_csv_content)

            # Try with different CSV dialect configurations, most strict first.
            headers = None
            parsing_strategies = [
                # Strategy 1: Standard CSV parsing
                lambda f: csv.DictReader(f),
                # Strategy 2: Handle newlines in fields with strict quoting
                lambda f: csv.DictReader(f, skipinitialspace=True, quoting=csv.QUOTE_MINIMAL, strict=False),
                # Strategy 3: More flexible quoting
                lambda f: csv.DictReader(f, quoting=csv.QUOTE_ALL, strict=False),
                # Strategy 4: Excel dialect
                lambda f: csv.DictReader(f, dialect='excel'),
                # Strategy 5: Unix dialect
                lambda f: csv.DictReader(f, dialect='unix'),
                # Strategy 6: Very permissive - ignore malformed lines
                lambda f: csv.DictReader(f, quoting=csv.QUOTE_NONE, escapechar='\\', strict=False)
            ]

            # First pass: find a strategy that yields header names.
            for i, strategy in enumerate(parsing_strategies):
                try:
                    csv_file.seek(0)
                    csv_reader = strategy(csv_file)
                    headers = csv_reader.fieldnames
                    if headers:
                        logger.debug(f"CSV parsing successful with strategy {i+1}")
                        break
                except (csv.Error, UnicodeDecodeError) as e:
                    logger.debug(f"CSV parsing strategy {i+1} failed: {e}")
                    continue

            if not headers:
                result = ImportResult()
                result.add_error("No headers found in CSV file")
                return result

            # Store headers (blank header cells dropped) and create the table.
            self.csv_headers = [h.strip() for h in headers if h and h.strip()]

            if not self.csv_headers:
                result = ImportResult()
                result.add_error("No valid headers found in CSV file")
                return result

            self.dynamic_table = self.create_dynamic_table(self.csv_headers)

            # Reset reader and process rows with the same strategy search so
            # header and data parsing stay consistent.
            csv_file = io.StringIO(cleaned_csv_content)
            csv_reader = None

            # Use the same parsing strategies to ensure consistency
            for i, strategy in enumerate(parsing_strategies):
                try:
                    csv_file.seek(0)
                    csv_reader = strategy(csv_file)
                    # Test that it works by trying to read headers
                    test_headers = csv_reader.fieldnames
                    if test_headers:
                        logger.debug(f"Data parsing using strategy {i+1}")
                        break
                except (csv.Error, UnicodeDecodeError) as e:
                    logger.debug(f"Data parsing strategy {i+1} failed: {e}")
                    continue

            if not csv_reader:
                result = ImportResult()
                result.add_error("Unable to parse CSV file with any available strategy")
                return result

            imported_count = 0
            error_count = 0
            total_count = 0

            # Check if file has any data rows (header-only files succeed with
            # zero imports and a warning).
            rows = list(csv_reader)
            if not rows:
                logger.info(f"CSV file for table '{self.table_name}' contains headers only, no data rows to import")
                self.result.success = True
                self.result.total_rows = 0
                self.result.imported_rows = 0
                self.result.error_rows = 0
                self.result.add_warning("File contains headers only, no data rows found")
                return self.result

            # Process all rows in a single transaction.
            # NOTE(review): Session.begin() raises if a transaction is already
            # active on this session -- confirm callers hand in a clean session.
            try:
                self.db_session.begin()

                # start=2 so reported row numbers match the file (row 1 = header).
                for row_num, row in enumerate(rows, start=2):
                    total_count += 1

                    try:
                        # Prepare row data keyed by sanitized column name.
                        row_data = {}
                        for header in self.csv_headers:
                            safe_column_name = self._make_safe_name(header)
                            # Handle 'id' column renaming for conflict avoidance
                            if safe_column_name.lower() == 'id':
                                safe_column_name = 'csv_id'
                            value = row.get(header, '').strip() if row.get(header) else None
                            # Convert empty strings to None for better database handling
                            if value == '':
                                value = None
                            elif value and ('date' in header.lower() or 'time' in header.lower()):
                                # Try to parse date values for better format consistency
                                value = self._parse_date_value(value)
                            row_data[safe_column_name] = value

                        # Insert into database with conflict resolution:
                        # INSERT OR IGNORE handles potential duplicates gracefully.
                        # Use the actual table name (which may have a timestamp
                        # suffix) instead of dynamic_table.name.
                        table_name = getattr(self, 'actual_table_name', self.dynamic_table.name)
                        logger.debug(f"Inserting into table: '{table_name}' (original: '{self._table_name}', dynamic: '{self.dynamic_table.name}')")
                        columns = list(row_data.keys())
                        values = list(row_data.values())
                        placeholders = ', '.join([':param' + str(i) for i in range(len(values))])
                        column_names = ', '.join(columns)

                        # Create parameter dictionary for SQLAlchemy (values are
                        # bound parameters; identifiers were sanitized above, which
                        # is what keeps this f-string SQL injection-safe).
                        params = {f'param{i}': value for i, value in enumerate(values)}

                        ignore_sql = f"INSERT OR IGNORE INTO {table_name} ({column_names}) VALUES ({placeholders})"
                        result = self.db_session.execute(text(ignore_sql), params)

                        # Check if the row was actually inserted (rowcount > 0)
                        # or ignored as a duplicate (rowcount == 0).
                        if result.rowcount == 0:
                            logger.debug(f"Row {row_num}: Skipped duplicate record")
                        else:
                            logger.debug(f"Row {row_num}: Inserted successfully")

                        # NOTE(review): this counts ignored duplicates as
                        # "imported" too -- confirm that is the intended metric.
                        imported_count += 1

                    except Exception as e:
                        error_count += 1
                        error_msg = str(e)

                        # Provide more specific error messages for common
                        # SQLite constraint failures.
                        if "NOT NULL constraint failed" in error_msg:
                            self.result.add_error(f"Row {row_num}: Missing required value in column")
                        elif "UNIQUE constraint failed" in error_msg:
                            self.result.add_error(f"Row {row_num}: Duplicate value detected")
                        elif "no such column" in error_msg:
                            self.result.add_error(f"Row {row_num}: Column structure mismatch")
                        else:
                            self.result.add_error(f"Row {row_num}: {error_msg}")

                        logger.warning(f"Error importing row {row_num}: {e}")
                        continue

                # Commit all changes
                self.db_session.commit()

            except Exception as transaction_error:
                # Roll back the whole batch on any transaction-level failure.
                # NOTE(review): imported_count is not reset here, so the result
                # may report rows as imported after a rollback -- verify.
                self.db_session.rollback()
                logger.error(f"Transaction failed, rolled back: {transaction_error}")
                self.result.add_error(f"Transaction failed: {str(transaction_error)}")

            # Update result counters; success requires at least one insert.
            self.result.success = imported_count > 0
            self.result.total_rows = total_count
            self.result.imported_rows = imported_count
            self.result.error_rows = error_count

            if imported_count > 0:
                logger.info(f"Successfully imported {imported_count} rows into {self.table_name}")

            return self.result

        except Exception as e:
            # Top-level guard: convert any unexpected failure into an error result.
            logger.error(f"Error during CSV import: {e}")
            self.result.add_error(f"Import failed: {str(e)}")
            return self.result