# delphi-database/app/services/async_file_operations.py

"""
Async file operations service for handling large files efficiently.
Provides streaming file operations, chunked processing, and progress tracking
to improve performance with large files and prevent memory exhaustion.
"""
import asyncio
import aiofiles
import os
import hashlib
import uuid
from pathlib import Path
from typing import AsyncGenerator, Callable, Optional, Tuple, Dict, Any
from fastapi import UploadFile, HTTPException
from app.config import settings
from app.utils.logging import get_logger
logger = get_logger("async_file_ops")
# Configuration constants
CHUNK_SIZE = 64 * 1024 # 64KB chunks for streaming
LARGE_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - files larger than this use streaming
MAX_MEMORY_BUFFER = 50 * 1024 * 1024 # 50MB - max memory buffer for file operations
class AsyncFileOperations:
"""
Service for handling large file operations asynchronously with streaming support.
Features:
- Streaming file uploads/downloads
- Chunked processing for large files
- Progress tracking callbacks
- Memory-efficient operations
- Async file validation
"""
def __init__(self, base_upload_dir: Optional[str] = None):
self.base_upload_dir = Path(base_upload_dir or settings.upload_dir)
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
async def stream_upload_file(
self,
file: UploadFile,
destination_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
validate_callback: Optional[Callable[[bytes], None]] = None
) -> Tuple[str, int, str]:
"""
Stream upload file to destination with progress tracking.
Args:
file: The uploaded file
destination_path: Relative path where to save the file
progress_callback: Optional callback for progress tracking (bytes_read, total_size)
validate_callback: Optional callback for chunk validation
Returns:
Tuple of (final_path, file_size, checksum)
"""
final_path = self.base_upload_dir / destination_path
final_path.parent.mkdir(parents=True, exist_ok=True)
file_size = 0
checksum = hashlib.sha256()
try:
async with aiofiles.open(final_path, 'wb') as dest_file:
# Reset file pointer to beginning
await file.seek(0)
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
# Update size and checksum
file_size += len(chunk)
checksum.update(chunk)
# Optional chunk validation
if validate_callback:
try:
validate_callback(chunk)
except Exception as e:
logger.warning(f"Chunk validation failed: {str(e)}")
raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}")
# Write chunk asynchronously
await dest_file.write(chunk)
# Progress callback
if progress_callback:
progress_callback(file_size, file_size) # We don't know total size in advance
# Yield control to prevent blocking
await asyncio.sleep(0)
        except Exception as e:
            # Clean up the partially written file on any error
            if final_path.exists():
                try:
                    final_path.unlink()
                except OSError:
                    pass
            # Re-raise validation errors unchanged instead of masking them as 500s
            if isinstance(e, HTTPException):
                raise
            raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}")
return str(final_path), file_size, checksum.hexdigest()
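    # Illustrative usage sketch (comment only): inside an upload endpoint,
    # stream_upload_file would typically be called roughly as shown below.
    # The destination path and the logging callback are assumptions for the
    # example, not part of this module:
    #
    #   def log_progress(bytes_read: int, total: int) -> None:
    #       logger.debug(f"uploaded {bytes_read} bytes")
    #
    #   path, size, sha256 = await async_file_ops.stream_upload_file(
    #       upload, "documents/report.pdf", progress_callback=log_progress
    #   )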
async def stream_read_file(
self,
file_path: str,
chunk_size: int = CHUNK_SIZE
) -> AsyncGenerator[bytes, None]:
"""
Stream read file in chunks.
Args:
file_path: Path to the file to read
chunk_size: Size of chunks to read
Yields:
File content chunks
"""
full_path = self.base_upload_dir / file_path
if not full_path.exists():
raise HTTPException(status_code=404, detail="File not found")
try:
async with aiofiles.open(full_path, 'rb') as file:
while True:
chunk = await file.read(chunk_size)
if not chunk:
break
yield chunk
# Yield control
await asyncio.sleep(0)
except Exception as e:
logger.error(f"Failed to stream read file {file_path}: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")
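    # Illustrative usage sketch (comment only): because stream_read_file is an
    # async generator, it can be handed directly to FastAPI's StreamingResponse
    # to serve a stored file without loading it into memory. The stored path
    # and media type below are assumptions:
    #
    #   from fastapi.responses import StreamingResponse
    #
    #   return StreamingResponse(
    #       async_file_ops.stream_read_file("documents/report.pdf"),
    #       media_type="application/octet-stream",
    #   )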
async def validate_file_streaming(
self,
file: UploadFile,
max_size: Optional[int] = None,
allowed_extensions: Optional[set] = None,
malware_patterns: Optional[list] = None
) -> Tuple[bool, str, Dict[str, Any]]:
"""
Validate file using streaming to handle large files efficiently.
Args:
file: The uploaded file
max_size: Maximum allowed file size
allowed_extensions: Set of allowed file extensions
malware_patterns: List of malware patterns to check for
Returns:
Tuple of (is_valid, error_message, file_metadata)
"""
metadata = {
"filename": file.filename,
"size": 0,
"checksum": "",
"content_type": file.content_type
}
# Check filename and extension
if not file.filename:
return False, "No filename provided", metadata
file_ext = Path(file.filename).suffix.lower()
if allowed_extensions and file_ext not in allowed_extensions:
return False, f"File extension {file_ext} not allowed", metadata
# Stream validation
checksum = hashlib.sha256()
file_size = 0
first_chunk = b""
try:
await file.seek(0)
# Read and validate in chunks
is_first_chunk = True
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
file_size += len(chunk)
checksum.update(chunk)
# Store first chunk for content type detection
if is_first_chunk:
first_chunk = chunk
is_first_chunk = False
# Check size limit
if max_size and file_size > max_size:
# Standardized message to match envelope tests
return False, "File too large", metadata
                # Check for malware patterns (a pattern split across a chunk
                # boundary will not be detected by this simple scan)
                if malware_patterns:
                    chunk_str = chunk.decode('utf-8', errors='ignore').lower()
                    for pattern in malware_patterns:
                        if pattern in chunk_str:
                            return False, "Malicious content detected", metadata
# Yield control
await asyncio.sleep(0)
# Update metadata
metadata.update({
"size": file_size,
"checksum": checksum.hexdigest(),
"first_chunk": first_chunk[:512] # First 512 bytes for content detection
})
return True, "", metadata
except Exception as e:
logger.error(f"File validation failed: {str(e)}")
return False, f"Validation error: {str(e)}", metadata
finally:
# Reset file pointer
await file.seek(0)
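    # Illustrative usage sketch (comment only): a route handler would normally
    # validate before persisting. The 10 MB limit and extension set here are
    # example values, not project policy:
    #
    #   is_valid, error, meta = await async_file_ops.validate_file_streaming(
    #       upload, max_size=10 * 1024 * 1024, allowed_extensions={".csv", ".txt"}
    #   )
    #   if not is_valid:
    #       raise HTTPException(status_code=400, detail=error)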
async def process_csv_file_streaming(
self,
file: UploadFile,
row_processor: Callable[[str], Any],
progress_callback: Optional[Callable[[int], None]] = None,
batch_size: int = 1000
) -> Tuple[int, int, list]:
"""
Process CSV file in streaming fashion for large files.
Args:
file: The CSV file to process
row_processor: Function to process each row
progress_callback: Optional callback for progress (rows_processed)
batch_size: Number of rows to process in each batch
Returns:
Tuple of (total_rows, successful_rows, errors)
"""
total_rows = 0
successful_rows = 0
errors = []
batch = []
try:
await file.seek(0)
# Read file in chunks and process line by line
buffer = ""
header_processed = False
while True:
chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    # Process whatever is left in the buffer once the file is exhausted
                    if buffer.strip():
                        for line in buffer.split('\n'):
                            if line.strip():
                                success = await self._process_csv_line(
                                    line, row_processor, batch, batch_size,
                                    total_rows, successful_rows, errors,
                                    progress_callback, header_processed
                                )
                                total_rows += 1
                                if success:
                                    successful_rows += 1
                                if not header_processed:
                                    header_processed = True
                    break
# Decode chunk and add to buffer
try:
chunk_text = chunk.decode('utf-8')
except UnicodeDecodeError:
# Try with error handling
chunk_text = chunk.decode('utf-8', errors='replace')
buffer += chunk_text
# Process complete lines
while '\n' in buffer:
line, buffer = buffer.split('\n', 1)
if line.strip(): # Skip empty lines
success = await self._process_csv_line(
line, row_processor, batch, batch_size,
total_rows, successful_rows, errors,
progress_callback, header_processed
)
total_rows += 1
if success:
successful_rows += 1
if not header_processed:
header_processed = True
# Yield control
await asyncio.sleep(0)
            # Flush any remaining partial batch
            if batch:
                await self._process_csv_batch(batch, row_processor, errors)
except Exception as e:
logger.error(f"CSV processing failed: {str(e)}")
errors.append(f"Processing error: {str(e)}")
return total_rows, successful_rows, errors
async def _process_csv_line(
self,
line: str,
row_processor: Callable,
batch: list,
batch_size: int,
total_rows: int,
successful_rows: int,
errors: list,
progress_callback: Optional[Callable],
header_processed: bool
) -> bool:
"""Process a single CSV line"""
try:
            # Skip the header row (the caller still counts it)
            if not header_processed:
                return True
# Add to batch
batch.append(line)
            # Process the batch once it is full
            if len(batch) >= batch_size:
                await self._process_csv_batch(batch, row_processor, errors)
                batch.clear()
# Progress callback
if progress_callback:
progress_callback(total_rows)
return True
except Exception as e:
errors.append(f"Row {total_rows}: {str(e)}")
return False
    async def _process_csv_batch(self, batch: list, row_processor: Callable, errors: list):
        """Process a batch of CSV rows by applying row_processor to each line"""
        for line in batch:
            try:
                result = row_processor(line)
                # Support both sync and async row processors
                if asyncio.iscoroutine(result):
                    await result
            except Exception as e:
                errors.append(f"Batch row processing error: {str(e)}")
async def copy_file_async(
self,
source_path: str,
destination_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None
) -> bool:
"""
Copy file asynchronously with progress tracking.
Args:
source_path: Source file path
destination_path: Destination file path
progress_callback: Optional progress callback
Returns:
True if successful, False otherwise
"""
source = self.base_upload_dir / source_path
destination = self.base_upload_dir / destination_path
if not source.exists():
logger.error(f"Source file does not exist: {source}")
return False
try:
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
file_size = source.stat().st_size
bytes_copied = 0
async with aiofiles.open(source, 'rb') as src_file:
async with aiofiles.open(destination, 'wb') as dest_file:
while True:
chunk = await src_file.read(CHUNK_SIZE)
if not chunk:
break
await dest_file.write(chunk)
bytes_copied += len(chunk)
if progress_callback:
progress_callback(bytes_copied, file_size)
# Yield control
await asyncio.sleep(0)
return True
except Exception as e:
logger.error(f"Failed to copy file {source} to {destination}: {str(e)}")
return False
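    # Illustrative usage sketch (comment only): copy_file_async reports progress
    # as (bytes_copied, total_bytes); the percentage logging and paths below are
    # example values:
    #
    #   def log_copy(done: int, total: int) -> None:
    #       if total:
    #           logger.info(f"copy progress: {done / total:.0%}")
    #
    #   copied = await async_file_ops.copy_file_async(
    #       "documents/report.pdf", "backups/report.pdf", progress_callback=log_copy
    #   )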
async def get_file_info_async(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Get file information asynchronously.
Args:
file_path: Path to the file
Returns:
File information dictionary or None if file doesn't exist
"""
full_path = self.base_upload_dir / file_path
if not full_path.exists():
return None
try:
stat = full_path.stat()
# Calculate checksum for smaller files
checksum = None
if stat.st_size <= LARGE_FILE_THRESHOLD:
checksum = hashlib.sha256()
async with aiofiles.open(full_path, 'rb') as file:
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
checksum.update(chunk)
await asyncio.sleep(0)
checksum = checksum.hexdigest()
return {
"path": file_path,
"size": stat.st_size,
"created": stat.st_ctime,
"modified": stat.st_mtime,
"checksum": checksum,
"is_large_file": stat.st_size > LARGE_FILE_THRESHOLD
}
except Exception as e:
logger.error(f"Failed to get file info for {file_path}: {str(e)}")
return None
# Global instance
async_file_ops = AsyncFileOperations()
# Utility functions for backward compatibility
async def stream_save_upload(
file: UploadFile,
subdir: str,
filename_override: Optional[str] = None,
progress_callback: Optional[Callable[[int, int], None]] = None
) -> Tuple[str, int]:
"""
Save uploaded file using streaming operations.
Returns:
Tuple of (relative_path, file_size)
"""
    # Generate a safe filename: fall back to a random name and strip any
    # directory components the client may have sent
    safe_filename = filename_override or file.filename
    if not safe_filename:
        safe_filename = f"upload_{uuid.uuid4().hex}"
    safe_filename = Path(safe_filename).name
    # Create a unique filename to prevent conflicts
    unique_filename = f"{uuid.uuid4().hex}_{safe_filename}"
relative_path = f"{subdir}/{unique_filename}"
final_path, file_size, checksum = await async_file_ops.stream_upload_file(
file, relative_path, progress_callback
)
return relative_path, file_size
async def validate_large_upload(
file: UploadFile,
category: str = "document",
max_size: Optional[int] = None
) -> Tuple[bool, str, Dict[str, Any]]:
"""
Validate uploaded file using streaming for large files.
Returns:
Tuple of (is_valid, error_message, metadata)
"""
# Define allowed extensions by category
allowed_extensions = {
"document": {".pdf", ".doc", ".docx", ".txt", ".rtf"},
"image": {".jpg", ".jpeg", ".png", ".gif", ".bmp"},
"csv": {".csv", ".txt"},
"archive": {".zip", ".rar", ".7z", ".tar", ".gz"}
}
# Define basic malware patterns
malware_patterns = [
"eval(", "exec(", "system(", "shell_exec(",
"<script", "javascript:", "vbscript:",
"cmd.exe", "powershell.exe"
]
extensions = allowed_extensions.get(category, set())
return await async_file_ops.validate_file_streaming(
file, max_size, extensions, malware_patterns
)
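# --- Illustrative endpoint sketch -------------------------------------------
# A minimal example (not part of this service) showing how validate_large_upload
# and stream_save_upload are intended to compose inside a FastAPI route. The
# router, route path, subdir, and 25 MB limit are assumptions for illustration.
from fastapi import APIRouter, File

example_upload_router = APIRouter()

@example_upload_router.post("/examples/upload-document")
async def example_upload_document(file: UploadFile = File(...)) -> Dict[str, Any]:
    # Validate with streaming so large files never sit fully in memory
    is_valid, error_message, metadata = await validate_large_upload(
        file, category="document", max_size=25 * 1024 * 1024
    )
    if not is_valid:
        raise HTTPException(status_code=400, detail=error_message)
    # Persist under a unique name and return basic metadata to the client
    relative_path, file_size = await stream_save_upload(file, subdir="documents")
    return {
        "path": relative_path,
        "size": file_size,
        "checksum": metadata.get("checksum", ""),
    }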