""" Async file operations service for handling large files efficiently. Provides streaming file operations, chunked processing, and progress tracking to improve performance with large files and prevent memory exhaustion. """ import asyncio import aiofiles import os import hashlib import uuid from pathlib import Path from typing import AsyncGenerator, Callable, Optional, Tuple, Dict, Any from fastapi import UploadFile, HTTPException from app.config import settings from app.utils.logging import get_logger logger = get_logger("async_file_ops") # Configuration constants CHUNK_SIZE = 64 * 1024 # 64KB chunks for streaming LARGE_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - files larger than this use streaming MAX_MEMORY_BUFFER = 50 * 1024 * 1024 # 50MB - max memory buffer for file operations class AsyncFileOperations: """ Service for handling large file operations asynchronously with streaming support. Features: - Streaming file uploads/downloads - Chunked processing for large files - Progress tracking callbacks - Memory-efficient operations - Async file validation """ def __init__(self, base_upload_dir: Optional[str] = None): self.base_upload_dir = Path(base_upload_dir or settings.upload_dir) self.base_upload_dir.mkdir(parents=True, exist_ok=True) async def stream_upload_file( self, file: UploadFile, destination_path: str, progress_callback: Optional[Callable[[int, int], None]] = None, validate_callback: Optional[Callable[[bytes], None]] = None ) -> Tuple[str, int, str]: """ Stream upload file to destination with progress tracking. Args: file: The uploaded file destination_path: Relative path where to save the file progress_callback: Optional callback for progress tracking (bytes_read, total_size) validate_callback: Optional callback for chunk validation Returns: Tuple of (final_path, file_size, checksum) """ final_path = self.base_upload_dir / destination_path final_path.parent.mkdir(parents=True, exist_ok=True) file_size = 0 checksum = hashlib.sha256() try: async with aiofiles.open(final_path, 'wb') as dest_file: # Reset file pointer to beginning await file.seek(0) while True: chunk = await file.read(CHUNK_SIZE) if not chunk: break # Update size and checksum file_size += len(chunk) checksum.update(chunk) # Optional chunk validation if validate_callback: try: validate_callback(chunk) except Exception as e: logger.warning(f"Chunk validation failed: {str(e)}") raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}") # Write chunk asynchronously await dest_file.write(chunk) # Progress callback if progress_callback: progress_callback(file_size, file_size) # We don't know total size in advance # Yield control to prevent blocking await asyncio.sleep(0) except Exception as e: # Clean up partial file on error if final_path.exists(): try: final_path.unlink() except: pass raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}") return str(final_path), file_size, checksum.hexdigest() async def stream_read_file( self, file_path: str, chunk_size: int = CHUNK_SIZE ) -> AsyncGenerator[bytes, None]: """ Stream read file in chunks. 
    async def stream_read_file(
        self,
        file_path: str,
        chunk_size: int = CHUNK_SIZE
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream read file in chunks.

        Args:
            file_path: Path to the file to read
            chunk_size: Size of chunks to read

        Yields:
            File content chunks
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            raise HTTPException(status_code=404, detail="File not found")

        try:
            async with aiofiles.open(full_path, 'rb') as file:
                while True:
                    chunk = await file.read(chunk_size)
                    if not chunk:
                        break

                    yield chunk

                    # Yield control
                    await asyncio.sleep(0)

        except Exception as e:
            logger.error(f"Failed to stream read file {file_path}: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")

    async def validate_file_streaming(
        self,
        file: UploadFile,
        max_size: Optional[int] = None,
        allowed_extensions: Optional[set] = None,
        malware_patterns: Optional[list] = None
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Validate file using streaming to handle large files efficiently.

        Args:
            file: The uploaded file
            max_size: Maximum allowed file size
            allowed_extensions: Set of allowed file extensions
            malware_patterns: List of malware patterns to check for

        Returns:
            Tuple of (is_valid, error_message, file_metadata)
        """
        metadata = {
            "filename": file.filename,
            "size": 0,
            "checksum": "",
            "content_type": file.content_type
        }

        # Check filename and extension
        if not file.filename:
            return False, "No filename provided", metadata

        file_ext = Path(file.filename).suffix.lower()
        if allowed_extensions and file_ext not in allowed_extensions:
            return False, f"File extension {file_ext} not allowed", metadata

        # Stream validation
        checksum = hashlib.sha256()
        file_size = 0
        first_chunk = b""

        try:
            await file.seek(0)

            # Read and validate in chunks
            is_first_chunk = True
            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    break

                file_size += len(chunk)
                checksum.update(chunk)

                # Store first chunk for content type detection
                if is_first_chunk:
                    first_chunk = chunk
                    is_first_chunk = False

                # Check size limit
                if max_size and file_size > max_size:
                    # Standardized message to match envelope tests
                    return False, "File too large", metadata

                # Check for malware patterns
                if malware_patterns:
                    chunk_str = chunk.decode('utf-8', errors='ignore').lower()
                    for pattern in malware_patterns:
                        if pattern in chunk_str:
                            return False, "Malicious content detected", metadata

                # Yield control
                await asyncio.sleep(0)

            # Update metadata
            metadata.update({
                "size": file_size,
                "checksum": checksum.hexdigest(),
                "first_chunk": first_chunk[:512]  # First 512 bytes for content detection
            })

            return True, "", metadata

        except Exception as e:
            logger.error(f"File validation failed: {str(e)}")
            return False, f"Validation error: {str(e)}", metadata

        finally:
            # Reset file pointer
            await file.seek(0)
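    # Usage sketch (illustrative only; the size limit and extension set are
    # assumptions, and `ops`/`upload` are hypothetical caller-side names):
    #
    #     ok, error, meta = await ops.validate_file_streaming(
    #         upload,
    #         max_size=25 * 1024 * 1024,
    #         allowed_extensions={".csv", ".txt"},
    #     )
    #     if not ok:
    #         raise HTTPException(status_code=400, detail=error)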
    async def process_csv_file_streaming(
        self,
        file: UploadFile,
        row_processor: Callable[[str], Any],
        progress_callback: Optional[Callable[[int], None]] = None,
        batch_size: int = 1000
    ) -> Tuple[int, int, list]:
        """
        Process CSV file in streaming fashion for large files.

        Args:
            file: The CSV file to process
            row_processor: Function to process each row
            progress_callback: Optional callback for progress (rows_processed)
            batch_size: Number of rows to process in each batch

        Returns:
            Tuple of (total_rows, successful_rows, errors)
        """
        total_rows = 0
        successful_rows = 0
        errors = []
        batch = []

        try:
            await file.seek(0)

            # Read file in chunks and process line by line
            buffer = ""
            header_processed = False

            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    # Process remaining buffer
                    if buffer.strip():
                        for line in buffer.split('\n'):
                            if line.strip():
                                success = await self._process_csv_line(
                                    line, row_processor, batch, batch_size,
                                    total_rows, successful_rows, errors,
                                    progress_callback, header_processed
                                )
                                total_rows += 1
                                if success:
                                    successful_rows += 1
                                if not header_processed:
                                    header_processed = True
                    break

                # Decode chunk and add to buffer
                try:
                    chunk_text = chunk.decode('utf-8')
                except UnicodeDecodeError:
                    # Try with error handling
                    chunk_text = chunk.decode('utf-8', errors='replace')

                buffer += chunk_text

                # Process complete lines
                while '\n' in buffer:
                    line, buffer = buffer.split('\n', 1)
                    if line.strip():  # Skip empty lines
                        success = await self._process_csv_line(
                            line, row_processor, batch, batch_size,
                            total_rows, successful_rows, errors,
                            progress_callback, header_processed
                        )
                        total_rows += 1
                        if success:
                            successful_rows += 1
                        if not header_processed:
                            header_processed = True

                # Yield control
                await asyncio.sleep(0)

            # Process any remaining batch
            if batch:
                await self._process_csv_batch(batch, row_processor, errors)

        except Exception as e:
            logger.error(f"CSV processing failed: {str(e)}")
            errors.append(f"Processing error: {str(e)}")

        return total_rows, successful_rows, errors

    async def _process_csv_line(
        self,
        line: str,
        row_processor: Callable,
        batch: list,
        batch_size: int,
        total_rows: int,
        successful_rows: int,
        errors: list,
        progress_callback: Optional[Callable],
        header_processed: bool
    ) -> bool:
        """Process a single CSV line"""
        try:
            # Skip header row
            if not header_processed:
                return True

            # Add to batch
            batch.append(line)

            # Process batch when full
            if len(batch) >= batch_size:
                await self._process_csv_batch(batch, row_processor, errors)
                batch.clear()

            # Progress callback
            if progress_callback:
                progress_callback(total_rows)

            return True

        except Exception as e:
            errors.append(f"Row {total_rows}: {str(e)}")
            return False

    async def _process_csv_batch(self, batch: list, row_processor: Callable, errors: list):
        """Process a batch of CSV rows"""
        try:
            # Apply the supplied row processor to each buffered line
            for line in batch:
                result = row_processor(line)
                # Support async row processors as well
                if asyncio.iscoroutine(result):
                    await result
        except Exception as e:
            errors.append(f"Batch processing error: {str(e)}")
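    # Usage sketch (illustrative only; `parse_row`, the two-column layout, and
    # `upload` are assumptions, not part of this module):
    #
    #     def parse_row(line: str) -> dict:
    #         name, email = line.split(",")[:2]
    #         return {"name": name.strip(), "email": email.strip()}
    #
    #     total, ok, errors = await ops.process_csv_file_streaming(
    #         upload, parse_row,
    #         progress_callback=lambda n: logger.debug(f"{n} rows processed"),
    #     )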
    async def copy_file_async(
        self,
        source_path: str,
        destination_path: str,
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> bool:
        """
        Copy file asynchronously with progress tracking.

        Args:
            source_path: Source file path
            destination_path: Destination file path
            progress_callback: Optional progress callback

        Returns:
            True if successful, False otherwise
        """
        source = self.base_upload_dir / source_path
        destination = self.base_upload_dir / destination_path

        if not source.exists():
            logger.error(f"Source file does not exist: {source}")
            return False

        try:
            # Create destination directory
            destination.parent.mkdir(parents=True, exist_ok=True)

            file_size = source.stat().st_size
            bytes_copied = 0

            async with aiofiles.open(source, 'rb') as src_file:
                async with aiofiles.open(destination, 'wb') as dest_file:
                    while True:
                        chunk = await src_file.read(CHUNK_SIZE)
                        if not chunk:
                            break

                        await dest_file.write(chunk)
                        bytes_copied += len(chunk)

                        if progress_callback:
                            progress_callback(bytes_copied, file_size)

                        # Yield control
                        await asyncio.sleep(0)

            return True

        except Exception as e:
            logger.error(f"Failed to copy file {source} to {destination}: {str(e)}")
            return False

    async def get_file_info_async(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Get file information asynchronously.

        Args:
            file_path: Path to the file

        Returns:
            File information dictionary or None if file doesn't exist
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            return None

        try:
            stat = full_path.stat()

            # Calculate checksum for smaller files
            checksum = None
            if stat.st_size <= LARGE_FILE_THRESHOLD:
                checksum = hashlib.sha256()
                async with aiofiles.open(full_path, 'rb') as file:
                    while True:
                        chunk = await file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        checksum.update(chunk)
                        await asyncio.sleep(0)
                checksum = checksum.hexdigest()

            return {
                "path": file_path,
                "size": stat.st_size,
                "created": stat.st_ctime,
                "modified": stat.st_mtime,
                "checksum": checksum,
                "is_large_file": stat.st_size > LARGE_FILE_THRESHOLD
            }

        except Exception as e:
            logger.error(f"Failed to get file info for {file_path}: {str(e)}")
            return None


# Global instance
async_file_ops = AsyncFileOperations()


# Utility functions for backward compatibility

async def stream_save_upload(
    file: UploadFile,
    subdir: str,
    filename_override: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> Tuple[str, int]:
    """
    Save uploaded file using streaming operations.

    Returns:
        Tuple of (relative_path, file_size)
    """
    # Generate safe filename
    safe_filename = filename_override or file.filename
    if not safe_filename:
        safe_filename = f"upload_{uuid.uuid4().hex}"

    # Create unique filename to prevent conflicts
    unique_filename = f"{uuid.uuid4().hex}_{safe_filename}"
    relative_path = f"{subdir}/{unique_filename}"

    final_path, file_size, checksum = await async_file_ops.stream_upload_file(
        file, relative_path, progress_callback
    )

    return relative_path, file_size


async def validate_large_upload(
    file: UploadFile,
    category: str = "document",
    max_size: Optional[int] = None
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Validate uploaded file using streaming for large files.

    Returns:
        Tuple of (is_valid, error_message, metadata)
    """
    # Define allowed extensions by category
    allowed_extensions = {
        "document": {".pdf", ".doc", ".docx", ".txt", ".rtf"},
        "image": {".jpg", ".jpeg", ".png", ".gif", ".bmp"},
        "csv": {".csv", ".txt"},
        "archive": {".zip", ".rar", ".7z", ".tar", ".gz"}
    }

    # Define basic malware patterns
    malware_patterns = [
        "eval(", "exec(", "system(", "shell_exec(", "