app/services/async_file_operations.py (new file, +527 lines)
@@ -0,0 +1,527 @@
"""
Async file operations service for handling large files efficiently.

Provides streaming file operations, chunked processing, and progress tracking
to improve performance with large files and prevent memory exhaustion.
"""
import asyncio
import aiofiles
import os
import hashlib
import uuid
from pathlib import Path
from typing import AsyncGenerator, Callable, Optional, Tuple, Dict, Any

from fastapi import UploadFile, HTTPException

from app.config import settings
from app.utils.logging import get_logger

logger = get_logger("async_file_ops")

# Configuration constants
CHUNK_SIZE = 64 * 1024  # 64KB chunks for streaming
LARGE_FILE_THRESHOLD = 10 * 1024 * 1024  # 10MB - files larger than this use streaming
MAX_MEMORY_BUFFER = 50 * 1024 * 1024  # 50MB - max memory buffer for file operations


class AsyncFileOperations:
    """
    Service for handling large file operations asynchronously with streaming support.

    Features:
    - Streaming file uploads/downloads
    - Chunked processing for large files
    - Progress tracking callbacks
    - Memory-efficient operations
    - Async file validation
    """

    def __init__(self, base_upload_dir: Optional[str] = None):
        self.base_upload_dir = Path(base_upload_dir or settings.upload_dir)
        self.base_upload_dir.mkdir(parents=True, exist_ok=True)
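
    # Usage sketch (an assumption, not part of this commit): tests or one-off jobs
    # can point the service at a different root instead of settings.upload_dir:
    #
    #     temp_ops = AsyncFileOperations(base_upload_dir="/tmp/uploads_test")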

    async def stream_upload_file(
        self,
        file: UploadFile,
        destination_path: str,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        validate_callback: Optional[Callable[[bytes], None]] = None
    ) -> Tuple[str, int, str]:
        """
        Stream an uploaded file to its destination with progress tracking.

        Args:
            file: The uploaded file
            destination_path: Relative path where the file should be saved
            progress_callback: Optional callback for progress tracking (bytes_read, total_size)
            validate_callback: Optional callback for chunk validation

        Returns:
            Tuple of (final_path, file_size, checksum)
        """
        final_path = self.base_upload_dir / destination_path
        final_path.parent.mkdir(parents=True, exist_ok=True)

        file_size = 0
        checksum = hashlib.sha256()

        try:
            async with aiofiles.open(final_path, 'wb') as dest_file:
                # Reset file pointer to beginning
                await file.seek(0)

                while True:
                    chunk = await file.read(CHUNK_SIZE)
                    if not chunk:
                        break

                    # Update size and checksum
                    file_size += len(chunk)
                    checksum.update(chunk)

                    # Optional chunk validation
                    if validate_callback:
                        try:
                            validate_callback(chunk)
                        except Exception as e:
                            logger.warning(f"Chunk validation failed: {str(e)}")
                            raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}")

                    # Write chunk asynchronously
                    await dest_file.write(chunk)

                    # Progress callback; the total size of an UploadFile is not known
                    # in advance, so bytes written so far is reported for both values
                    if progress_callback:
                        progress_callback(file_size, file_size)

                    # Yield control to prevent blocking
                    await asyncio.sleep(0)

        except Exception as e:
            # Clean up partial file on error
            if final_path.exists():
                try:
                    final_path.unlink()
                except OSError:
                    pass
            # Preserve validation errors (HTTP 400) instead of masking them as 500s
            if isinstance(e, HTTPException):
                raise
            raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}")

        return str(final_path), file_size, checksum.hexdigest()
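
    # Usage sketch (an assumption, not part of this commit): a route handler could
    # stream an upload to disk while rejecting chunks that contain NUL bytes. The
    # destination path and reject_binary helper are illustrative.
    #
    #     def reject_binary(chunk: bytes) -> None:
    #         if b"\x00" in chunk:
    #             raise ValueError("binary content not allowed")
    #
    #     path, size, sha256 = await async_file_ops.stream_upload_file(
    #         file=upload,
    #         destination_path=f"documents/{uuid.uuid4().hex}.txt",
    #         validate_callback=reject_binary,
    #     )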

    async def stream_read_file(
        self,
        file_path: str,
        chunk_size: int = CHUNK_SIZE
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream read file in chunks.

        Args:
            file_path: Path to the file to read
            chunk_size: Size of chunks to read

        Yields:
            File content chunks
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            raise HTTPException(status_code=404, detail="File not found")

        try:
            async with aiofiles.open(full_path, 'rb') as file:
                while True:
                    chunk = await file.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk
                    # Yield control
                    await asyncio.sleep(0)
        except Exception as e:
            logger.error(f"Failed to stream read file {file_path}: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")
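
    # Serving sketch (an assumption, not part of this commit): because
    # stream_read_file is an async generator, it can back a FastAPI
    # StreamingResponse directly, so large downloads never load fully into memory.
    #
    #     from fastapi.responses import StreamingResponse
    #
    #     return StreamingResponse(
    #         async_file_ops.stream_read_file("documents/report.pdf"),
    #         media_type="application/pdf",
    #     )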

    async def validate_file_streaming(
        self,
        file: UploadFile,
        max_size: Optional[int] = None,
        allowed_extensions: Optional[set] = None,
        malware_patterns: Optional[list] = None
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Validate a file using streaming so large files are handled efficiently.

        Args:
            file: The uploaded file
            max_size: Maximum allowed file size in bytes
            allowed_extensions: Set of allowed file extensions
            malware_patterns: List of malware patterns to check for

        Returns:
            Tuple of (is_valid, error_message, file_metadata)
        """
        metadata = {
            "filename": file.filename,
            "size": 0,
            "checksum": "",
            "content_type": file.content_type
        }

        # Check filename and extension
        if not file.filename:
            return False, "No filename provided", metadata

        file_ext = Path(file.filename).suffix.lower()
        if allowed_extensions and file_ext not in allowed_extensions:
            return False, f"File extension {file_ext} not allowed", metadata

        # Stream validation
        checksum = hashlib.sha256()
        file_size = 0
        first_chunk = b""

        try:
            await file.seek(0)

            # Read and validate in chunks
            is_first_chunk = True
            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    break

                file_size += len(chunk)
                checksum.update(chunk)

                # Store first chunk for content type detection
                if is_first_chunk:
                    first_chunk = chunk
                    is_first_chunk = False

                # Check size limit
                if max_size and file_size > max_size:
                    # Standardized message to match envelope tests
                    return False, "File too large", metadata

                # Check for malware patterns; a pattern split across a chunk
                # boundary is not detected by this per-chunk scan
                if malware_patterns:
                    chunk_str = chunk.decode('utf-8', errors='ignore').lower()
                    for pattern in malware_patterns:
                        if pattern in chunk_str:
                            return False, "Malicious content detected", metadata

                # Yield control
                await asyncio.sleep(0)

            # Update metadata
            metadata.update({
                "size": file_size,
                "checksum": checksum.hexdigest(),
                "first_chunk": first_chunk[:512]  # First 512 bytes for content detection
            })

            return True, "", metadata

        except Exception as e:
            logger.error(f"File validation failed: {str(e)}")
            return False, f"Validation error: {str(e)}", metadata
        finally:
            # Reset file pointer for subsequent reads
            await file.seek(0)
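
    # Usage sketch (an assumption, not part of this commit): the returned metadata
    # carries the first 512 bytes, which a caller could use for a magic-byte check
    # on top of the extension filter. The PDF signature check below is illustrative.
    #
    #     ok, error, meta = await async_file_ops.validate_file_streaming(
    #         file=upload,
    #         max_size=25 * 1024 * 1024,
    #         allowed_extensions={".pdf"},
    #     )
    #     if ok and not meta["first_chunk"].startswith(b"%PDF-"):
    #         ok, error = False, "File content does not match the .pdf extension"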

    async def process_csv_file_streaming(
        self,
        file: UploadFile,
        row_processor: Callable[[str], Any],
        progress_callback: Optional[Callable[[int], None]] = None,
        batch_size: int = 1000
    ) -> Tuple[int, int, list]:
        """
        Process a CSV file in streaming fashion, suitable for large files.

        Args:
            file: The CSV file to process
            row_processor: Function to process each row
            progress_callback: Optional callback for progress (rows_processed)
            batch_size: Number of rows to process in each batch

        Returns:
            Tuple of (total_rows, successful_rows, errors)
        """
        total_rows = 0
        successful_rows = 0
        errors = []
        batch = []

        try:
            await file.seek(0)

            # Read file in chunks and process line by line
            buffer = ""
            header_processed = False

            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    # Process remaining buffer
                    if buffer.strip():
                        for line in buffer.split('\n'):
                            if line.strip():
                                success = await self._process_csv_line(
                                    line, row_processor, batch, batch_size,
                                    total_rows, successful_rows, errors,
                                    progress_callback, header_processed
                                )
                                total_rows += 1
                                if success:
                                    successful_rows += 1
                                if not header_processed:
                                    header_processed = True
                    break

                # Decode chunk and add to buffer; a multi-byte sequence split
                # across chunks falls back to replacement characters
                try:
                    chunk_text = chunk.decode('utf-8')
                except UnicodeDecodeError:
                    chunk_text = chunk.decode('utf-8', errors='replace')

                buffer += chunk_text

                # Process complete lines
                while '\n' in buffer:
                    line, buffer = buffer.split('\n', 1)

                    if line.strip():  # Skip empty lines
                        success = await self._process_csv_line(
                            line, row_processor, batch, batch_size,
                            total_rows, successful_rows, errors,
                            progress_callback, header_processed
                        )

                        total_rows += 1
                        if success:
                            successful_rows += 1

                        if not header_processed:
                            header_processed = True

                # Yield control
                await asyncio.sleep(0)

            # Process any remaining batch
            if batch:
                await self._process_csv_batch(batch, row_processor, errors)

        except Exception as e:
            logger.error(f"CSV processing failed: {str(e)}")
            errors.append(f"Processing error: {str(e)}")

        return total_rows, successful_rows, errors
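
    # Usage sketch (an assumption, not part of this commit): the caller supplies
    # per-row logic as row_processor, which is invoked once per data row (the
    # header is skipped). parse_contact_row and the column layout are illustrative.
    #
    #     def parse_contact_row(line: str) -> dict:
    #         name, email = [part.strip() for part in line.split(",")[:2]]
    #         return {"name": name, "email": email}
    #
    #     total, ok, errors = await async_file_ops.process_csv_file_streaming(
    #         file=upload,
    #         row_processor=parse_contact_row,
    #         batch_size=500,
    #     )
    #     logger.info(f"processed {ok}/{total} rows, {len(errors)} errors")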

    async def _process_csv_line(
        self,
        line: str,
        row_processor: Callable,
        batch: list,
        batch_size: int,
        total_rows: int,
        successful_rows: int,
        errors: list,
        progress_callback: Optional[Callable],
        header_processed: bool
    ) -> bool:
        """Process a single CSV line"""
        try:
            # Skip header row
            if not header_processed:
                return True

            # Add to batch
            batch.append(line)

            # Process batch when full
            if len(batch) >= batch_size:
                await self._process_csv_batch(batch, row_processor, errors)
                batch.clear()

            # Progress callback
            if progress_callback:
                progress_callback(total_rows)

            return True

        except Exception as e:
            errors.append(f"Row {total_rows}: {str(e)}")
            return False

    async def _process_csv_batch(self, batch: list, row_processor: Callable[[str], Any], errors: list):
        """Process a batch of CSV rows with the caller-supplied row processor"""
        try:
            for line in batch:
                result = row_processor(line)
                # Support both sync and async row processors
                if asyncio.iscoroutine(result):
                    await result
        except Exception as e:
            errors.append(f"Batch processing error: {str(e)}")

    async def copy_file_async(
        self,
        source_path: str,
        destination_path: str,
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> bool:
        """
        Copy file asynchronously with progress tracking.

        Args:
            source_path: Source file path
            destination_path: Destination file path
            progress_callback: Optional progress callback

        Returns:
            True if successful, False otherwise
        """
        source = self.base_upload_dir / source_path
        destination = self.base_upload_dir / destination_path

        if not source.exists():
            logger.error(f"Source file does not exist: {source}")
            return False

        try:
            # Create destination directory
            destination.parent.mkdir(parents=True, exist_ok=True)

            file_size = source.stat().st_size
            bytes_copied = 0

            async with aiofiles.open(source, 'rb') as src_file:
                async with aiofiles.open(destination, 'wb') as dest_file:
                    while True:
                        chunk = await src_file.read(CHUNK_SIZE)
                        if not chunk:
                            break

                        await dest_file.write(chunk)
                        bytes_copied += len(chunk)

                        if progress_callback:
                            progress_callback(bytes_copied, file_size)

                        # Yield control
                        await asyncio.sleep(0)

            return True

        except Exception as e:
            logger.error(f"Failed to copy file {source} to {destination}: {str(e)}")
            return False
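
    # Progress sketch (an assumption, not part of this commit): unlike uploads, a
    # copy knows the total size up front, so the callback can report a percentage.
    #
    #     def log_copy_progress(bytes_copied: int, total_bytes: int) -> None:
    #         percent = (bytes_copied / total_bytes) * 100 if total_bytes else 100.0
    #         logger.debug(f"copy progress: {percent:.1f}%")
    #
    #     copied = await async_file_ops.copy_file_async(
    #         "documents/report.pdf", "backups/report.pdf", log_copy_progress
    #     )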

    async def get_file_info_async(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Get file information asynchronously.

        Args:
            file_path: Path to the file

        Returns:
            File information dictionary or None if file doesn't exist
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            return None

        try:
            stat = full_path.stat()

            # Calculate checksum for smaller files
            checksum = None
            if stat.st_size <= LARGE_FILE_THRESHOLD:
                checksum = hashlib.sha256()
                async with aiofiles.open(full_path, 'rb') as file:
                    while True:
                        chunk = await file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        checksum.update(chunk)
                        await asyncio.sleep(0)
                checksum = checksum.hexdigest()

            return {
                "path": file_path,
                "size": stat.st_size,
                "created": stat.st_ctime,
                "modified": stat.st_mtime,
                "checksum": checksum,
                "is_large_file": stat.st_size > LARGE_FILE_THRESHOLD
            }

        except Exception as e:
            logger.error(f"Failed to get file info for {file_path}: {str(e)}")
            return None


# Global instance
async_file_ops = AsyncFileOperations()
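

# Minimal runnable sketch (an assumption, not part of the original commit): a
# helper that backs up a stored file and returns the copy's metadata, exercising
# copy_file_async and get_file_info_async together. The paths are illustrative
# and the function is not called anywhere in this module.
async def _demo_copy_and_inspect(source_path: str = "documents/report.pdf") -> Optional[Dict[str, Any]]:
    """Copy `source_path` to a backups/ sibling and return the copy's file info."""
    backup_path = f"backups/{Path(source_path).name}"
    copied = await async_file_ops.copy_file_async(source_path, backup_path)
    if not copied:
        return None
    return await async_file_ops.get_file_info_async(backup_path)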


# Utility functions for backward compatibility
async def stream_save_upload(
    file: UploadFile,
    subdir: str,
    filename_override: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> Tuple[str, int]:
    """
    Save uploaded file using streaming operations.

    Returns:
        Tuple of (relative_path, file_size)
    """
    # Generate safe filename
    safe_filename = filename_override or file.filename
    if not safe_filename:
        safe_filename = f"upload_{uuid.uuid4().hex}"

    # Create unique filename to prevent conflicts
    unique_filename = f"{uuid.uuid4().hex}_{safe_filename}"
    relative_path = f"{subdir}/{unique_filename}"

    final_path, file_size, checksum = await async_file_ops.stream_upload_file(
        file, relative_path, progress_callback
    )

    return relative_path, file_size


async def validate_large_upload(
    file: UploadFile,
    category: str = "document",
    max_size: Optional[int] = None
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Validate uploaded file using streaming for large files.

    Returns:
        Tuple of (is_valid, error_message, metadata)
    """
    # Define allowed extensions by category
    allowed_extensions = {
        "document": {".pdf", ".doc", ".docx", ".txt", ".rtf"},
        "image": {".jpg", ".jpeg", ".png", ".gif", ".bmp"},
        "csv": {".csv", ".txt"},
        "archive": {".zip", ".rar", ".7z", ".tar", ".gz"}
    }

    # Define basic malware patterns
    malware_patterns = [
        "eval(", "exec(", "system(", "shell_exec(",
        "<script", "javascript:", "vbscript:",
        "cmd.exe", "powershell.exe"
    ]

    extensions = allowed_extensions.get(category, set())

    return await async_file_ops.validate_file_streaming(
        file, max_size, extensions, malware_patterns
    )
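

# Route sketch (an assumption, not part of this commit): how an upload endpoint
# might combine validate_large_upload and stream_save_upload. The router, route
# path, and 100MB limit are illustrative.
#
#     from fastapi import APIRouter, File
#
#     router = APIRouter()
#
#     @router.post("/uploads/documents")
#     async def upload_document(file: UploadFile = File(...)):
#         is_valid, error, meta = await validate_large_upload(
#             file, category="document", max_size=100 * 1024 * 1024
#         )
#         if not is_valid:
#             raise HTTPException(status_code=400, detail=error)
#         relative_path, size = await stream_save_upload(file, subdir="documents")
#         return {"path": relative_path, "size": size, "checksum": meta["checksum"]}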