# delphi-database/app/services/async_file_operations.py

"""
Async file operations service for handling large files efficiently.
Provides streaming file operations, chunked processing, and progress tracking
to improve performance with large files and prevent memory exhaustion.
"""
import asyncio
import aiofiles
import os
import hashlib
import uuid
from pathlib import Path
from typing import AsyncGenerator, Callable, Optional, Tuple, Dict, Any
from fastapi import UploadFile, HTTPException
from app.config import settings
from app.utils.logging import get_logger
logger = get_logger("async_file_ops")
# Configuration constants
CHUNK_SIZE = 64 * 1024 # 64KB chunks for streaming
LARGE_FILE_THRESHOLD = 10 * 1024 * 1024 # 10MB - files larger than this use streaming
MAX_MEMORY_BUFFER = 50 * 1024 * 1024 # 50MB - max memory buffer for file operations
class AsyncFileOperations:
"""
Service for handling large file operations asynchronously with streaming support.
Features:
- Streaming file uploads/downloads
- Chunked processing for large files
- Progress tracking callbacks
- Memory-efficient operations
- Async file validation
"""
def __init__(self, base_upload_dir: Optional[str] = None):
self.base_upload_dir = Path(base_upload_dir or settings.upload_dir)
self.base_upload_dir.mkdir(parents=True, exist_ok=True)
async def stream_upload_file(
self,
file: UploadFile,
destination_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None,
validate_callback: Optional[Callable[[bytes], None]] = None
) -> Tuple[str, int, str]:
"""
Stream upload file to destination with progress tracking.
Args:
file: The uploaded file
destination_path: Relative path where to save the file
progress_callback: Optional callback for progress tracking (bytes_read, total_size)
validate_callback: Optional callback for chunk validation
Returns:
Tuple of (final_path, file_size, checksum)
"""
final_path = self.base_upload_dir / destination_path
final_path.parent.mkdir(parents=True, exist_ok=True)
file_size = 0
checksum = hashlib.sha256()
try:
async with aiofiles.open(final_path, 'wb') as dest_file:
# Reset file pointer to beginning
await file.seek(0)
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
# Update size and checksum
file_size += len(chunk)
checksum.update(chunk)
# Optional chunk validation
if validate_callback:
try:
validate_callback(chunk)
except Exception as e:
logger.warning(f"Chunk validation failed: {str(e)}")
raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}")
# Write chunk asynchronously
await dest_file.write(chunk)
# Progress callback
if progress_callback:
progress_callback(file_size, file_size) # We don't know total size in advance
# Yield control to prevent blocking
await asyncio.sleep(0)
        except Exception as e:
            # Clean up the partially written file on any error
            if final_path.exists():
                try:
                    final_path.unlink()
                except OSError:
                    pass
            # Re-raise validation errors unchanged instead of masking them as 500s
            if isinstance(e, HTTPException):
                raise
            raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}")
return str(final_path), file_size, checksum.hexdigest()
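    # Illustrative usage sketch (comment only): inside an upload endpoint,
    # stream_upload_file would typically be called roughly as shown below.
    # The destination path and the logging callback are assumptions for the
    # example, not part of this module:
    #
    #   def log_progress(bytes_read: int, total: int) -> None:
    #       logger.debug(f"uploaded {bytes_read} bytes")
    #
    #   path, size, sha256 = await async_file_ops.stream_upload_file(
    #       upload, "documents/report.pdf", progress_callback=log_progress
    #   )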
async def stream_read_file(
self,
file_path: str,
chunk_size: int = CHUNK_SIZE
) -> AsyncGenerator[bytes, None]:
"""
Stream read file in chunks.
Args:
file_path: Path to the file to read
chunk_size: Size of chunks to read
Yields:
File content chunks
"""
full_path = self.base_upload_dir / file_path
if not full_path.exists():
raise HTTPException(status_code=404, detail="File not found")
try:
async with aiofiles.open(full_path, 'rb') as file:
while True:
chunk = await file.read(chunk_size)
if not chunk:
break
yield chunk
# Yield control
await asyncio.sleep(0)
except Exception as e:
logger.error(f"Failed to stream read file {file_path}: {str(e)}")
raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")
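    # Illustrative usage sketch (comment only): because stream_read_file is an
    # async generator, it can be handed directly to FastAPI's StreamingResponse
    # to serve a stored file without loading it into memory. The stored path
    # and media type below are assumptions:
    #
    #   from fastapi.responses import StreamingResponse
    #
    #   return StreamingResponse(
    #       async_file_ops.stream_read_file("documents/report.pdf"),
    #       media_type="application/octet-stream",
    #   )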
async def validate_file_streaming(
self,
file: UploadFile,
max_size: Optional[int] = None,
allowed_extensions: Optional[set] = None,
malware_patterns: Optional[list] = None
) -> Tuple[bool, str, Dict[str, Any]]:
"""
Validate file using streaming to handle large files efficiently.
Args:
file: The uploaded file
max_size: Maximum allowed file size
allowed_extensions: Set of allowed file extensions
malware_patterns: List of malware patterns to check for
Returns:
Tuple of (is_valid, error_message, file_metadata)
"""
metadata = {
"filename": file.filename,
"size": 0,
"checksum": "",
"content_type": file.content_type
}
# Check filename and extension
if not file.filename:
return False, "No filename provided", metadata
file_ext = Path(file.filename).suffix.lower()
if allowed_extensions and file_ext not in allowed_extensions:
return False, f"File extension {file_ext} not allowed", metadata
# Stream validation
checksum = hashlib.sha256()
file_size = 0
first_chunk = b""
try:
await file.seek(0)
# Read and validate in chunks
is_first_chunk = True
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
file_size += len(chunk)
checksum.update(chunk)
# Store first chunk for content type detection
if is_first_chunk:
first_chunk = chunk
is_first_chunk = False
# Check size limit
if max_size and file_size > max_size:
# Standardized message to match envelope tests
return False, "File too large", metadata
                # Check for malware patterns (a pattern split across a chunk
                # boundary will not be detected by this simple scan)
                if malware_patterns:
                    chunk_str = chunk.decode('utf-8', errors='ignore').lower()
                    for pattern in malware_patterns:
                        if pattern in chunk_str:
                            return False, "Malicious content detected", metadata
# Yield control
await asyncio.sleep(0)
# Update metadata
metadata.update({
"size": file_size,
"checksum": checksum.hexdigest(),
"first_chunk": first_chunk[:512] # First 512 bytes for content detection
})
return True, "", metadata
except Exception as e:
logger.error(f"File validation failed: {str(e)}")
return False, f"Validation error: {str(e)}", metadata
finally:
# Reset file pointer
await file.seek(0)
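    # Illustrative usage sketch (comment only): a route handler would normally
    # validate before persisting. The 10 MB limit and extension set here are
    # example values, not project policy:
    #
    #   is_valid, error, meta = await async_file_ops.validate_file_streaming(
    #       upload, max_size=10 * 1024 * 1024, allowed_extensions={".csv", ".txt"}
    #   )
    #   if not is_valid:
    #       raise HTTPException(status_code=400, detail=error)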
async def process_csv_file_streaming(
self,
file: UploadFile,
row_processor: Callable[[str], Any],
progress_callback: Optional[Callable[[int], None]] = None,
batch_size: int = 1000
) -> Tuple[int, int, list]:
"""
Process CSV file in streaming fashion for large files.
Args:
file: The CSV file to process
row_processor: Function to process each row
progress_callback: Optional callback for progress (rows_processed)
batch_size: Number of rows to process in each batch
Returns:
Tuple of (total_rows, successful_rows, errors)
"""
total_rows = 0
successful_rows = 0
errors = []
batch = []
try:
await file.seek(0)
# Read file in chunks and process line by line
buffer = ""
header_processed = False
while True:
chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    # Process whatever is left in the buffer once the file is exhausted
                    if buffer.strip():
                        for line in buffer.split('\n'):
                            if line.strip():
                                success = await self._process_csv_line(
                                    line, row_processor, batch, batch_size,
                                    total_rows, successful_rows, errors,
                                    progress_callback, header_processed
                                )
                                total_rows += 1
                                if success:
                                    successful_rows += 1
                                if not header_processed:
                                    header_processed = True
                    break
# Decode chunk and add to buffer
try:
chunk_text = chunk.decode('utf-8')
except UnicodeDecodeError:
# Try with error handling
chunk_text = chunk.decode('utf-8', errors='replace')
buffer += chunk_text
# Process complete lines
while '\n' in buffer:
line, buffer = buffer.split('\n', 1)
if line.strip(): # Skip empty lines
success = await self._process_csv_line(
line, row_processor, batch, batch_size,
total_rows, successful_rows, errors,
progress_callback, header_processed
)
total_rows += 1
if success:
successful_rows += 1
if not header_processed:
header_processed = True
# Yield control
await asyncio.sleep(0)
            # Flush any remaining partial batch
            if batch:
                await self._process_csv_batch(batch, row_processor, errors)
except Exception as e:
logger.error(f"CSV processing failed: {str(e)}")
errors.append(f"Processing error: {str(e)}")
return total_rows, successful_rows, errors
async def _process_csv_line(
self,
line: str,
row_processor: Callable,
batch: list,
batch_size: int,
total_rows: int,
successful_rows: int,
errors: list,
progress_callback: Optional[Callable],
header_processed: bool
) -> bool:
"""Process a single CSV line"""
try:
            # Skip the header row (the caller still counts it)
            if not header_processed:
                return True
# Add to batch
batch.append(line)
            # Process the batch once it is full
            if len(batch) >= batch_size:
                await self._process_csv_batch(batch, row_processor, errors)
                batch.clear()
# Progress callback
if progress_callback:
progress_callback(total_rows)
return True
except Exception as e:
errors.append(f"Row {total_rows}: {str(e)}")
return False
    async def _process_csv_batch(self, batch: list, row_processor: Callable, errors: list):
        """Process a batch of CSV rows by applying row_processor to each line"""
        for line in batch:
            try:
                result = row_processor(line)
                # Support both sync and async row processors
                if asyncio.iscoroutine(result):
                    await result
            except Exception as e:
                errors.append(f"Batch row processing error: {str(e)}")
async def copy_file_async(
self,
source_path: str,
destination_path: str,
progress_callback: Optional[Callable[[int, int], None]] = None
) -> bool:
"""
Copy file asynchronously with progress tracking.
Args:
source_path: Source file path
destination_path: Destination file path
progress_callback: Optional progress callback
Returns:
True if successful, False otherwise
"""
source = self.base_upload_dir / source_path
destination = self.base_upload_dir / destination_path
if not source.exists():
logger.error(f"Source file does not exist: {source}")
return False
try:
# Create destination directory
destination.parent.mkdir(parents=True, exist_ok=True)
file_size = source.stat().st_size
bytes_copied = 0
async with aiofiles.open(source, 'rb') as src_file:
async with aiofiles.open(destination, 'wb') as dest_file:
while True:
chunk = await src_file.read(CHUNK_SIZE)
if not chunk:
break
await dest_file.write(chunk)
bytes_copied += len(chunk)
if progress_callback:
progress_callback(bytes_copied, file_size)
# Yield control
await asyncio.sleep(0)
return True
except Exception as e:
logger.error(f"Failed to copy file {source} to {destination}: {str(e)}")
return False
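    # Illustrative usage sketch (comment only): copy_file_async reports progress
    # as (bytes_copied, total_bytes); the percentage logging and paths below are
    # example values:
    #
    #   def log_copy(done: int, total: int) -> None:
    #       if total:
    #           logger.info(f"copy progress: {done / total:.0%}")
    #
    #   copied = await async_file_ops.copy_file_async(
    #       "documents/report.pdf", "backups/report.pdf", progress_callback=log_copy
    #   )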
async def get_file_info_async(self, file_path: str) -> Optional[Dict[str, Any]]:
"""
Get file information asynchronously.
Args:
file_path: Path to the file
Returns:
File information dictionary or None if file doesn't exist
"""
full_path = self.base_upload_dir / file_path
if not full_path.exists():
return None
try:
stat = full_path.stat()
# Calculate checksum for smaller files
checksum = None
if stat.st_size <= LARGE_FILE_THRESHOLD:
checksum = hashlib.sha256()
async with aiofiles.open(full_path, 'rb') as file:
while True:
chunk = await file.read(CHUNK_SIZE)
if not chunk:
break
checksum.update(chunk)
await asyncio.sleep(0)
checksum = checksum.hexdigest()
return {
"path": file_path,
"size": stat.st_size,
"created": stat.st_ctime,
"modified": stat.st_mtime,
"checksum": checksum,
"is_large_file": stat.st_size > LARGE_FILE_THRESHOLD
}
except Exception as e:
logger.error(f"Failed to get file info for {file_path}: {str(e)}")
return None
# Global instance
async_file_ops = AsyncFileOperations()
# Utility functions for backward compatibility
async def stream_save_upload(
file: UploadFile,
subdir: str,
filename_override: Optional[str] = None,
progress_callback: Optional[Callable[[int, int], None]] = None
) -> Tuple[str, int]:
"""
Save uploaded file using streaming operations.
Returns:
Tuple of (relative_path, file_size)
"""
    # Generate a safe filename: fall back to a random name and strip any
    # directory components the client may have sent
    safe_filename = filename_override or file.filename
    if not safe_filename:
        safe_filename = f"upload_{uuid.uuid4().hex}"
    safe_filename = Path(safe_filename).name
    # Create a unique filename to prevent conflicts
    unique_filename = f"{uuid.uuid4().hex}_{safe_filename}"
relative_path = f"{subdir}/{unique_filename}"
final_path, file_size, checksum = await async_file_ops.stream_upload_file(
file, relative_path, progress_callback
)
return relative_path, file_size
async def validate_large_upload(
file: UploadFile,
category: str = "document",
max_size: Optional[int] = None
) -> Tuple[bool, str, Dict[str, Any]]:
"""
Validate uploaded file using streaming for large files.
Returns:
Tuple of (is_valid, error_message, metadata)
"""
# Define allowed extensions by category
allowed_extensions = {
"document": {".pdf", ".doc", ".docx", ".txt", ".rtf"},
"image": {".jpg", ".jpeg", ".png", ".gif", ".bmp"},
"csv": {".csv", ".txt"},
"archive": {".zip", ".rar", ".7z", ".tar", ".gz"}
}
# Define basic malware patterns
malware_patterns = [
"eval(", "exec(", "system(", "shell_exec(",
"<script", "javascript:", "vbscript:",
"cmd.exe", "powershell.exe"
]
extensions = allowed_extensions.get(category, set())
return await async_file_ops.validate_file_streaming(
file, max_size, extensions, malware_patterns
)
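# --- Illustrative endpoint sketch -------------------------------------------
# A minimal example (not part of this service) showing how validate_large_upload
# and stream_save_upload are intended to compose inside a FastAPI route. The
# router, route path, subdir, and 25 MB limit are assumptions for illustration.
from fastapi import APIRouter, File

example_upload_router = APIRouter()

@example_upload_router.post("/examples/upload-document")
async def example_upload_document(file: UploadFile = File(...)) -> Dict[str, Any]:
    # Validate with streaming so large files never sit fully in memory
    is_valid, error_message, metadata = await validate_large_upload(
        file, category="document", max_size=25 * 1024 * 1024
    )
    if not is_valid:
        raise HTTPException(status_code=400, detail=error_message)
    # Persist under a unique name and return basic metadata to the client
    relative_path, file_size = await stream_save_upload(file, subdir="documents")
    return {
        "path": relative_path,
        "size": file_size,
        "checksum": metadata.get("checksum", ""),
    }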