app/services/async_file_operations.py (new file, +527 lines)
@@ -0,0 +1,527 @@
"""
Async file operations service for handling large files efficiently.

Provides streaming file operations, chunked processing, and progress tracking
to improve performance with large files and prevent memory exhaustion.
"""
import asyncio
import aiofiles
import os
import hashlib
import uuid
from pathlib import Path
from typing import AsyncGenerator, Callable, Optional, Tuple, Dict, Any

from fastapi import UploadFile, HTTPException

from app.config import settings
from app.utils.logging import get_logger

logger = get_logger("async_file_ops")

# Configuration constants
CHUNK_SIZE = 64 * 1024  # 64KB chunks for streaming
LARGE_FILE_THRESHOLD = 10 * 1024 * 1024  # 10MB - files larger than this use streaming
MAX_MEMORY_BUFFER = 50 * 1024 * 1024  # 50MB - max memory buffer for file operations


class AsyncFileOperations:
    """
    Service for handling large file operations asynchronously with streaming support.

    Features:
    - Streaming file uploads/downloads
    - Chunked processing for large files
    - Progress tracking callbacks
    - Memory-efficient operations
    - Async file validation
    """

    def __init__(self, base_upload_dir: Optional[str] = None):
        self.base_upload_dir = Path(base_upload_dir or settings.upload_dir)
        self.base_upload_dir.mkdir(parents=True, exist_ok=True)
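
    # Usage sketch (an assumption, not part of this commit): tests or one-off jobs
    # can point the service at a different root instead of settings.upload_dir:
    #
    #     temp_ops = AsyncFileOperations(base_upload_dir="/tmp/uploads_test")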

    async def stream_upload_file(
        self,
        file: UploadFile,
        destination_path: str,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        validate_callback: Optional[Callable[[bytes], None]] = None
    ) -> Tuple[str, int, str]:
        """
        Stream an uploaded file to its destination with progress tracking.

        Args:
            file: The uploaded file
            destination_path: Relative path where the file should be saved
            progress_callback: Optional callback for progress tracking (bytes_read, total_size)
            validate_callback: Optional callback for chunk validation

        Returns:
            Tuple of (final_path, file_size, checksum)
        """
        final_path = self.base_upload_dir / destination_path
        final_path.parent.mkdir(parents=True, exist_ok=True)

        file_size = 0
        checksum = hashlib.sha256()

        try:
            async with aiofiles.open(final_path, 'wb') as dest_file:
                # Reset file pointer to beginning
                await file.seek(0)

                while True:
                    chunk = await file.read(CHUNK_SIZE)
                    if not chunk:
                        break

                    # Update size and checksum
                    file_size += len(chunk)
                    checksum.update(chunk)

                    # Optional chunk validation
                    if validate_callback:
                        try:
                            validate_callback(chunk)
                        except Exception as e:
                            logger.warning(f"Chunk validation failed: {str(e)}")
                            raise HTTPException(status_code=400, detail=f"File validation failed: {str(e)}")

                    # Write chunk asynchronously
                    await dest_file.write(chunk)

                    # Progress callback; the total size of an UploadFile is not known
                    # in advance, so bytes written so far is reported for both values
                    if progress_callback:
                        progress_callback(file_size, file_size)

                    # Yield control to prevent blocking
                    await asyncio.sleep(0)

        except Exception as e:
            # Clean up partial file on error
            if final_path.exists():
                try:
                    final_path.unlink()
                except OSError:
                    pass
            # Preserve validation errors (HTTP 400) instead of masking them as 500s
            if isinstance(e, HTTPException):
                raise
            raise HTTPException(status_code=500, detail=f"File upload failed: {str(e)}")

        return str(final_path), file_size, checksum.hexdigest()
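
    # Usage sketch (an assumption, not part of this commit): a route handler could
    # stream an upload to disk while rejecting chunks that contain NUL bytes. The
    # destination path and reject_binary helper are illustrative.
    #
    #     def reject_binary(chunk: bytes) -> None:
    #         if b"\x00" in chunk:
    #             raise ValueError("binary content not allowed")
    #
    #     path, size, sha256 = await async_file_ops.stream_upload_file(
    #         file=upload,
    #         destination_path=f"documents/{uuid.uuid4().hex}.txt",
    #         validate_callback=reject_binary,
    #     )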

    async def stream_read_file(
        self,
        file_path: str,
        chunk_size: int = CHUNK_SIZE
    ) -> AsyncGenerator[bytes, None]:
        """
        Stream read file in chunks.

        Args:
            file_path: Path to the file to read
            chunk_size: Size of chunks to read

        Yields:
            File content chunks
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            raise HTTPException(status_code=404, detail="File not found")

        try:
            async with aiofiles.open(full_path, 'rb') as file:
                while True:
                    chunk = await file.read(chunk_size)
                    if not chunk:
                        break
                    yield chunk
                    # Yield control
                    await asyncio.sleep(0)
        except Exception as e:
            logger.error(f"Failed to stream read file {file_path}: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Failed to read file: {str(e)}")
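
    # Serving sketch (an assumption, not part of this commit): because
    # stream_read_file is an async generator, it can back a FastAPI
    # StreamingResponse directly, so large downloads never load fully into memory.
    #
    #     from fastapi.responses import StreamingResponse
    #
    #     return StreamingResponse(
    #         async_file_ops.stream_read_file("documents/report.pdf"),
    #         media_type="application/pdf",
    #     )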

    async def validate_file_streaming(
        self,
        file: UploadFile,
        max_size: Optional[int] = None,
        allowed_extensions: Optional[set] = None,
        malware_patterns: Optional[list] = None
    ) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Validate a file using streaming so large files are handled efficiently.

        Args:
            file: The uploaded file
            max_size: Maximum allowed file size in bytes
            allowed_extensions: Set of allowed file extensions
            malware_patterns: List of malware patterns to check for

        Returns:
            Tuple of (is_valid, error_message, file_metadata)
        """
        metadata = {
            "filename": file.filename,
            "size": 0,
            "checksum": "",
            "content_type": file.content_type
        }

        # Check filename and extension
        if not file.filename:
            return False, "No filename provided", metadata

        file_ext = Path(file.filename).suffix.lower()
        if allowed_extensions and file_ext not in allowed_extensions:
            return False, f"File extension {file_ext} not allowed", metadata

        # Stream validation
        checksum = hashlib.sha256()
        file_size = 0
        first_chunk = b""

        try:
            await file.seek(0)

            # Read and validate in chunks
            is_first_chunk = True
            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    break

                file_size += len(chunk)
                checksum.update(chunk)

                # Store first chunk for content type detection
                if is_first_chunk:
                    first_chunk = chunk
                    is_first_chunk = False

                # Check size limit
                if max_size and file_size > max_size:
                    # Standardized message to match envelope tests
                    return False, "File too large", metadata

                # Check for malware patterns; a pattern split across a chunk
                # boundary is not detected by this per-chunk scan
                if malware_patterns:
                    chunk_str = chunk.decode('utf-8', errors='ignore').lower()
                    for pattern in malware_patterns:
                        if pattern in chunk_str:
                            return False, "Malicious content detected", metadata

                # Yield control
                await asyncio.sleep(0)

            # Update metadata
            metadata.update({
                "size": file_size,
                "checksum": checksum.hexdigest(),
                "first_chunk": first_chunk[:512]  # First 512 bytes for content detection
            })

            return True, "", metadata

        except Exception as e:
            logger.error(f"File validation failed: {str(e)}")
            return False, f"Validation error: {str(e)}", metadata
        finally:
            # Reset file pointer for subsequent reads
            await file.seek(0)
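
    # Usage sketch (an assumption, not part of this commit): the returned metadata
    # carries the first 512 bytes, which a caller could use for a magic-byte check
    # on top of the extension filter. The PDF signature check below is illustrative.
    #
    #     ok, error, meta = await async_file_ops.validate_file_streaming(
    #         file=upload,
    #         max_size=25 * 1024 * 1024,
    #         allowed_extensions={".pdf"},
    #     )
    #     if ok and not meta["first_chunk"].startswith(b"%PDF-"):
    #         ok, error = False, "File content does not match the .pdf extension"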

    async def process_csv_file_streaming(
        self,
        file: UploadFile,
        row_processor: Callable[[str], Any],
        progress_callback: Optional[Callable[[int], None]] = None,
        batch_size: int = 1000
    ) -> Tuple[int, int, list]:
        """
        Process a CSV file in streaming fashion, suitable for large files.

        Args:
            file: The CSV file to process
            row_processor: Function to process each row
            progress_callback: Optional callback for progress (rows_processed)
            batch_size: Number of rows to process in each batch

        Returns:
            Tuple of (total_rows, successful_rows, errors)
        """
        total_rows = 0
        successful_rows = 0
        errors = []
        batch = []

        try:
            await file.seek(0)

            # Read file in chunks and process line by line
            buffer = ""
            header_processed = False

            while True:
                chunk = await file.read(CHUNK_SIZE)
                if not chunk:
                    # Process remaining buffer
                    if buffer.strip():
                        for line in buffer.split('\n'):
                            if line.strip():
                                success = await self._process_csv_line(
                                    line, row_processor, batch, batch_size,
                                    total_rows, successful_rows, errors,
                                    progress_callback, header_processed
                                )
                                total_rows += 1
                                if success:
                                    successful_rows += 1
                                if not header_processed:
                                    header_processed = True
                    break

                # Decode chunk and add to buffer; a multi-byte sequence split
                # across chunks falls back to replacement characters
                try:
                    chunk_text = chunk.decode('utf-8')
                except UnicodeDecodeError:
                    chunk_text = chunk.decode('utf-8', errors='replace')

                buffer += chunk_text

                # Process complete lines
                while '\n' in buffer:
                    line, buffer = buffer.split('\n', 1)

                    if line.strip():  # Skip empty lines
                        success = await self._process_csv_line(
                            line, row_processor, batch, batch_size,
                            total_rows, successful_rows, errors,
                            progress_callback, header_processed
                        )

                        total_rows += 1
                        if success:
                            successful_rows += 1

                        if not header_processed:
                            header_processed = True

                # Yield control
                await asyncio.sleep(0)

            # Process any remaining batch
            if batch:
                await self._process_csv_batch(batch, row_processor, errors)

        except Exception as e:
            logger.error(f"CSV processing failed: {str(e)}")
            errors.append(f"Processing error: {str(e)}")

        return total_rows, successful_rows, errors
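
    # Usage sketch (an assumption, not part of this commit): the caller supplies
    # per-row logic as row_processor, which is invoked once per data row (the
    # header is skipped). parse_contact_row and the column layout are illustrative.
    #
    #     def parse_contact_row(line: str) -> dict:
    #         name, email = [part.strip() for part in line.split(",")[:2]]
    #         return {"name": name, "email": email}
    #
    #     total, ok, errors = await async_file_ops.process_csv_file_streaming(
    #         file=upload,
    #         row_processor=parse_contact_row,
    #         batch_size=500,
    #     )
    #     logger.info(f"processed {ok}/{total} rows, {len(errors)} errors")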

    async def _process_csv_line(
        self,
        line: str,
        row_processor: Callable,
        batch: list,
        batch_size: int,
        total_rows: int,
        successful_rows: int,
        errors: list,
        progress_callback: Optional[Callable],
        header_processed: bool
    ) -> bool:
        """Process a single CSV line"""
        try:
            # Skip header row
            if not header_processed:
                return True

            # Add to batch
            batch.append(line)

            # Process batch when full
            if len(batch) >= batch_size:
                await self._process_csv_batch(batch, row_processor, errors)
                batch.clear()

            # Progress callback
            if progress_callback:
                progress_callback(total_rows)

            return True

        except Exception as e:
            errors.append(f"Row {total_rows}: {str(e)}")
            return False

    async def _process_csv_batch(self, batch: list, row_processor: Callable[[str], Any], errors: list):
        """Process a batch of CSV rows with the caller-supplied row processor"""
        try:
            for line in batch:
                result = row_processor(line)
                # Support both sync and async row processors
                if asyncio.iscoroutine(result):
                    await result
        except Exception as e:
            errors.append(f"Batch processing error: {str(e)}")

    async def copy_file_async(
        self,
        source_path: str,
        destination_path: str,
        progress_callback: Optional[Callable[[int, int], None]] = None
    ) -> bool:
        """
        Copy file asynchronously with progress tracking.

        Args:
            source_path: Source file path
            destination_path: Destination file path
            progress_callback: Optional progress callback

        Returns:
            True if successful, False otherwise
        """
        source = self.base_upload_dir / source_path
        destination = self.base_upload_dir / destination_path

        if not source.exists():
            logger.error(f"Source file does not exist: {source}")
            return False

        try:
            # Create destination directory
            destination.parent.mkdir(parents=True, exist_ok=True)

            file_size = source.stat().st_size
            bytes_copied = 0

            async with aiofiles.open(source, 'rb') as src_file:
                async with aiofiles.open(destination, 'wb') as dest_file:
                    while True:
                        chunk = await src_file.read(CHUNK_SIZE)
                        if not chunk:
                            break

                        await dest_file.write(chunk)
                        bytes_copied += len(chunk)

                        if progress_callback:
                            progress_callback(bytes_copied, file_size)

                        # Yield control
                        await asyncio.sleep(0)

            return True

        except Exception as e:
            logger.error(f"Failed to copy file {source} to {destination}: {str(e)}")
            return False
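
    # Progress sketch (an assumption, not part of this commit): unlike uploads, a
    # copy knows the total size up front, so the callback can report a percentage.
    #
    #     def log_copy_progress(bytes_copied: int, total_bytes: int) -> None:
    #         percent = (bytes_copied / total_bytes) * 100 if total_bytes else 100.0
    #         logger.debug(f"copy progress: {percent:.1f}%")
    #
    #     copied = await async_file_ops.copy_file_async(
    #         "documents/report.pdf", "backups/report.pdf", log_copy_progress
    #     )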

    async def get_file_info_async(self, file_path: str) -> Optional[Dict[str, Any]]:
        """
        Get file information asynchronously.

        Args:
            file_path: Path to the file

        Returns:
            File information dictionary or None if file doesn't exist
        """
        full_path = self.base_upload_dir / file_path

        if not full_path.exists():
            return None

        try:
            stat = full_path.stat()

            # Calculate checksum for smaller files
            checksum = None
            if stat.st_size <= LARGE_FILE_THRESHOLD:
                checksum = hashlib.sha256()
                async with aiofiles.open(full_path, 'rb') as file:
                    while True:
                        chunk = await file.read(CHUNK_SIZE)
                        if not chunk:
                            break
                        checksum.update(chunk)
                        await asyncio.sleep(0)
                checksum = checksum.hexdigest()

            return {
                "path": file_path,
                "size": stat.st_size,
                "created": stat.st_ctime,
                "modified": stat.st_mtime,
                "checksum": checksum,
                "is_large_file": stat.st_size > LARGE_FILE_THRESHOLD
            }

        except Exception as e:
            logger.error(f"Failed to get file info for {file_path}: {str(e)}")
            return None


# Global instance
async_file_ops = AsyncFileOperations()
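

# Minimal runnable sketch (an assumption, not part of the original commit): a
# helper that backs up a stored file and returns the copy's metadata, exercising
# copy_file_async and get_file_info_async together. The paths are illustrative
# and the function is not called anywhere in this module.
async def _demo_copy_and_inspect(source_path: str = "documents/report.pdf") -> Optional[Dict[str, Any]]:
    """Copy `source_path` to a backups/ sibling and return the copy's file info."""
    backup_path = f"backups/{Path(source_path).name}"
    copied = await async_file_ops.copy_file_async(source_path, backup_path)
    if not copied:
        return None
    return await async_file_ops.get_file_info_async(backup_path)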


# Utility functions for backward compatibility
async def stream_save_upload(
    file: UploadFile,
    subdir: str,
    filename_override: Optional[str] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None
) -> Tuple[str, int]:
    """
    Save uploaded file using streaming operations.

    Returns:
        Tuple of (relative_path, file_size)
    """
    # Generate safe filename
    safe_filename = filename_override or file.filename
    if not safe_filename:
        safe_filename = f"upload_{uuid.uuid4().hex}"

    # Create unique filename to prevent conflicts
    unique_filename = f"{uuid.uuid4().hex}_{safe_filename}"
    relative_path = f"{subdir}/{unique_filename}"

    final_path, file_size, checksum = await async_file_ops.stream_upload_file(
        file, relative_path, progress_callback
    )

    return relative_path, file_size


async def validate_large_upload(
    file: UploadFile,
    category: str = "document",
    max_size: Optional[int] = None
) -> Tuple[bool, str, Dict[str, Any]]:
    """
    Validate uploaded file using streaming for large files.

    Returns:
        Tuple of (is_valid, error_message, metadata)
    """
    # Define allowed extensions by category
    allowed_extensions = {
        "document": {".pdf", ".doc", ".docx", ".txt", ".rtf"},
        "image": {".jpg", ".jpeg", ".png", ".gif", ".bmp"},
        "csv": {".csv", ".txt"},
        "archive": {".zip", ".rar", ".7z", ".tar", ".gz"}
    }

    # Define basic malware patterns
    malware_patterns = [
        "eval(", "exec(", "system(", "shell_exec(",
        "<script", "javascript:", "vbscript:",
        "cmd.exe", "powershell.exe"
    ]

    extensions = allowed_extensions.get(category, set())

    return await async_file_ops.validate_file_streaming(
        file, max_size, extensions, malware_patterns
    )
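

# Route sketch (an assumption, not part of this commit): how an upload endpoint
# might combine validate_large_upload and stream_save_upload. The router, route
# path, and 100MB limit are illustrative.
#
#     from fastapi import APIRouter, File
#
#     router = APIRouter()
#
#     @router.post("/uploads/documents")
#     async def upload_document(file: UploadFile = File(...)):
#         is_valid, error, meta = await validate_large_upload(
#             file, category="document", max_size=100 * 1024 * 1024
#         )
#         if not is_valid:
#             raise HTTPException(status_code=400, detail=error)
#         relative_path, size = await stream_save_upload(file, subdir="documents")
#         return {"path": relative_path, "size": size, "checksum": meta["checksum"]}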