""" Job Management API Provides lightweight monitoring and management endpoints around `JobRecord`. Notes: - This is not a background worker. It exposes status/history/metrics for jobs recorded by various synchronous operations (e.g., documents batch generation). - Retry creates a new queued record that references the original job. Actual processing is not scheduled here. """ from __future__ import annotations from typing import Any, Dict, List, Optional, Union from datetime import datetime, timezone from uuid import uuid4 from fastapi import APIRouter, Depends, HTTPException, Query, status, Request from pydantic import BaseModel, ConfigDict, Field from sqlalchemy.orm import Session from sqlalchemy import func from app.database.base import get_db from app.auth.security import get_current_user, get_admin_user from app.models.user import User from app.models.jobs import JobRecord from app.services.query_utils import apply_sorting, paginate_with_total, tokenized_ilike_filter from app.services.storage import get_default_storage from app.services.audit import audit_service from app.utils.logging import app_logger router = APIRouter() # -------------------- # Pydantic Schemas # -------------------- class JobRecordResponse(BaseModel): id: int job_id: str job_type: str status: str requested_by_username: Optional[str] = None started_at: Optional[datetime] = None completed_at: Optional[datetime] = None total_requested: int = 0 total_success: int = 0 total_failed: int = 0 has_result_bundle: bool = False bundle_url: Optional[str] = None bundle_size: Optional[int] = None duration_seconds: Optional[float] = None details: Optional[Dict[str, Any]] = None model_config = ConfigDict(from_attributes=True) class PaginatedJobsResponse(BaseModel): items: List[JobRecordResponse] total: int class JobFailRequest(BaseModel): reason: str = Field(..., min_length=1, max_length=1000) details_update: Optional[Dict[str, Any]] = None class JobCompletionUpdate(BaseModel): total_success: 
Optional[int] = None total_failed: Optional[int] = None result_storage_path: Optional[str] = None result_mime_type: Optional[str] = None result_size: Optional[int] = None details_update: Optional[Dict[str, Any]] = None class RetryRequest(BaseModel): note: Optional[str] = None class JobsMetricsResponse(BaseModel): by_status: Dict[str, int] by_type: Dict[str, int] avg_duration_seconds: Optional[float] = None running_count: int failed_last_24h: int completed_last_24h: int # -------------------- # Helpers # -------------------- def _compute_duration_seconds(started_at: Optional[datetime], completed_at: Optional[datetime]) -> Optional[float]: if not started_at or not completed_at: return None try: start_utc = started_at if started_at.tzinfo else started_at.replace(tzinfo=timezone.utc) end_utc = completed_at if completed_at.tzinfo else completed_at.replace(tzinfo=timezone.utc) return max((end_utc - start_utc).total_seconds(), 0.0) except Exception: return None def _to_response( job: JobRecord, *, include_url: bool = False, ) -> JobRecordResponse: has_bundle = bool(getattr(job, "result_storage_path", None)) bundle_url = None if include_url and has_bundle: try: bundle_url = get_default_storage().public_url(job.result_storage_path) # type: ignore[arg-type] except Exception: bundle_url = None return JobRecordResponse( id=job.id, job_id=job.job_id, job_type=job.job_type, status=job.status, requested_by_username=getattr(job, "requested_by_username", None), started_at=getattr(job, "started_at", None), completed_at=getattr(job, "completed_at", None), total_requested=getattr(job, "total_requested", 0) or 0, total_success=getattr(job, "total_success", 0) or 0, total_failed=getattr(job, "total_failed", 0) or 0, has_result_bundle=has_bundle, bundle_url=bundle_url, bundle_size=getattr(job, "result_size", None), duration_seconds=_compute_duration_seconds(getattr(job, "started_at", None), getattr(job, "completed_at", None)), details=getattr(job, "details", None), ) # 
# --------------------
# Endpoints
# --------------------

@router.get("/", response_model=Union[List[JobRecordResponse], PaginatedJobsResponse])
async def list_jobs(
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
    include_total: bool = Query(False, description="When true, returns {items, total} instead of a plain list"),
    include_urls: bool = Query(False, description="Include bundle URLs in responses"),
    status_filter: Optional[str] = Query(None, description="Filter by status"),
    type_filter: Optional[str] = Query(None, description="Filter by job type"),
    requested_by: Optional[str] = Query(None, description="Filter by username"),
    search: Optional[str] = Query(None, description="Tokenized search across job_id, type, status, username"),
    mine: bool = Query(True, description="When true, restricts to current user's jobs (admins can set false)"),
    sort_by: Optional[str] = Query("started", description="Sort by: started, completed, status, type"),
    sort_dir: Optional[str] = Query("desc", description="Sort direction: asc or desc"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Build up a JobRecord query: scope -> filters -> search -> sort -> paginate.
    q = db.query(JobRecord)

    # Scope: non-admin users are always restricted to their own jobs; admins
    # may widen the view by passing mine=false.
    admin = bool(getattr(current_user, "is_admin", False))
    if mine or not admin:
        q = q.filter(JobRecord.requested_by_username == current_user.username)

    if status_filter:
        q = q.filter(JobRecord.status == status_filter)
    if type_filter:
        q = q.filter(JobRecord.job_type == type_filter)
    # Username filter is admin-only (non-admins are already self-scoped).
    if requested_by and admin:
        q = q.filter(JobRecord.requested_by_username == requested_by)

    if search:
        terms = [tok for tok in (search or "").split() if tok]
        expr = tokenized_ilike_filter(terms, [
            JobRecord.job_id,
            JobRecord.job_type,
            JobRecord.status,
            JobRecord.requested_by_username,
        ])
        if expr is not None:
            q = q.filter(expr)

    # Sorting: each key maps to a column list so ties break deterministically.
    q = apply_sorting(
        q,
        sort_by,
        sort_dir,
        allowed={
            "started": [JobRecord.started_at, JobRecord.id],
            "completed": [JobRecord.completed_at, JobRecord.id],
            "status": [JobRecord.status, JobRecord.started_at],
            "type": [JobRecord.job_type, JobRecord.started_at],
        },
    )

    rows, total = paginate_with_total(q, skip, limit, include_total)
    payload = [_to_response(row, include_url=include_urls) for row in rows]
    if include_total:
        return {"items": payload, "total": total or 0}
    return payload


@router.get("/{job_id}", response_model=JobRecordResponse)
async def get_job(
    job_id: str,
    include_url: bool = Query(True),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    # Fetch one job by its public job_id; readable by its owner or an admin.
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    # Authorization: non-admin users can only access their own jobs.
    if not getattr(current_user, "is_admin", False):
        owner = getattr(job, "requested_by_username", None)
        if owner != current_user.username:
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Not enough permissions")

    return _to_response(job, include_url=include_url)


@router.post("/{job_id}/mark-failed", response_model=JobRecordResponse)
async def mark_job_failed(
    job_id: str,
    payload: JobFailRequest,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    # Admin-only: stamp a job as failed and record the failure reason.
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    job.status = "failed"
    job.completed_at = datetime.now(timezone.utc)
    # Merge the failure reason (and any caller-supplied updates) into details.
    merged = dict(getattr(job, "details", {}) or {})
    merged["last_error"] = payload.reason
    if payload.details_update:
        merged.update(payload.details_update)
    job.details = merged
    db.commit()
    db.refresh(job)

    # Audit logging is best-effort; never let it break the response.
    try:
        audit_service.log_action(
            db=db,
            action="FAIL",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={"reason": payload.reason},
            request=request,
        )
    except Exception:
        pass

    return _to_response(job, include_url=True)
@router.post("/{job_id}/mark-running", response_model=JobRecordResponse) async def mark_job_running( job_id: str, request: Request, db: Session = Depends(get_db), current_user: User = Depends(get_admin_user), ): job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first() if not job: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found") job.status = "running" # Reset start time when transitioning to running job.started_at = datetime.now(timezone.utc) job.completed_at = None db.commit() db.refresh(job) try: audit_service.log_action( db=db, action="RUNNING", resource_type="JOB", user=current_user, resource_id=job.job_id, details=None, request=request, ) except Exception: pass return _to_response(job) @router.post("/{job_id}/mark-completed", response_model=JobRecordResponse) async def mark_job_completed( job_id: str, payload: JobCompletionUpdate, request: Request, db: Session = Depends(get_db), current_user: User = Depends(get_admin_user), ): job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first() if not job: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found") job.status = "completed" job.completed_at = datetime.now(timezone.utc) if payload.total_success is not None: job.total_success = max(int(payload.total_success), 0) if payload.total_failed is not None: job.total_failed = max(int(payload.total_failed), 0) if payload.result_storage_path is not None: job.result_storage_path = payload.result_storage_path if payload.result_mime_type is not None: job.result_mime_type = payload.result_mime_type if payload.result_size is not None: job.result_size = max(int(payload.result_size), 0) if payload.details_update: details = dict(getattr(job, "details", {}) or {}) details.update(payload.details_update) job.details = details db.commit() db.refresh(job) try: audit_service.log_action( db=db, action="COMPLETE", resource_type="JOB", user=current_user, resource_id=job.job_id, details={ "total_success": 
job.total_success, "total_failed": job.total_failed, }, request=request, ) except Exception: pass return _to_response(job, include_url=True) @router.post("/{job_id}/retry") async def retry_job( job_id: str, payload: RetryRequest, request: Request, db: Session = Depends(get_db), current_user: User = Depends(get_admin_user), ): """ Create a new queued job record that references the original job. This endpoint does not execute the job; it enables monitoring UIs to track retry intent and external workers to pick it up if/when implemented. """ job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first() if not job: raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found") new_job_id = uuid4().hex new_details = dict(getattr(job, "details", {}) or {}) new_details["retry_of"] = job.job_id if payload.note: new_details["retry_note"] = payload.note cloned = JobRecord( job_id=new_job_id, job_type=job.job_type, status="queued", requested_by_username=current_user.username, started_at=datetime.now(timezone.utc), completed_at=None, total_requested=getattr(job, "total_requested", 0) or 0, total_success=0, total_failed=0, result_storage_path=None, result_mime_type=None, result_size=None, details=new_details, ) db.add(cloned) db.commit() try: audit_service.log_action( db=db, action="RETRY", resource_type="JOB", user=current_user, resource_id=job.job_id, details={"new_job_id": new_job_id}, request=request, ) except Exception: pass return {"message": "Retry created", "job_id": new_job_id} @router.get("/metrics/summary", response_model=JobsMetricsResponse) async def jobs_metrics( db: Session = Depends(get_db), current_user: User = Depends(get_admin_user), ): """ Basic metrics for dashboards/monitoring. 
""" # By status rows = db.query(JobRecord.status, func.count(JobRecord.id)).group_by(JobRecord.status).all() by_status = {str(k or "unknown"): int(v or 0) for k, v in rows} # By type rows = db.query(JobRecord.job_type, func.count(JobRecord.id)).group_by(JobRecord.job_type).all() by_type = {str(k or "unknown"): int(v or 0) for k, v in rows} # Running count try: running_count = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "running").scalar() or 0 except Exception: running_count = 0 # Last 24h stats cutoff = datetime.now(timezone.utc).replace(microsecond=0) try: failed_last_24h = db.query(func.count(JobRecord.id)).filter( JobRecord.status == "failed", (JobRecord.completed_at != None), # noqa: E711 JobRecord.completed_at >= (cutoff.replace(hour=0, minute=0, second=0) - func.cast(1, func.INTEGER)) # type: ignore ).scalar() or 0 except Exception: # Fallback without date condition if backend doesn't support the above cast failed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "failed").scalar() or 0 try: completed_last_24h = db.query(func.count(JobRecord.id)).filter( JobRecord.status == "completed", (JobRecord.completed_at != None), # noqa: E711 JobRecord.completed_at >= (cutoff.replace(hour=0, minute=0, second=0) - func.cast(1, func.INTEGER)) # type: ignore ).scalar() or 0 except Exception: completed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "completed").scalar() or 0 # Average duration on completed try: completed_jobs = db.query(JobRecord.started_at, JobRecord.completed_at).filter(JobRecord.completed_at != None).limit(500).all() # noqa: E711 durations: List[float] = [] for s, c in completed_jobs: d = _compute_duration_seconds(s, c) if d is not None: durations.append(d) avg_duration = (sum(durations) / len(durations)) if durations else None except Exception: avg_duration = None return JobsMetricsResponse( by_status=by_status, by_type=by_type, avg_duration_seconds=(round(avg_duration, 2) if 
isinstance(avg_duration, (int, float)) else None), running_count=int(running_count), failed_last_24h=int(failed_last_24h), completed_last_24h=int(completed_last_24h), )