Files
delphi-database/app/api/jobs.py
HotSwapp bac8cc4bd5 changes
2025-08-18 20:20:04 -05:00

470 lines
16 KiB
Python

"""
Job Management API
Provides lightweight monitoring and management endpoints around `JobRecord`.
Notes:
- This is not a background worker. It exposes status/history/metrics for jobs
recorded by various synchronous operations (e.g., documents batch generation).
- Retry creates a new queued record that references the original job. Actual
processing is not scheduled here.
"""
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Union
from uuid import uuid4

from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy import func
from sqlalchemy.orm import Session

from app.auth.security import get_admin_user, get_current_user
from app.database.base import get_db
from app.models.jobs import JobRecord
from app.models.user import User
from app.services.audit import audit_service
from app.services.query_utils import apply_sorting, paginate_with_total, tokenized_ilike_filter
from app.services.storage import get_default_storage
from app.utils.logging import app_logger
router = APIRouter()
# --------------------
# Pydantic Schemas
# --------------------
class JobRecordResponse(BaseModel):
    """Public API representation of a single `JobRecord` row."""
    id: int
    job_id: str
    job_type: str
    status: str
    requested_by_username: Optional[str] = None
    # Lifecycle timestamps; None until the corresponding transition occurs.
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    # Aggregate counters recorded by the operation that ran the job.
    total_requested: int = 0
    total_success: int = 0
    total_failed: int = 0
    # Result bundle metadata; bundle_url is only resolved when the caller
    # explicitly asks for URLs (see `_to_response(include_url=...)`).
    has_result_bundle: bool = False
    bundle_url: Optional[str] = None
    bundle_size: Optional[int] = None
    # Derived from started_at/completed_at; None while either is missing.
    duration_seconds: Optional[float] = None
    details: Optional[Dict[str, Any]] = None
    # Allow construction directly from ORM instances.
    model_config = ConfigDict(from_attributes=True)
class PaginatedJobsResponse(BaseModel):
    """Envelope returned by the list endpoint when include_total=true."""
    items: List[JobRecordResponse]
    total: int
class JobFailRequest(BaseModel):
    """Payload for marking a job failed: a reason plus optional detail keys."""
    reason: str = Field(..., min_length=1, max_length=1000)
    details_update: Optional[Dict[str, Any]] = None
class JobCompletionUpdate(BaseModel):
    """Payload for marking a job completed; only supplied fields are applied."""
    total_success: Optional[int] = None
    total_failed: Optional[int] = None
    result_storage_path: Optional[str] = None
    result_mime_type: Optional[str] = None
    result_size: Optional[int] = None
    details_update: Optional[Dict[str, Any]] = None
class RetryRequest(BaseModel):
    """Optional free-form note recorded on the cloned retry record."""
    note: Optional[str] = None
class JobsMetricsResponse(BaseModel):
    """Aggregate job metrics for dashboards/monitoring."""
    by_status: Dict[str, int]
    by_type: Dict[str, int]
    # Mean duration over a sample of completed jobs; None when no data.
    avg_duration_seconds: Optional[float] = None
    running_count: int
    failed_last_24h: int
    completed_last_24h: int
# --------------------
# Helpers
# --------------------
def _compute_duration_seconds(started_at: Optional[datetime], completed_at: Optional[datetime]) -> Optional[float]:
if not started_at or not completed_at:
return None
try:
start_utc = started_at if started_at.tzinfo else started_at.replace(tzinfo=timezone.utc)
end_utc = completed_at if completed_at.tzinfo else completed_at.replace(tzinfo=timezone.utc)
return max((end_utc - start_utc).total_seconds(), 0.0)
except Exception:
return None
def _to_response(
    job: JobRecord,
    *,
    include_url: bool = False,
) -> JobRecordResponse:
    """Map a `JobRecord` ORM row onto the API response schema.

    When *include_url* is true and the job has a stored result bundle, a
    public download URL is resolved via the default storage backend; URL
    resolution failures are swallowed and leave ``bundle_url`` as None.
    """
    storage_path = getattr(job, "result_storage_path", None)
    has_bundle = bool(storage_path)
    url: Optional[str] = None
    if has_bundle and include_url:
        try:
            url = get_default_storage().public_url(job.result_storage_path)  # type: ignore[arg-type]
        except Exception:
            url = None
    started = getattr(job, "started_at", None)
    finished = getattr(job, "completed_at", None)
    return JobRecordResponse(
        id=job.id,
        job_id=job.job_id,
        job_type=job.job_type,
        status=job.status,
        requested_by_username=getattr(job, "requested_by_username", None),
        started_at=started,
        completed_at=finished,
        total_requested=getattr(job, "total_requested", 0) or 0,
        total_success=getattr(job, "total_success", 0) or 0,
        total_failed=getattr(job, "total_failed", 0) or 0,
        has_result_bundle=has_bundle,
        bundle_url=url,
        bundle_size=getattr(job, "result_size", None),
        duration_seconds=_compute_duration_seconds(started, finished),
        details=getattr(job, "details", None),
    )
# --------------------
# Endpoints
# --------------------
@router.get("/", response_model=Union[List[JobRecordResponse], PaginatedJobsResponse])
async def list_jobs(
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
    include_total: bool = Query(False, description="When true, returns {items, total} instead of a plain list"),
    include_urls: bool = Query(False, description="Include bundle URLs in responses"),
    status_filter: Optional[str] = Query(None, description="Filter by status"),
    type_filter: Optional[str] = Query(None, description="Filter by job type"),
    requested_by: Optional[str] = Query(None, description="Filter by username"),
    search: Optional[str] = Query(None, description="Tokenized search across job_id, type, status, username"),
    mine: bool = Query(True, description="When true, restricts to current user's jobs (admins can set false)"),
    sort_by: Optional[str] = Query("started", description="Sort by: started, completed, status, type"),
    sort_dir: Optional[str] = Query("desc", description="Sort direction: asc or desc"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List job records with filtering, search, sorting and pagination.

    Non-admin callers are always scoped to their own jobs; admins may pass
    ``mine=false`` to widen the scope and may filter by ``requested_by``.
    """
    is_admin = bool(getattr(current_user, "is_admin", False))
    q = db.query(JobRecord)
    # Visibility scope: only admins may see jobs other than their own.
    if mine or not is_admin:
        q = q.filter(JobRecord.requested_by_username == current_user.username)
    # Exact-match filters.
    if status_filter:
        q = q.filter(JobRecord.status == status_filter)
    if type_filter:
        q = q.filter(JobRecord.job_type == type_filter)
    if requested_by and is_admin:
        q = q.filter(JobRecord.requested_by_username == requested_by)
    # Free-text search: each whitespace-separated token is ILIKE-matched
    # across the listed columns.
    if search:
        search_expr = tokenized_ilike_filter(
            (search or "").split(),
            [
                JobRecord.job_id,
                JobRecord.job_type,
                JobRecord.status,
                JobRecord.requested_by_username,
            ],
        )
        if search_expr is not None:
            q = q.filter(search_expr)
    # Stable sort keys: a secondary column breaks ties deterministically.
    q = apply_sorting(
        q,
        sort_by,
        sort_dir,
        allowed={
            "started": [JobRecord.started_at, JobRecord.id],
            "completed": [JobRecord.completed_at, JobRecord.id],
            "status": [JobRecord.status, JobRecord.started_at],
            "type": [JobRecord.job_type, JobRecord.started_at],
        },
    )
    jobs, total = paginate_with_total(q, skip, limit, include_total)
    items = [_to_response(job, include_url=include_urls) for job in jobs]
    return {"items": items, "total": total or 0} if include_total else items
@router.get("/{job_id}", response_model=JobRecordResponse)
async def get_job(
    job_id: str,
    include_url: bool = Query(True),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Fetch a single job by its public job_id.

    Raises 404 when the job does not exist and 403 when a non-admin
    requests a job they did not create.
    """
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    is_admin = getattr(current_user, "is_admin", False)
    owner = getattr(job, "requested_by_username", None)
    if not is_admin and owner != current_user.username:
        raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Not enough permissions")
    return _to_response(job, include_url=include_url)
@router.post("/{job_id}/mark-failed", response_model=JobRecordResponse)
async def mark_job_failed(
    job_id: str,
    payload: JobFailRequest,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Admin: transition a job to 'failed', recording the failure reason.

    The reason is stored under ``details['last_error']``; any extra keys in
    ``details_update`` are merged on top. The action is audited best-effort.
    """
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    merged = dict(getattr(job, "details", {}) or {})
    merged["last_error"] = payload.reason
    if payload.details_update:
        merged.update(payload.details_update)
    job.status = "failed"
    job.completed_at = datetime.now(timezone.utc)
    job.details = merged
    db.commit()
    db.refresh(job)
    # Audit logging is best-effort: a logging failure must not fail the request.
    try:
        audit_service.log_action(
            db=db,
            action="FAIL",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={"reason": payload.reason},
            request=request,
        )
    except Exception:
        pass
    return _to_response(job, include_url=True)
@router.post("/{job_id}/mark-running", response_model=JobRecordResponse)
async def mark_job_running(
    job_id: str,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Admin: transition a job to 'running'.

    The start timestamp is reset to now and any prior completion timestamp
    is cleared. The action is audited best-effort.
    """
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    # Restart the clock on every transition into 'running'.
    job.status = "running"
    job.started_at = datetime.now(timezone.utc)
    job.completed_at = None
    db.commit()
    db.refresh(job)
    # Audit logging is best-effort: a logging failure must not fail the request.
    try:
        audit_service.log_action(
            db=db,
            action="RUNNING",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details=None,
            request=request,
        )
    except Exception:
        pass
    return _to_response(job)
@router.post("/{job_id}/mark-completed", response_model=JobRecordResponse)
async def mark_job_completed(
    job_id: str,
    payload: JobCompletionUpdate,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Admin: transition a job to 'completed' and record result metadata.

    Only fields present in the payload are applied; numeric counters are
    clamped to be non-negative. ``details_update`` keys are merged into the
    existing details dict. The action is audited best-effort.
    """
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if job is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    job.status = "completed"
    job.completed_at = datetime.now(timezone.utc)
    # Counters and sizes: clamp supplied values at zero.
    for attr, value in (
        ("total_success", payload.total_success),
        ("total_failed", payload.total_failed),
        ("result_size", payload.result_size),
    ):
        if value is not None:
            setattr(job, attr, max(int(value), 0))
    # Plain pass-through fields.
    for attr, value in (
        ("result_storage_path", payload.result_storage_path),
        ("result_mime_type", payload.result_mime_type),
    ):
        if value is not None:
            setattr(job, attr, value)
    if payload.details_update:
        merged = dict(getattr(job, "details", {}) or {})
        merged.update(payload.details_update)
        job.details = merged
    db.commit()
    db.refresh(job)
    # Audit logging is best-effort: a logging failure must not fail the request.
    try:
        audit_service.log_action(
            db=db,
            action="COMPLETE",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={
                "total_success": job.total_success,
                "total_failed": job.total_failed,
            },
            request=request,
        )
    except Exception:
        pass
    return _to_response(job, include_url=True)
@router.post("/{job_id}/retry")
async def retry_job(
    job_id: str,
    payload: RetryRequest,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """
    Create a new queued job record that references the original job.

    This endpoint does not execute anything; it records retry intent so
    monitoring UIs can track it and an external worker can pick it up
    if/when one is implemented. Returns the new job's id.
    """
    original = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if original is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
    new_job_id = uuid4().hex
    # Carry the original's details forward and link back to it.
    carried_details = dict(getattr(original, "details", {}) or {})
    carried_details["retry_of"] = original.job_id
    if payload.note:
        carried_details["retry_note"] = payload.note
    # NOTE(review): started_at is stamped at creation even though the record
    # is 'queued' — preserved as-is; confirm whether a worker resets it.
    retry_record = JobRecord(
        job_id=new_job_id,
        job_type=original.job_type,
        status="queued",
        requested_by_username=current_user.username,
        started_at=datetime.now(timezone.utc),
        completed_at=None,
        total_requested=getattr(original, "total_requested", 0) or 0,
        total_success=0,
        total_failed=0,
        result_storage_path=None,
        result_mime_type=None,
        result_size=None,
        details=carried_details,
    )
    db.add(retry_record)
    db.commit()
    # Audit logging is best-effort: a logging failure must not fail the request.
    try:
        audit_service.log_action(
            db=db,
            action="RETRY",
            resource_type="JOB",
            user=current_user,
            resource_id=original.job_id,
            details={"new_job_id": new_job_id},
            request=request,
        )
    except Exception:
        pass
    return {"message": "Retry created", "job_id": new_job_id}
@router.get("/metrics/summary", response_model=JobsMetricsResponse)
async def jobs_metrics(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """
    Basic metrics for dashboards/monitoring (admin only).

    Returns counts grouped by status and by type, the number of currently
    running jobs, failed/completed counts over the trailing 24 hours, and
    the average duration over a bounded sample of completed jobs.
    """
    # Counts grouped by status.
    rows = db.query(JobRecord.status, func.count(JobRecord.id)).group_by(JobRecord.status).all()
    by_status = {str(k or "unknown"): int(v or 0) for k, v in rows}
    # Counts grouped by job type.
    rows = db.query(JobRecord.job_type, func.count(JobRecord.id)).group_by(JobRecord.job_type).all()
    by_type = {str(k or "unknown"): int(v or 0) for k, v in rows}
    # Currently running jobs.
    try:
        running_count = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "running").scalar() or 0
    except Exception:
        running_count = 0
    # BUGFIX: the previous implementation subtracted `func.cast(1, func.INTEGER)`
    # (not a valid SQL type expression) from a midnight-truncated datetime, which
    # raised on execution so the fallback counted ALL failed/completed jobs
    # instead of the last 24 hours. Compute the cutoff in Python instead —
    # portable across backends.
    cutoff = datetime.now(timezone.utc) - timedelta(hours=24)

    def _count_completed_since(status_value: str) -> int:
        # Jobs with the given status whose completion falls inside the window.
        try:
            return db.query(func.count(JobRecord.id)).filter(
                JobRecord.status == status_value,
                (JobRecord.completed_at != None),  # noqa: E711
                JobRecord.completed_at >= cutoff,
            ).scalar() or 0
        except Exception:
            # Fallback: all-time count if the backend rejects the date filter.
            return db.query(func.count(JobRecord.id)).filter(JobRecord.status == status_value).scalar() or 0

    failed_last_24h = _count_completed_since("failed")
    completed_last_24h = _count_completed_since("completed")
    # Average duration over a bounded sample of completed jobs.
    try:
        completed_jobs = db.query(JobRecord.started_at, JobRecord.completed_at).filter(JobRecord.completed_at != None).limit(500).all()  # noqa: E711
        durations: List[float] = [
            d for d in (_compute_duration_seconds(s, c) for s, c in completed_jobs) if d is not None
        ]
        avg_duration = (sum(durations) / len(durations)) if durations else None
    except Exception:
        avg_duration = None
    return JobsMetricsResponse(
        by_status=by_status,
        by_type=by_type,
        avg_duration_seconds=(round(avg_duration, 2) if isinstance(avg_duration, (int, float)) else None),
        running_count=int(running_count),
        failed_last_24h=int(failed_last_24h),
        completed_last_24h=int(completed_last_24h),
    )