"""
|
|
Job Management API
|
|
|
|
Provides lightweight monitoring and management endpoints around `JobRecord`.
|
|
|
|
Notes:
|
|
- This is not a background worker. It exposes status/history/metrics for jobs
|
|
recorded by various synchronous operations (e.g., documents batch generation).
|
|
- Retry creates a new queued record that references the original job. Actual
|
|
processing is not scheduled here.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Dict, List, Optional, Union
|
|
from datetime import datetime, timezone
|
|
from uuid import uuid4
|
|
|
|
from fastapi import APIRouter, Depends, HTTPException, Query, status, Request
|
|
from pydantic import BaseModel, ConfigDict, Field
|
|
from sqlalchemy.orm import Session
|
|
from sqlalchemy import func
|
|
|
|
from app.database.base import get_db
|
|
from app.auth.security import get_current_user, get_admin_user
|
|
from app.models.user import User
|
|
from app.models.jobs import JobRecord
|
|
from app.services.query_utils import apply_sorting, paginate_with_total, tokenized_ilike_filter
|
|
from app.services.storage import get_default_storage
|
|
from app.services.audit import audit_service
|
|
from app.utils.logging import app_logger
|
|
|
|
|
|
router = APIRouter()
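

# ---------------------------------------------------------------------------
# Illustrative sketch only (not used by any endpoint below): how a synchronous
# operation such as a documents batch generation might record the JobRecord
# rows this API exposes. The job_type label and the commit timing are
# assumptions; the field names mirror the JobRecord usage in retry_job() below.
# ---------------------------------------------------------------------------
def _example_record_batch_job(db: Session, *, username: str, total_requested: int) -> JobRecord:
    """Create a 'running' JobRecord for a synchronous batch operation (example only)."""
    record = JobRecord(
        job_id=uuid4().hex,
        job_type="documents_batch_generation",  # assumed label, not defined in this module
        status="running",
        requested_by_username=username,
        started_at=datetime.now(timezone.utc),
        total_requested=total_requested,
        total_success=0,
        total_failed=0,
        details={},
    )
    db.add(record)
    db.commit()
    return record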


# --------------------
# Pydantic Schemas
# --------------------

class JobRecordResponse(BaseModel):
    id: int
    job_id: str
    job_type: str
    status: str
    requested_by_username: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    total_requested: int = 0
    total_success: int = 0
    total_failed: int = 0
    has_result_bundle: bool = False
    bundle_url: Optional[str] = None
    bundle_size: Optional[int] = None
    duration_seconds: Optional[float] = None
    details: Optional[Dict[str, Any]] = None

    model_config = ConfigDict(from_attributes=True)


class PaginatedJobsResponse(BaseModel):
    items: List[JobRecordResponse]
    total: int


class JobFailRequest(BaseModel):
    reason: str = Field(..., min_length=1, max_length=1000)
    details_update: Optional[Dict[str, Any]] = None


class JobCompletionUpdate(BaseModel):
    total_success: Optional[int] = None
    total_failed: Optional[int] = None
    result_storage_path: Optional[str] = None
    result_mime_type: Optional[str] = None
    result_size: Optional[int] = None
    details_update: Optional[Dict[str, Any]] = None


class RetryRequest(BaseModel):
    note: Optional[str] = None


class JobsMetricsResponse(BaseModel):
    by_status: Dict[str, int]
    by_type: Dict[str, int]
    avg_duration_seconds: Optional[float] = None
    running_count: int
    failed_last_24h: int
    completed_last_24h: int


# --------------------
# Helpers
# --------------------

def _compute_duration_seconds(started_at: Optional[datetime], completed_at: Optional[datetime]) -> Optional[float]:
    """Return the job duration in seconds, treating naive datetimes as UTC."""
    if not started_at or not completed_at:
        return None
    try:
        start_utc = started_at if started_at.tzinfo else started_at.replace(tzinfo=timezone.utc)
        end_utc = completed_at if completed_at.tzinfo else completed_at.replace(tzinfo=timezone.utc)
        return max((end_utc - start_utc).total_seconds(), 0.0)
    except Exception:
        return None


def _to_response(
    job: JobRecord,
    *,
    include_url: bool = False,
) -> JobRecordResponse:
    """Map a JobRecord row to the response schema, optionally resolving a public bundle URL."""
    has_bundle = bool(getattr(job, "result_storage_path", None))
    bundle_url = None
    if include_url and has_bundle:
        try:
            bundle_url = get_default_storage().public_url(job.result_storage_path)  # type: ignore[arg-type]
        except Exception:
            bundle_url = None
    return JobRecordResponse(
        id=job.id,
        job_id=job.job_id,
        job_type=job.job_type,
        status=job.status,
        requested_by_username=getattr(job, "requested_by_username", None),
        started_at=getattr(job, "started_at", None),
        completed_at=getattr(job, "completed_at", None),
        total_requested=getattr(job, "total_requested", 0) or 0,
        total_success=getattr(job, "total_success", 0) or 0,
        total_failed=getattr(job, "total_failed", 0) or 0,
        has_result_bundle=has_bundle,
        bundle_url=bundle_url,
        bundle_size=getattr(job, "result_size", None),
        duration_seconds=_compute_duration_seconds(getattr(job, "started_at", None), getattr(job, "completed_at", None)),
        details=getattr(job, "details", None),
    )


# --------------------
# Endpoints
# --------------------

@router.get("/", response_model=Union[List[JobRecordResponse], PaginatedJobsResponse])
async def list_jobs(
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
    include_total: bool = Query(False, description="When true, returns {items, total} instead of a plain list"),
    include_urls: bool = Query(False, description="Include bundle URLs in responses"),
    status_filter: Optional[str] = Query(None, description="Filter by status"),
    type_filter: Optional[str] = Query(None, description="Filter by job type"),
    requested_by: Optional[str] = Query(None, description="Filter by username (admins only)"),
    search: Optional[str] = Query(None, description="Tokenized search across job_id, type, status, username"),
    mine: bool = Query(True, description="When true, restricts to current user's jobs (admins can set false)"),
    sort_by: Optional[str] = Query("started", description="Sort by: started, completed, status, type"),
    sort_dir: Optional[str] = Query("desc", description="Sort direction: asc or desc"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """List job records with optional filtering, search, sorting, and pagination."""
    query = db.query(JobRecord)

    # Scope: non-admin users are always restricted to their own jobs
    is_admin = bool(getattr(current_user, "is_admin", False))
    if mine or not is_admin:
        query = query.filter(JobRecord.requested_by_username == current_user.username)

    if status_filter:
        query = query.filter(JobRecord.status == status_filter)
    if type_filter:
        query = query.filter(JobRecord.job_type == type_filter)
    if requested_by and is_admin:
        query = query.filter(JobRecord.requested_by_username == requested_by)

    if search:
        tokens = [t for t in search.split() if t]
        filter_expr = tokenized_ilike_filter(tokens, [
            JobRecord.job_id,
            JobRecord.job_type,
            JobRecord.status,
            JobRecord.requested_by_username,
        ])
        if filter_expr is not None:
            query = query.filter(filter_expr)

    # Sorting
    query = apply_sorting(
        query,
        sort_by,
        sort_dir,
        allowed={
            "started": [JobRecord.started_at, JobRecord.id],
            "completed": [JobRecord.completed_at, JobRecord.id],
            "status": [JobRecord.status, JobRecord.started_at],
            "type": [JobRecord.job_type, JobRecord.started_at],
        },
    )

    jobs, total = paginate_with_total(query, skip, limit, include_total)
    items = [_to_response(j, include_url=include_urls) for j in jobs]
    if include_total:
        return {"items": items, "total": total or 0}
    return items
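

# Illustrative usage sketch only. It assumes this router is mounted under a
# "/jobs" prefix and that the application object lives at app.main:app; both
# are assumptions about the wider project, not anything defined in this module.
def _example_list_jobs_call() -> dict:
    """Call the listing endpoint with pagination metadata enabled (example only)."""
    from fastapi.testclient import TestClient  # local import: example only
    from app.main import app  # assumed application entrypoint

    client = TestClient(app)
    # A real call would also need whatever auth headers get_current_user expects.
    resp = client.get("/jobs/", params={"include_total": True, "limit": 20})
    return resp.json()  # {"items": [...], "total": <int>} because include_total is true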


@router.get("/{job_id}", response_model=JobRecordResponse)
async def get_job(
    job_id: str,
    include_url: bool = Query(True),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    """Return a single job record by its job_id."""
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if not job:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    # Authorization: non-admin users can only access their own jobs
    if not getattr(current_user, "is_admin", False):
        if getattr(job, "requested_by_username", None) != current_user.username:
            raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Not enough permissions")

    return _to_response(job, include_url=include_url)


@router.post("/{job_id}/mark-failed", response_model=JobRecordResponse)
async def mark_job_failed(
    job_id: str,
    payload: JobFailRequest,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Mark a job as failed and record the failure reason in its details (admin only)."""
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if not job:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    job.status = "failed"
    job.completed_at = datetime.now(timezone.utc)
    details = dict(getattr(job, "details", {}) or {})
    details["last_error"] = payload.reason
    if payload.details_update:
        details.update(payload.details_update)
    job.details = details
    db.commit()
    db.refresh(job)

    # Audit logging is best-effort and must not break the response.
    try:
        audit_service.log_action(
            db=db,
            action="FAIL",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={"reason": payload.reason},
            request=request,
        )
    except Exception:
        pass

    return _to_response(job, include_url=True)


@router.post("/{job_id}/mark-running", response_model=JobRecordResponse)
async def mark_job_running(
    job_id: str,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Mark a job as running, resetting its start time (admin only)."""
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if not job:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    job.status = "running"
    # Reset the start time when transitioning to running
    job.started_at = datetime.now(timezone.utc)
    job.completed_at = None
    db.commit()
    db.refresh(job)

    # Audit logging is best-effort and must not break the response.
    try:
        audit_service.log_action(
            db=db,
            action="RUNNING",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details=None,
            request=request,
        )
    except Exception:
        pass

    return _to_response(job)


@router.post("/{job_id}/mark-completed", response_model=JobRecordResponse)
async def mark_job_completed(
    job_id: str,
    payload: JobCompletionUpdate,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """Mark a job as completed and optionally attach result metadata (admin only)."""
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if not job:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    job.status = "completed"
    job.completed_at = datetime.now(timezone.utc)
    if payload.total_success is not None:
        job.total_success = max(int(payload.total_success), 0)
    if payload.total_failed is not None:
        job.total_failed = max(int(payload.total_failed), 0)
    if payload.result_storage_path is not None:
        job.result_storage_path = payload.result_storage_path
    if payload.result_mime_type is not None:
        job.result_mime_type = payload.result_mime_type
    if payload.result_size is not None:
        job.result_size = max(int(payload.result_size), 0)

    if payload.details_update:
        details = dict(getattr(job, "details", {}) or {})
        details.update(payload.details_update)
        job.details = details

    db.commit()
    db.refresh(job)

    # Audit logging is best-effort and must not break the response.
    try:
        audit_service.log_action(
            db=db,
            action="COMPLETE",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={
                "total_success": job.total_success,
                "total_failed": job.total_failed,
            },
            request=request,
        )
    except Exception:
        pass

    return _to_response(job, include_url=True)


@router.post("/{job_id}/retry")
async def retry_job(
    job_id: str,
    payload: RetryRequest,
    request: Request,
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """
    Create a new queued job record that references the original job.

    This endpoint does not execute the job; it enables monitoring UIs to
    track retry intent and external workers to pick it up if/when implemented.
    """
    job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
    if not job:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")

    new_job_id = uuid4().hex
    new_details = dict(getattr(job, "details", {}) or {})
    new_details["retry_of"] = job.job_id
    if payload.note:
        new_details["retry_note"] = payload.note

    cloned = JobRecord(
        job_id=new_job_id,
        job_type=job.job_type,
        status="queued",
        requested_by_username=current_user.username,
        started_at=datetime.now(timezone.utc),
        completed_at=None,
        total_requested=getattr(job, "total_requested", 0) or 0,
        total_success=0,
        total_failed=0,
        result_storage_path=None,
        result_mime_type=None,
        result_size=None,
        details=new_details,
    )
    db.add(cloned)
    db.commit()

    try:
        audit_service.log_action(
            db=db,
            action="RETRY",
            resource_type="JOB",
            user=current_user,
            resource_id=job.job_id,
            details={"new_job_id": new_job_id},
            request=request,
        )
    except Exception:
        pass

    return {"message": "Retry created", "job_id": new_job_id}


@router.get("/metrics/summary", response_model=JobsMetricsResponse)
async def jobs_metrics(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_admin_user),
):
    """
    Basic metrics for dashboards/monitoring.
    """
    # By status
    rows = db.query(JobRecord.status, func.count(JobRecord.id)).group_by(JobRecord.status).all()
    by_status = {str(k or "unknown"): int(v or 0) for k, v in rows}

    # By type
    rows = db.query(JobRecord.job_type, func.count(JobRecord.id)).group_by(JobRecord.job_type).all()
    by_type = {str(k or "unknown"): int(v or 0) for k, v in rows}

    # Running count
    try:
        running_count = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "running").scalar() or 0
    except Exception:
        running_count = 0

    # Last 24h stats
    cutoff = datetime.now(timezone.utc) - timedelta(hours=24)
    try:
        failed_last_24h = db.query(func.count(JobRecord.id)).filter(
            JobRecord.status == "failed",
            JobRecord.completed_at != None,  # noqa: E711
            JobRecord.completed_at >= cutoff,
        ).scalar() or 0
    except Exception:
        # Fallback without the date condition if the backend rejects the comparison
        failed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "failed").scalar() or 0

    try:
        completed_last_24h = db.query(func.count(JobRecord.id)).filter(
            JobRecord.status == "completed",
            JobRecord.completed_at != None,  # noqa: E711
            JobRecord.completed_at >= cutoff,
        ).scalar() or 0
    except Exception:
        completed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "completed").scalar() or 0

    # Average duration over completed jobs (sampled: first 500 rows, no explicit ordering)
    try:
        completed_jobs = db.query(JobRecord.started_at, JobRecord.completed_at).filter(JobRecord.completed_at != None).limit(500).all()  # noqa: E711
        durations: List[float] = []
        for s, c in completed_jobs:
            d = _compute_duration_seconds(s, c)
            if d is not None:
                durations.append(d)
        avg_duration = (sum(durations) / len(durations)) if durations else None
    except Exception:
        avg_duration = None

    return JobsMetricsResponse(
        by_status=by_status,
        by_type=by_type,
        avg_duration_seconds=(round(avg_duration, 2) if isinstance(avg_duration, (int, float)) else None),
        running_count=int(running_count),
        failed_last_24h=int(failed_last_24h),
        completed_last_24h=int(completed_last_24h),
    )