app/api/jobs.py (new file, 469 lines added)

@@ -0,0 +1,469 @@
"""
Job Management API

Provides lightweight monitoring and management endpoints around `JobRecord`.

Notes:
- This is not a background worker. It exposes status/history/metrics for jobs
  recorded by various synchronous operations (e.g., documents batch generation).
- Retry creates a new queued record that references the original job. Actual
  processing is not scheduled here.
"""
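# Illustrative usage sketch: assuming the router is mounted under an "/api/jobs"
# prefix (an assumption; the actual prefix depends on how `router` is included in
# the application), the endpoints below look roughly like:
#
#   GET  /api/jobs/?include_total=true&status_filter=failed  -> {"items": [...], "total": N}
#   GET  /api/jobs/{job_id}?include_url=true                 -> a single JobRecordResponse
#   POST /api/jobs/{job_id}/retry                            -> {"message": "Retry created", "job_id": "..."}
#   GET  /api/jobs/metrics/summary                           -> JobsMetricsResponse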
from __future__ import annotations

from typing import Any, Dict, List, Optional, Union
from datetime import datetime, timedelta, timezone
from uuid import uuid4

from fastapi import APIRouter, Depends, HTTPException, Query, status, Request
from pydantic import BaseModel, ConfigDict, Field
from sqlalchemy.orm import Session
from sqlalchemy import func

from app.database.base import get_db
from app.auth.security import get_current_user, get_admin_user
from app.models.user import User
from app.models.jobs import JobRecord
from app.services.query_utils import apply_sorting, paginate_with_total, tokenized_ilike_filter
from app.services.storage import get_default_storage
from app.services.audit import audit_service
from app.utils.logging import app_logger


router = APIRouter()


# --------------------
# Pydantic Schemas
# --------------------

class JobRecordResponse(BaseModel):
    id: int
    job_id: str
    job_type: str
    status: str
    requested_by_username: Optional[str] = None
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    total_requested: int = 0
    total_success: int = 0
    total_failed: int = 0
    has_result_bundle: bool = False
    bundle_url: Optional[str] = None
    bundle_size: Optional[int] = None
    duration_seconds: Optional[float] = None
    details: Optional[Dict[str, Any]] = None

    model_config = ConfigDict(from_attributes=True)


class PaginatedJobsResponse(BaseModel):
    items: List[JobRecordResponse]
    total: int


class JobFailRequest(BaseModel):
    reason: str = Field(..., min_length=1, max_length=1000)
    details_update: Optional[Dict[str, Any]] = None


class JobCompletionUpdate(BaseModel):
    total_success: Optional[int] = None
    total_failed: Optional[int] = None
    result_storage_path: Optional[str] = None
    result_mime_type: Optional[str] = None
    result_size: Optional[int] = None
    details_update: Optional[Dict[str, Any]] = None


class RetryRequest(BaseModel):
    note: Optional[str] = None


class JobsMetricsResponse(BaseModel):
    by_status: Dict[str, int]
    by_type: Dict[str, int]
    avg_duration_seconds: Optional[float] = None
    running_count: int
    failed_last_24h: int
    completed_last_24h: int


# --------------------
# Helpers
# --------------------

def _compute_duration_seconds(started_at: Optional[datetime], completed_at: Optional[datetime]) -> Optional[float]:
    if not started_at or not completed_at:
        return None
    try:
        # Normalize naive datetimes to UTC so mixed-awareness values can be subtracted safely.
        start_utc = started_at if started_at.tzinfo else started_at.replace(tzinfo=timezone.utc)
        end_utc = completed_at if completed_at.tzinfo else completed_at.replace(tzinfo=timezone.utc)
        return max((end_utc - start_utc).total_seconds(), 0.0)
    except Exception:
        return None


def _to_response(
    job: JobRecord,
    *,
    include_url: bool = False,
) -> JobRecordResponse:
    has_bundle = bool(getattr(job, "result_storage_path", None))
    bundle_url = None
    if include_url and has_bundle:
        try:
            bundle_url = get_default_storage().public_url(job.result_storage_path)  # type: ignore[arg-type]
        except Exception:
            bundle_url = None
    return JobRecordResponse(
        id=job.id,
        job_id=job.job_id,
        job_type=job.job_type,
        status=job.status,
        requested_by_username=getattr(job, "requested_by_username", None),
        started_at=getattr(job, "started_at", None),
        completed_at=getattr(job, "completed_at", None),
        total_requested=getattr(job, "total_requested", 0) or 0,
        total_success=getattr(job, "total_success", 0) or 0,
        total_failed=getattr(job, "total_failed", 0) or 0,
        has_result_bundle=has_bundle,
        bundle_url=bundle_url,
        bundle_size=getattr(job, "result_size", None),
        duration_seconds=_compute_duration_seconds(getattr(job, "started_at", None), getattr(job, "completed_at", None)),
        details=getattr(job, "details", None),
    )


# --------------------
# Endpoints
# --------------------


@router.get("/", response_model=Union[List[JobRecordResponse], PaginatedJobsResponse])
async def list_jobs(
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
    include_total: bool = Query(False, description="When true, returns {items, total} instead of a plain list"),
    include_urls: bool = Query(False, description="Include bundle URLs in responses"),
    status_filter: Optional[str] = Query(None, description="Filter by status"),
    type_filter: Optional[str] = Query(None, description="Filter by job type"),
    requested_by: Optional[str] = Query(None, description="Filter by username"),
    search: Optional[str] = Query(None, description="Tokenized search across job_id, type, status, username"),
    mine: bool = Query(True, description="When true, restricts to current user's jobs (admins can set false)"),
    sort_by: Optional[str] = Query("started", description="Sort by: started, completed, status, type"),
    sort_dir: Optional[str] = Query("desc", description="Sort direction: asc or desc"),
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user),
):
    query = db.query(JobRecord)

    # Scope: non-admin users are always restricted to their own jobs
    is_admin = bool(getattr(current_user, "is_admin", False))
    if mine or not is_admin:
        query = query.filter(JobRecord.requested_by_username == current_user.username)

    if status_filter:
        query = query.filter(JobRecord.status == status_filter)
    if type_filter:
        query = query.filter(JobRecord.job_type == type_filter)
    if requested_by and is_admin:
        query = query.filter(JobRecord.requested_by_username == requested_by)

    if search:
        tokens = [t for t in (search or "").split() if t]
        filter_expr = tokenized_ilike_filter(tokens, [
            JobRecord.job_id,
            JobRecord.job_type,
            JobRecord.status,
            JobRecord.requested_by_username,
        ])
        if filter_expr is not None:
            query = query.filter(filter_expr)

    # Sorting
    query = apply_sorting(
        query,
        sort_by,
        sort_dir,
        allowed={
            "started": [JobRecord.started_at, JobRecord.id],
            "completed": [JobRecord.completed_at, JobRecord.id],
            "status": [JobRecord.status, JobRecord.started_at],
            "type": [JobRecord.job_type, JobRecord.started_at],
        },
    )

    jobs, total = paginate_with_total(query, skip, limit, include_total)
    items = [_to_response(j, include_url=include_urls) for j in jobs]
    if include_total:
        return {"items": items, "total": total or 0}
    return items
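# Response shape sketch for list_jobs (values below are illustrative, not real data):
# with include_total=false the endpoint returns a plain JSON array of JobRecordResponse
# objects; with include_total=true it returns the PaginatedJobsResponse envelope, e.g.
#
#   {"items": [{"job_id": "ab12cd34", "job_type": "documents_batch_generate",
#               "status": "completed", "total_requested": 10, "total_success": 10,
#               "total_failed": 0, ...}],
#    "total": 42}
#
# "documents_batch_generate" is a hypothetical job_type used only for this example.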
@router.get("/{job_id}", response_model=JobRecordResponse)
|
||||
async def get_job(
|
||||
job_id: str,
|
||||
include_url: bool = Query(True),
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_current_user),
|
||||
):
|
||||
job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
|
||||
if not job:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
|
||||
|
||||
# Authorization: non-admin users can only access their jobs
|
||||
if not getattr(current_user, "is_admin", False):
|
||||
if getattr(job, "requested_by_username", None) != current_user.username:
|
||||
raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Not enough permissions")
|
||||
|
||||
return _to_response(job, include_url=include_url)
|
||||
|
||||
|
||||
@router.post("/{job_id}/mark-failed", response_model=JobRecordResponse)
|
||||
async def mark_job_failed(
|
||||
job_id: str,
|
||||
payload: JobFailRequest,
|
||||
request: Request,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_admin_user),
|
||||
):
|
||||
job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
|
||||
if not job:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
|
||||
|
||||
job.status = "failed"
|
||||
job.completed_at = datetime.now(timezone.utc)
|
||||
details = dict(getattr(job, "details", {}) or {})
|
||||
details["last_error"] = payload.reason
|
||||
if payload.details_update:
|
||||
details.update(payload.details_update)
|
||||
job.details = details
|
||||
db.commit()
|
||||
db.refresh(job)
|
||||
|
||||
try:
|
||||
audit_service.log_action(
|
||||
db=db,
|
||||
action="FAIL",
|
||||
resource_type="JOB",
|
||||
user=current_user,
|
||||
resource_id=job.job_id,
|
||||
details={"reason": payload.reason},
|
||||
request=request,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return _to_response(job, include_url=True)
|
||||
|
||||
|
||||
@router.post("/{job_id}/mark-running", response_model=JobRecordResponse)
|
||||
async def mark_job_running(
|
||||
job_id: str,
|
||||
request: Request,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_admin_user),
|
||||
):
|
||||
job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
|
||||
if not job:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
|
||||
|
||||
job.status = "running"
|
||||
# Reset start time when transitioning to running
|
||||
job.started_at = datetime.now(timezone.utc)
|
||||
job.completed_at = None
|
||||
db.commit()
|
||||
db.refresh(job)
|
||||
|
||||
try:
|
||||
audit_service.log_action(
|
||||
db=db,
|
||||
action="RUNNING",
|
||||
resource_type="JOB",
|
||||
user=current_user,
|
||||
resource_id=job.job_id,
|
||||
details=None,
|
||||
request=request,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return _to_response(job)
|
||||
|
||||
|
||||
@router.post("/{job_id}/mark-completed", response_model=JobRecordResponse)
|
||||
async def mark_job_completed(
|
||||
job_id: str,
|
||||
payload: JobCompletionUpdate,
|
||||
request: Request,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_admin_user),
|
||||
):
|
||||
job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
|
||||
if not job:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
|
||||
|
||||
job.status = "completed"
|
||||
job.completed_at = datetime.now(timezone.utc)
|
||||
if payload.total_success is not None:
|
||||
job.total_success = max(int(payload.total_success), 0)
|
||||
if payload.total_failed is not None:
|
||||
job.total_failed = max(int(payload.total_failed), 0)
|
||||
if payload.result_storage_path is not None:
|
||||
job.result_storage_path = payload.result_storage_path
|
||||
if payload.result_mime_type is not None:
|
||||
job.result_mime_type = payload.result_mime_type
|
||||
if payload.result_size is not None:
|
||||
job.result_size = max(int(payload.result_size), 0)
|
||||
|
||||
if payload.details_update:
|
||||
details = dict(getattr(job, "details", {}) or {})
|
||||
details.update(payload.details_update)
|
||||
job.details = details
|
||||
|
||||
db.commit()
|
||||
db.refresh(job)
|
||||
|
||||
try:
|
||||
audit_service.log_action(
|
||||
db=db,
|
||||
action="COMPLETE",
|
||||
resource_type="JOB",
|
||||
user=current_user,
|
||||
resource_id=job.job_id,
|
||||
details={
|
||||
"total_success": job.total_success,
|
||||
"total_failed": job.total_failed,
|
||||
},
|
||||
request=request,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return _to_response(job, include_url=True)
|
||||
|
||||
|
||||
@router.post("/{job_id}/retry")
|
||||
async def retry_job(
|
||||
job_id: str,
|
||||
payload: RetryRequest,
|
||||
request: Request,
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_admin_user),
|
||||
):
|
||||
"""
|
||||
Create a new queued job record that references the original job.
|
||||
|
||||
This endpoint does not execute the job; it enables monitoring UIs to
|
||||
track retry intent and external workers to pick it up if/when implemented.
|
||||
"""
|
||||
job = db.query(JobRecord).filter(JobRecord.job_id == job_id).first()
|
||||
if not job:
|
||||
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Job not found")
|
||||
|
||||
new_job_id = uuid4().hex
|
||||
new_details = dict(getattr(job, "details", {}) or {})
|
||||
new_details["retry_of"] = job.job_id
|
||||
if payload.note:
|
||||
new_details["retry_note"] = payload.note
|
||||
|
||||
cloned = JobRecord(
|
||||
job_id=new_job_id,
|
||||
job_type=job.job_type,
|
||||
status="queued",
|
||||
requested_by_username=current_user.username,
|
||||
started_at=datetime.now(timezone.utc),
|
||||
completed_at=None,
|
||||
total_requested=getattr(job, "total_requested", 0) or 0,
|
||||
total_success=0,
|
||||
total_failed=0,
|
||||
result_storage_path=None,
|
||||
result_mime_type=None,
|
||||
result_size=None,
|
||||
details=new_details,
|
||||
)
|
||||
db.add(cloned)
|
||||
db.commit()
|
||||
|
||||
try:
|
||||
audit_service.log_action(
|
||||
db=db,
|
||||
action="RETRY",
|
||||
resource_type="JOB",
|
||||
user=current_user,
|
||||
resource_id=job.job_id,
|
||||
details={"new_job_id": new_job_id},
|
||||
request=request,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {"message": "Retry created", "job_id": new_job_id}
|
||||
|
||||
|
||||
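# Retry flow sketch (assumptions noted here): the cloned record carries
# details["retry_of"] = <original job_id> and status "queued", so a hypothetical
# external worker could discover pending retries with a query along these lines:
#
#   pending = (
#       db.query(JobRecord)
#       .filter(JobRecord.status == "queued")
#       .order_by(JobRecord.started_at)
#       .all()
#   )
#
# No such worker exists in this module; the endpoint above only records the retry intent.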
@router.get("/metrics/summary", response_model=JobsMetricsResponse)
|
||||
async def jobs_metrics(
|
||||
db: Session = Depends(get_db),
|
||||
current_user: User = Depends(get_admin_user),
|
||||
):
|
||||
"""
|
||||
Basic metrics for dashboards/monitoring.
|
||||
"""
|
||||
# By status
|
||||
rows = db.query(JobRecord.status, func.count(JobRecord.id)).group_by(JobRecord.status).all()
|
||||
by_status = {str(k or "unknown"): int(v or 0) for k, v in rows}
|
||||
|
||||
# By type
|
||||
rows = db.query(JobRecord.job_type, func.count(JobRecord.id)).group_by(JobRecord.job_type).all()
|
||||
by_type = {str(k or "unknown"): int(v or 0) for k, v in rows}
|
||||
|
||||
# Running count
|
||||
try:
|
||||
running_count = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "running").scalar() or 0
|
||||
except Exception:
|
||||
running_count = 0
|
||||
|
||||
# Last 24h stats
|
||||
cutoff = datetime.now(timezone.utc).replace(microsecond=0)
|
||||
try:
|
||||
failed_last_24h = db.query(func.count(JobRecord.id)).filter(
|
||||
JobRecord.status == "failed",
|
||||
(JobRecord.completed_at != None), # noqa: E711
|
||||
JobRecord.completed_at >= (cutoff.replace(hour=0, minute=0, second=0) - func.cast(1, func.INTEGER)) # type: ignore
|
||||
).scalar() or 0
|
||||
except Exception:
|
||||
# Fallback without date condition if backend doesn't support the above cast
|
||||
failed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "failed").scalar() or 0
|
||||
|
||||
try:
|
||||
completed_last_24h = db.query(func.count(JobRecord.id)).filter(
|
||||
JobRecord.status == "completed",
|
||||
(JobRecord.completed_at != None), # noqa: E711
|
||||
JobRecord.completed_at >= (cutoff.replace(hour=0, minute=0, second=0) - func.cast(1, func.INTEGER)) # type: ignore
|
||||
).scalar() or 0
|
||||
except Exception:
|
||||
completed_last_24h = db.query(func.count(JobRecord.id)).filter(JobRecord.status == "completed").scalar() or 0
|
||||
|
||||
# Average duration on completed
|
||||
try:
|
||||
completed_jobs = db.query(JobRecord.started_at, JobRecord.completed_at).filter(JobRecord.completed_at != None).limit(500).all() # noqa: E711
|
||||
durations: List[float] = []
|
||||
for s, c in completed_jobs:
|
||||
d = _compute_duration_seconds(s, c)
|
||||
if d is not None:
|
||||
durations.append(d)
|
||||
avg_duration = (sum(durations) / len(durations)) if durations else None
|
||||
except Exception:
|
||||
avg_duration = None
|
||||
|
||||
return JobsMetricsResponse(
|
||||
by_status=by_status,
|
||||
by_type=by_type,
|
||||
avg_duration_seconds=(round(avg_duration, 2) if isinstance(avg_duration, (int, float)) else None),
|
||||
running_count=int(running_count),
|
||||
failed_last_24h=int(failed_last_24h),
|
||||
completed_last_24h=int(completed_last_24h),
|
||||
)
|
||||
|
||||
|
||||
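# Example metrics payload (all numbers are invented for illustration only):
#
#   {
#     "by_status": {"completed": 120, "failed": 4, "running": 1},
#     "by_type": {"documents_batch_generate": 125},
#     "avg_duration_seconds": 12.34,
#     "running_count": 1,
#     "failed_last_24h": 2,
#     "completed_last_24h": 37
#   }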