# health.py
"""
Health check endpoint for Ag3ntum API.

Routes registered on ``router``:

* ``GET /health`` - basic status with version and timestamp.
* ``GET /health/deep`` - per-component (database, Redis) status and latency.
* ``GET /config`` - available models and default model for the UI.
"""
import os
import time
from datetime import datetime, timezone
from pathlib import Path

import redis.asyncio as aioredis
from fastapi import APIRouter, Depends
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession

from ...config import get_config_loader
from ...db.database import get_db
from ...services.agent_runner import agent_runner
from ..models import ComponentHealth, ConfigResponse, DeepHealthResponse, HealthResponse

router = APIRouter(tags=["health"])

# Version reported when the VERSION file is missing, unreadable, or empty.
_FALLBACK_VERSION = "dev"

# Latency (milliseconds) above which a component that still responds is
# reported as "degraded" instead of "ok".
_DB_DEGRADED_LATENCY_MS = 100  # for the simple "SELECT 1" query
_REDIS_DEGRADED_LATENCY_MS = 50  # for the Redis ping


def _read_version() -> str:
    """Read the application version from the VERSION file.

    The file is looked up under ``$AG3NTUM_ROOT`` when that environment
    variable is set and non-empty; otherwise in the fourth parent directory
    of this file.

    Returns:
        The stripped file contents, or ``"dev"`` when the file cannot be
        read (missing, not a regular file, permission denied, not valid
        UTF-8) or contains only whitespace.
    """
    root = os.environ.get("AG3NTUM_ROOT")
    if root:
        version_file = Path(root) / "VERSION"
    else:
        version_file = Path(__file__).parent.parent.parent.parent / "VERSION"

    try:
        version = version_file.read_text(encoding="utf-8").strip()
    except (OSError, UnicodeDecodeError):
        # This runs at import time (see API_VERSION below); an unreadable
        # VERSION file must not prevent the module from loading.
        return _FALLBACK_VERSION
    return version or _FALLBACK_VERSION


def _describe_error(exc: Exception) -> str:
    """Return a non-empty description of *exc* for a health report.

    Some exceptions stringify to ``""``; fall back to the exception class
    name so an unhealthy component never carries an empty error.
    """
    return str(exc) or type(exc).__name__


def _overall_status(*statuses: str) -> str:
    """Combine component statuses into the worst one.

    Returns ``"unhealthy"`` if any status is unhealthy, else ``"degraded"``
    if any status is degraded, else ``"ok"``.
    """
    if "unhealthy" in statuses:
        return "unhealthy"
    if "degraded" in statuses:
        return "degraded"
    return "ok"


# Resolved once, when the module is imported.
API_VERSION = _read_version()


@router.get("/health", response_model=HealthResponse)
async def health_check() -> HealthResponse:
    """
    Check API health status.

    Returns basic health information including version and timestamp.
    No dependencies (database, Redis) are contacted.
    """
    return HealthResponse(
        status="ok",
        version=API_VERSION,
        timestamp=datetime.now(timezone.utc),
    )


@router.get("/health/deep", response_model=DeepHealthResponse)
async def deep_health_check(db: AsyncSession = Depends(get_db)) -> DeepHealthResponse:
    """
    Deep health check with component-level status.

    Checks database and Redis connectivity with latency measurements.
    Returns overall status: ok (all healthy), degraded (some issues),
    unhealthy (critical failures).

    Args:
        db: Database session injected via the ``get_db`` dependency.
    """
    db_health = await _check_database_health(db)
    redis_health = await _check_redis_health()

    return DeepHealthResponse(
        status=_overall_status(db_health.status, redis_health.status),
        version=API_VERSION,
        timestamp=datetime.now(timezone.utc),
        database=db_health,
        redis=redis_health,
    )


async def _check_database_health(db: AsyncSession) -> ComponentHealth:
    """Check database connectivity and measure latency.

    Executes ``SELECT 1`` on *db* and times it.

    Returns:
        ``ok`` or ``degraded`` (latency above ``_DB_DEGRADED_LATENCY_MS``)
        with ``latency_ms`` set, or ``unhealthy`` with ``error`` set when
        the query raises.
    """
    try:
        start = time.perf_counter()
        await db.execute(text("SELECT 1"))
        latency_ms = (time.perf_counter() - start) * 1000
    except Exception as e:
        # A health probe reports the failure instead of propagating it.
        return ComponentHealth(status="unhealthy", error=_describe_error(e))

    # Consider slow responses as degraded.
    status = "degraded" if latency_ms > _DB_DEGRADED_LATENCY_MS else "ok"
    return ComponentHealth(status=status, latency_ms=latency_ms)


async def _check_redis_health() -> ComponentHealth:
    """Check Redis connectivity and measure latency.

    Obtains the connection pool from the agent runner's event hub and sends
    a ping through a temporary client bound to that pool.

    Returns:
        ``ok`` or ``degraded`` (latency above
        ``_REDIS_DEGRADED_LATENCY_MS``) with ``latency_ms`` set, or
        ``unhealthy`` with ``error`` set when the event hub or pool is
        missing or any step raises.
    """
    try:
        event_hub = agent_runner._event_hub
        if event_hub is None:
            return ComponentHealth(status="unhealthy", error="Redis event hub not initialized")

        # The measured interval covers obtaining the pool, the ping, and
        # closing the temporary client.
        start = time.perf_counter()
        # Get pool directly from _ensure_pool() return value
        pool = await event_hub._ensure_pool()
        if pool is None:
            return ComponentHealth(status="unhealthy", error="Redis pool not available")

        client = aioredis.Redis(connection_pool=pool)
        try:
            await client.ping()
        finally:
            # NOTE(review): relies on aclose() not disconnecting an
            # externally supplied pool - confirm for the pinned redis-py.
            await client.aclose()

        latency_ms = (time.perf_counter() - start) * 1000
    except Exception as e:
        # A health probe reports the failure instead of propagating it.
        return ComponentHealth(status="unhealthy", error=_describe_error(e))

    # Consider slow responses as degraded.
    status = "degraded" if latency_ms > _REDIS_DEGRADED_LATENCY_MS else "ok"
    return ComponentHealth(status=status, latency_ms=latency_ms)


@router.get("/config", response_model=ConfigResponse)
async def get_config() -> ConfigResponse:
    """
    Get application configuration.

    Returns available models and default model for the UI.
    Models with ':mode=thinking' suffix enable extended thinking mode.
    """
    loader = get_config_loader()
    config = loader.get_config()

    # Get models_available, default_model, and thinking_tokens from agent.yaml config.
    # default_model falls back to the "model" key, then to an empty string.
    models_available = config.get("models_available", [])
    default_model = config.get("default_model", config.get("model", ""))
    thinking_tokens = config.get("thinking_tokens")

    return ConfigResponse(
        models_available=models_available,
        default_model=default_model,
        thinking_tokens=thinking_tokens,
    )