/ src / api / routes / health.py
health.py
  1  """
  2  Health check endpoint for Ag3ntum API.
  3  """
  4  import time
  5  from datetime import datetime, timezone
  6  
  7  import redis.asyncio as aioredis
  8  from fastapi import APIRouter, Depends
  9  from sqlalchemy import text
 10  from sqlalchemy.ext.asyncio import AsyncSession
 11  
 12  from ...config import get_config_loader
 13  from ...db.database import get_db
 14  from ...services.agent_runner import agent_runner
 15  from ..models import ComponentHealth, ConfigResponse, DeepHealthResponse, HealthResponse
 16  
# Router that the endpoints defined in this module register on; its routes are tagged "health".
router = APIRouter(tags=["health"])
 18  
 19  
 20  def _read_version() -> str:
 21      """Read version from VERSION file at startup."""
 22      import os
 23      from pathlib import Path
 24  
 25      root = os.environ.get("AG3NTUM_ROOT")
 26      if root:
 27          version_file = Path(root) / "VERSION"
 28      else:
 29          version_file = Path(__file__).parent.parent.parent.parent / "VERSION"
 30      try:
 31          return version_file.read_text().strip()
 32      except FileNotFoundError:
 33          return "dev"
 34  
 35  
# Version string resolved once, when this module is imported; the health endpoints below report it.
API_VERSION = _read_version()
 37  
 38  
 39  @router.get("/health", response_model=HealthResponse)
 40  async def health_check() -> HealthResponse:
 41      """
 42      Check API health status.
 43  
 44      Returns basic health information including version and timestamp.
 45      """
 46      return HealthResponse(
 47          status="ok",
 48          version=API_VERSION,
 49          timestamp=datetime.now(timezone.utc),
 50      )
 51  
 52  
 53  @router.get("/health/deep", response_model=DeepHealthResponse)
 54  async def deep_health_check(db: AsyncSession = Depends(get_db)) -> DeepHealthResponse:
 55      """
 56      Deep health check with component-level status.
 57  
 58      Checks database and Redis connectivity with latency measurements.
 59      Returns overall status: ok (all healthy), degraded (some issues), unhealthy (critical failures).
 60      """
 61      db_health = await _check_database_health(db)
 62      redis_health = await _check_redis_health()
 63  
 64      # Determine overall status
 65      if db_health.status == "unhealthy" or redis_health.status == "unhealthy":
 66          overall_status = "unhealthy"
 67      elif db_health.status == "degraded" or redis_health.status == "degraded":
 68          overall_status = "degraded"
 69      else:
 70          overall_status = "ok"
 71  
 72      return DeepHealthResponse(
 73          status=overall_status,
 74          version=API_VERSION,
 75          timestamp=datetime.now(timezone.utc),
 76          database=db_health,
 77          redis=redis_health,
 78      )
 79  
 80  
 81  async def _check_database_health(db: AsyncSession) -> ComponentHealth:
 82      """Check database connectivity and measure latency."""
 83      try:
 84          start = time.perf_counter()
 85          await db.execute(text("SELECT 1"))
 86          latency_ms = (time.perf_counter() - start) * 1000
 87  
 88          # Consider slow responses as degraded (>100ms for simple query)
 89          if latency_ms > 100:
 90              return ComponentHealth(status="degraded", latency_ms=latency_ms)
 91  
 92          return ComponentHealth(status="ok", latency_ms=latency_ms)
 93      except Exception as e:
 94          return ComponentHealth(status="unhealthy", error=str(e))
 95  
 96  
 97  async def _check_redis_health() -> ComponentHealth:
 98      """Check Redis connectivity and measure latency."""
 99      try:
100          if agent_runner._event_hub is None:
101              return ComponentHealth(status="unhealthy", error="Redis event hub not initialized")
102  
103          start = time.perf_counter()
104          # Get pool directly from _ensure_pool() return value
105          pool = await agent_runner._event_hub._ensure_pool()
106          if pool is None:
107              return ComponentHealth(status="unhealthy", error="Redis pool not available")
108  
109          client = aioredis.Redis(connection_pool=pool)
110          try:
111              await client.ping()
112          finally:
113              await client.aclose()
114  
115          latency_ms = (time.perf_counter() - start) * 1000
116  
117          # Consider slow responses as degraded (>50ms for ping)
118          if latency_ms > 50:
119              return ComponentHealth(status="degraded", latency_ms=latency_ms)
120  
121          return ComponentHealth(status="ok", latency_ms=latency_ms)
122      except Exception as e:
123          return ComponentHealth(status="unhealthy", error=str(e))
124  
125  
@router.get("/config", response_model=ConfigResponse)
async def get_config() -> ConfigResponse:
    """
    Expose the model-related application configuration to the UI.

    Returns the list of available models, the default model and the
    thinking-token setting read from the loaded configuration.
    Models with ':mode=thinking' suffix enable extended thinking mode.
    """
    settings = get_config_loader().get_config()

    # "default_model" wins; the plain "model" key (or "") is the fallback.
    fallback_model = settings.get("model", "")

    return ConfigResponse(
        models_available=settings.get("models_available", []),
        default_model=settings.get("default_model", fallback_model),
        thinking_tokens=settings.get("thinking_tokens"),
    )
145          thinking_tokens=thinking_tokens,
146      )