/ dev / benchmarks / gateway / fake_server.py
fake_server.py
 1  # /// script
 2  # requires-python = ">=3.10"
 3  # dependencies = ["fastapi>=0.115.0,<1", "uvicorn[standard]>=0.30.0,<1"]
 4  # ///
 5  """Fake OpenAI-compatible server for benchmarking.
 6  
 7  Returns synthetic responses after a configurable delay so benchmarks measure
 8  MLflow overhead rather than provider latency.
 9  
10  Run standalone:
11      uv run fake_server.py
12      PORT=9200 uv run fake_server.py
13  
14  Or with multiple workers (as launched by run.py):
15      uvicorn fake_server:app --workers 8 --port 9137
16  """
17  
18  import asyncio
19  import os
20  import time
21  from typing import Any
22  
23  import uvicorn
24  from fastapi import FastAPI
25  from pydantic import BaseModel, Field
26  
# ASGI application object; the uvicorn invocations in this file refer to it
# by the import string "fake_server:app".
app = FastAPI()

# Artificial latency, in milliseconds, awaited before each chat completion is
# returned. Read once at import time from FAKE_RESPONSE_DELAY_MS (default 50).
DELAY_MS = int(os.getenv("FAKE_RESPONSE_DELAY_MS", "50"))
30  
31  
class ChatRequest(BaseModel):
    """Request body accepted by the fake ``/v1/chat/completions`` endpoint.

    Only ``model`` is echoed back by the handler; the other fields are
    validated on input and otherwise unused. In particular ``stream`` is
    accepted, but the handler always returns a single non-streaming response.
    """

    # Model name echoed back in the response payload.
    model: str = "gpt-4o-mini"
    # At least one message is required. Values are typed ``Any`` rather than
    # ``str`` because OpenAI-style messages can carry non-string values
    # (a list of content parts, ``content: null`` on tool-call messages, a
    # ``tool_calls`` array); ``dict[str, str]`` rejected those with a 422.
    messages: list[dict[str, Any]] = Field(min_length=1)
    stream: bool = False
    temperature: float = 1.0
    max_tokens: int = 50
38  
39  
@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest) -> dict[str, Any]:
    """Return a canned chat completion after the configured artificial delay.

    The response echoes ``req.model``; the reply text and token usage are
    fixed values, and no other request field influences the result.
    """
    # DELAY_MS is expressed in milliseconds; asyncio.sleep() takes seconds.
    delay_seconds = DELAY_MS / 1000
    await asyncio.sleep(delay_seconds)

    assistant_reply = {"role": "assistant", "content": "Hello!"}
    sole_choice = {
        "index": 0,
        "message": assistant_reply,
        "finish_reason": "stop",
    }
    token_usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}

    # The timestamp is taken after the sleep, i.e. when the response is built.
    return {
        "id": "chatcmpl-fake",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [sole_choice],
        "usage": token_usage,
    }
57  
58  
@app.get("/health")
async def health() -> dict[str, str]:
    """Report readiness with a constant ``{"status": "ok"}`` payload."""
    # run.py's _wait_for_port polls this endpoint to detect when the server
    # is ready.
    ready_payload = {"status": "ok"}
    return ready_payload
63  
64  
if __name__ == "__main__":
    # Standalone entry point: port and bind address come from the environment,
    # defaulting to 9137 on the loopback interface. Uvicorn's own logging is
    # limited to warnings and above.
    server_options = {
        "port": int(os.environ.get("PORT", "9137")),
        "host": os.environ.get("FAKE_SERVER_HOST", "127.0.0.1"),
        "log_level": "warning",
    }
    uvicorn.run("fake_server:app", **server_options)