fake_server.py
# /// script
# requires-python = ">=3.10"
# dependencies = ["fastapi>=0.115.0,<1", "uvicorn[standard]>=0.30.0,<1"]
# ///
"""Fake OpenAI-compatible server for benchmarking.

Returns synthetic responses after a configurable delay so benchmarks measure
MLflow overhead rather than provider latency.

Run standalone:
    uv run fake_server.py
    PORT=9200 uv run fake_server.py

Or with multiple workers (as launched by run.py):
    uvicorn fake_server:app --workers 8 --port 9137

Environment variables:
    FAKE_RESPONSE_DELAY_MS: Delay applied before every chat completion
        response, in milliseconds (default "50"). Read once at import time.
    PORT: Port to bind when run as a script (default "9137").
    FAKE_SERVER_HOST: Host to bind when run as a script (default "127.0.0.1").
"""

import asyncio
import os
import time
from typing import Any

import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel, Field

app = FastAPI()

# Artificial per-request latency in milliseconds; fixed for the process lifetime.
DELAY_MS = int(os.environ.get("FAKE_RESPONSE_DELAY_MS", "50"))


class ChatRequest(BaseModel):
    """Request body accepted by the fake chat completions endpoint."""

    # Echoed back unchanged in the response's "model" field.
    model: str = "gpt-4o-mini"
    # At least one message is required. Values are typed as Any (not str) so
    # that payloads carrying non-string message fields -- content-part lists,
    # a null content, tool_calls arrays -- are accepted instead of failing
    # validation; the handler never inspects message contents.
    messages: list[dict[str, Any]] = Field(min_length=1)
    # The fields below are validated but not read by the handler; in
    # particular, stream=True still yields a single non-streamed JSON body.
    stream: bool = False
    temperature: float = 1.0
    max_tokens: int = 50


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest) -> dict[str, Any]:
    """Return a canned chat completion after sleeping for DELAY_MS.

    Args:
        req: Validated request body. Only ``req.model`` is used; it is echoed
            back in the response.

    Returns:
        A dict shaped like a chat completion object with a fixed assistant
        message ("Hello!") and fixed token usage numbers.
    """
    # Non-blocking sleep so the event loop keeps serving concurrent requests
    # while this one waits out the simulated provider latency.
    await asyncio.sleep(DELAY_MS / 1000)
    return {
        "id": "chatcmpl-fake",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": req.model,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": "Hello!"},
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15},
    }


@app.get("/health")
async def health() -> dict[str, str]:
    """Report readiness with a constant ``{"status": "ok"}`` payload."""
    # Polled by run.py's _wait_for_port to detect when the server is ready.
    return {"status": "ok"}


if __name__ == "__main__":
    port = int(os.environ.get("PORT", "9137"))
    host = os.environ.get("FAKE_SERVER_HOST", "127.0.0.1")
    # The app is passed as an import string, so uvicorn re-imports this module
    # under the name "fake_server".
    uvicorn.run("fake_server:app", host=host, port=port, log_level="warning")