# environments/web_research_env.py
1 """ 2 WebResearchEnv — RL Environment for Multi-Step Web Research 3 ============================================================ 4 5 Trains models to do accurate, efficient, multi-source web research. 6 7 Reward signals: 8 - Answer correctness (LLM judge, 0.0–1.0) 9 - Source diversity (used ≥2 distinct domains) 10 - Efficiency (penalizes excessive tool calls) 11 - Tool usage (bonus for actually using web tools) 12 13 Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions 14 HuggingFace: google/frames-benchmark 15 Fallback: built-in sample questions (no HF token needed) 16 17 Usage: 18 # Phase 1 (OpenAI-compatible server) 19 python environments/web_research_env.py serve \\ 20 --openai.base_url http://localhost:8000/v1 \\ 21 --openai.model_name YourModel \\ 22 --openai.server_type openai 23 24 # Process mode (offline data generation) 25 python environments/web_research_env.py process \\ 26 --env.data_path_to_save_groups data/web_research.jsonl 27 28 # Standalone eval 29 python environments/web_research_env.py evaluate \\ 30 --openai.base_url http://localhost:8000/v1 \\ 31 --openai.model_name YourModel 32 33 Built by: github.com/jackx707 34 Inspired by: GroceryMind — production Hermes agent doing live web research 35 across German grocery stores (firecrawl + hermes-agent) 36 """ 37 38 from __future__ import annotations 39 40 import asyncio 41 import json 42 import logging 43 import os 44 import random 45 import re 46 import sys 47 from pathlib import Path 48 from typing import Any, Dict, List, Optional, Tuple 49 from urllib.parse import urlparse 50 51 from pydantic import Field 52 53 # Ensure hermes-agent root is on path 54 _repo_root = Path(__file__).resolve().parent.parent 55 if str(_repo_root) not in sys.path: 56 sys.path.insert(0, str(_repo_root)) 57 58 # --------------------------------------------------------------------------- 59 # Optional HuggingFace datasets import 60 # 
--------------------------------------------------------------------------- 61 try: 62 from datasets import load_dataset 63 HF_AVAILABLE = True 64 except ImportError: 65 HF_AVAILABLE = False 66 67 from atroposlib.envs.base import ScoredDataGroup 68 from atroposlib.envs.server_handling.server_manager import APIServerConfig 69 from atroposlib.type_definitions import Item 70 71 from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig 72 from environments.agent_loop import AgentResult 73 from environments.tool_context import ToolContext 74 75 logger = logging.getLogger(__name__) 76 77 # --------------------------------------------------------------------------- 78 # Fallback sample dataset (used when HuggingFace is unavailable) 79 # Multi-hop questions requiring real web search to answer. 80 # --------------------------------------------------------------------------- 81 SAMPLE_QUESTIONS = [ 82 { 83 "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?", 84 "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.", 85 "difficulty": "medium", 86 "hops": 2, 87 }, 88 { 89 "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?", 90 "answer": "The Linux Foundation oversees Kubernetes. 
CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.", 91 "difficulty": "medium", 92 "hops": 2, 93 }, 94 { 95 "question": "What programming language was used to write the original version of the web framework used by Instagram?", 96 "answer": "Django, which Instagram was built on, is written in Python.", 97 "difficulty": "easy", 98 "hops": 2, 99 }, 100 { 101 "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?", 102 "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).", 103 "difficulty": "hard", 104 "hops": 3, 105 }, 106 { 107 "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?", 108 "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.", 109 "difficulty": "medium", 110 "hops": 2, 111 }, 112 { 113 "question": "How many employees does the parent company of Instagram have?", 114 "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.", 115 "difficulty": "medium", 116 "hops": 2, 117 }, 118 { 119 "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?", 120 "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.", 121 "difficulty": "hard", 122 "hops": 2, 123 }, 124 { 125 "question": "Which company acquired the startup founded by the creator of Oculus VR?", 126 "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). 
He later founded Anduril Industries.", 127 "difficulty": "medium", 128 "hops": 2, 129 }, 130 { 131 "question": "What is the market cap of the company that owns the most popular search engine in Russia?", 132 "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.", 133 "difficulty": "hard", 134 "hops": 2, 135 }, 136 { 137 "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?", 138 "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.", 139 "difficulty": "hard", 140 "hops": 2, 141 }, 142 ] 143 144 145 # --------------------------------------------------------------------------- 146 # Configuration 147 # --------------------------------------------------------------------------- 148 149 class WebResearchEnvConfig(HermesAgentEnvConfig): 150 """Configuration for the web research RL environment.""" 151 152 # Reward weights 153 correctness_weight: float = Field( 154 default=0.6, 155 description="Weight for answer correctness in reward (LLM judge score).", 156 ) 157 tool_usage_weight: float = Field( 158 default=0.2, 159 description="Weight for tool usage signal (did the model actually use web tools?).", 160 ) 161 efficiency_weight: float = Field( 162 default=0.2, 163 description="Weight for efficiency signal (penalizes excessive tool calls).", 164 ) 165 diversity_bonus: float = Field( 166 default=0.1, 167 description="Bonus reward for citing ≥2 distinct domains.", 168 ) 169 170 # Efficiency thresholds 171 efficient_max_calls: int = Field( 172 default=5, 173 description="Maximum tool calls before efficiency penalty begins.", 174 ) 175 heavy_penalty_calls: int = Field( 176 default=10, 177 description="Tool call count where efficiency penalty steepens.", 178 ) 179 180 # Eval 181 eval_size: int = Field( 182 default=20, 183 description="Number of held-out items for 
evaluation.", 184 ) 185 eval_split_ratio: float = Field( 186 default=0.1, 187 description="Fraction of dataset to hold out for evaluation (0.0–1.0).", 188 ) 189 190 # Dataset 191 dataset_name: str = Field( 192 default="google/frames-benchmark", 193 description="HuggingFace dataset name for research questions.", 194 ) 195 196 197 # --------------------------------------------------------------------------- 198 # Environment 199 # --------------------------------------------------------------------------- 200 201 class WebResearchEnv(HermesAgentBaseEnv): 202 """ 203 RL environment for training multi-step web research skills. 204 205 The model is given a factual question requiring 2-3 hops of web research 206 and must use web_search / web_extract tools to find and synthesize the answer. 207 208 Reward is multi-signal: 209 60% — answer correctness (LLM judge) 210 20% — tool usage (did the model actually search the web?) 211 20% — efficiency (penalizes >5 tool calls) 212 213 Bonus +0.1 for source diversity (≥2 distinct domains cited). 214 """ 215 216 name = "web-research" 217 env_config_cls = WebResearchEnvConfig 218 219 # Default toolsets for this environment — web + file for saving notes 220 default_toolsets = ["web", "file"] 221 222 @classmethod 223 def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]: 224 """Default configuration for the web research environment.""" 225 env_config = WebResearchEnvConfig( 226 enabled_toolsets=["web", "file"], 227 max_agent_turns=15, 228 agent_temperature=1.0, 229 system_prompt=( 230 "You are a highly capable research agent. When asked a factual question, " 231 "always use web_search to find current, accurate information before answering. " 232 "Cite at least 2 sources. Be concise and accurate." 
233 ), 234 group_size=4, 235 total_steps=1000, 236 steps_per_eval=100, 237 use_wandb=True, 238 wandb_name="web-research", 239 ) 240 241 server_configs = [ 242 APIServerConfig( 243 base_url="https://openrouter.ai/api/v1", 244 model_name="anthropic/claude-sonnet-4.5", 245 server_type="openai", 246 api_key=os.getenv("OPENROUTER_API_KEY", ""), 247 health_check=False, 248 ) 249 ] 250 251 return env_config, server_configs 252 253 def __init__(self, *args, **kwargs): 254 super().__init__(*args, **kwargs) 255 self._items: list[dict] = [] 256 self._eval_items: list[dict] = [] 257 self._index: int = 0 258 259 # Metrics tracking for wandb 260 self._reward_buffer: list[float] = [] 261 self._correctness_buffer: list[float] = [] 262 self._tool_usage_buffer: list[float] = [] 263 self._efficiency_buffer: list[float] = [] 264 self._diversity_buffer: list[float] = [] 265 266 # ------------------------------------------------------------------ 267 # 1. Setup — load dataset 268 # ------------------------------------------------------------------ 269 270 async def setup(self) -> None: 271 """Load the FRAMES benchmark or fall back to built-in samples.""" 272 if HF_AVAILABLE: 273 try: 274 logger.info("Loading FRAMES benchmark from HuggingFace...") 275 ds = load_dataset(self.config.dataset_name, split="test") 276 self._items = [ 277 { 278 "question": row["Prompt"], 279 "answer": row["Answer"], 280 "difficulty": row.get("reasoning_types", "unknown"), 281 "hops": 2, 282 } 283 for row in ds 284 ] 285 # Hold out for eval 286 eval_size = max( 287 self.config.eval_size, 288 int(len(self._items) * self.config.eval_split_ratio), 289 ) 290 random.shuffle(self._items) 291 self._eval_items = self._items[:eval_size] 292 self._items = self._items[eval_size:] 293 logger.info( 294 f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items " 295 f"from FRAMES benchmark." 296 ) 297 return 298 except Exception as e: 299 logger.warning(f"Could not load FRAMES from HuggingFace: {e}. 
Using built-in samples.") 300 301 # Fallback 302 random.shuffle(SAMPLE_QUESTIONS) 303 split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10) 304 self._items = SAMPLE_QUESTIONS[:split] 305 self._eval_items = SAMPLE_QUESTIONS[split:] 306 logger.info( 307 f"Using built-in sample dataset: {len(self._items)} train / " 308 f"{len(self._eval_items)} eval items." 309 ) 310 311 # ------------------------------------------------------------------ 312 # 2. get_next_item — return the next question 313 # ------------------------------------------------------------------ 314 315 async def get_next_item(self) -> dict: 316 """Return the next item, cycling through the dataset.""" 317 if not self._items: 318 raise RuntimeError("Dataset is empty. Did you call setup()?") 319 item = self._items[self._index % len(self._items)] 320 self._index += 1 321 return item 322 323 # ------------------------------------------------------------------ 324 # 3. format_prompt — build the user-facing prompt 325 # ------------------------------------------------------------------ 326 327 def format_prompt(self, item: dict) -> str: 328 """Format the research question as a task prompt.""" 329 return ( 330 f"Research the following question thoroughly using web search. " 331 f"You MUST search the web to find current, accurate information — " 332 f"do not rely solely on your training data.\n\n" 333 f"Question: {item['question']}\n\n" 334 f"Requirements:\n" 335 f"- Use web_search and/or web_extract tools to find information\n" 336 f"- Search at least 2 different sources\n" 337 f"- Provide a concise, accurate answer (2-4 sentences)\n" 338 f"- Cite the sources you used" 339 ) 340 341 # ------------------------------------------------------------------ 342 # 4. 
compute_reward — multi-signal scoring 343 # ------------------------------------------------------------------ 344 345 async def compute_reward( 346 self, 347 item: dict, 348 result: AgentResult, 349 ctx: ToolContext, 350 ) -> float: 351 """ 352 Multi-signal reward function: 353 354 correctness_weight * correctness — LLM judge comparing answer to ground truth 355 tool_usage_weight * tool_used — binary: did the model use web tools? 356 efficiency_weight * efficiency — penalizes wasteful tool usage 357 + diversity_bonus — source diversity (≥2 distinct domains) 358 """ 359 # Extract final response from messages (last assistant message with content) 360 final_response = "" 361 tools_used: list[str] = [] 362 for msg in reversed(result.messages): 363 if msg.get("role") == "assistant" and msg.get("content") and not final_response: 364 final_response = msg["content"] 365 # Collect tool names from tool call messages 366 if msg.get("role") == "assistant" and msg.get("tool_calls"): 367 for tc in msg["tool_calls"]: 368 fn = tc.get("function", {}) if isinstance(tc, dict) else {} 369 name = fn.get("name", "") 370 if name: 371 tools_used.append(name) 372 tool_call_count: int = result.turns_used or len(tools_used) 373 374 cfg = self.config 375 376 # ---- Signal 1: Answer correctness (LLM judge) ---------------- 377 correctness = await self._llm_judge( 378 question=item["question"], 379 expected=item["answer"], 380 model_answer=final_response, 381 ) 382 383 # ---- Signal 2: Web tool usage -------------------------------- 384 web_tools = {"web_search", "web_extract", "search", "firecrawl"} 385 tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0 386 387 # ---- Signal 3: Efficiency ------------------------------------ 388 if tool_call_count <= cfg.efficient_max_calls: 389 efficiency = 1.0 390 elif tool_call_count <= cfg.heavy_penalty_calls: 391 efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08 392 else: 393 efficiency = max(0.0, 1.0 - 
(tool_call_count - cfg.efficient_max_calls) * 0.12) 394 395 # ---- Bonus: Source diversity --------------------------------- 396 domains = self._extract_domains(final_response) 397 diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0 398 399 # ---- Combine ------------------------------------------------ 400 reward = ( 401 cfg.correctness_weight * correctness 402 + cfg.tool_usage_weight * tool_used 403 + cfg.efficiency_weight * efficiency 404 + diversity 405 ) 406 reward = min(1.0, max(0.0, reward)) # clamp to [0, 1] 407 408 # Track for wandb 409 self._reward_buffer.append(reward) 410 self._correctness_buffer.append(correctness) 411 self._tool_usage_buffer.append(tool_used) 412 self._efficiency_buffer.append(efficiency) 413 self._diversity_buffer.append(diversity) 414 415 logger.debug( 416 f"Reward breakdown — correctness={correctness:.2f}, " 417 f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, " 418 f"diversity={diversity:.1f} → total={reward:.3f}" 419 ) 420 421 return reward 422 423 # ------------------------------------------------------------------ 424 # 5. evaluate — run on held-out eval split 425 # ------------------------------------------------------------------ 426 427 async def evaluate(self, *args, **kwargs) -> None: 428 """Run evaluation on the held-out split using the full agent loop with tools. 429 430 Each eval item runs through the same agent loop as training — 431 the model can use web_search, web_extract, etc. to research answers. 432 This measures actual agentic research capability, not just knowledge. 
433 """ 434 import time 435 import uuid 436 from environments.agent_loop import HermesAgentLoop 437 from environments.tool_context import ToolContext 438 439 items = self._eval_items 440 if not items: 441 logger.warning("No eval items available.") 442 return 443 444 eval_size = min(self.config.eval_size, len(items)) 445 eval_items = items[:eval_size] 446 447 logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...") 448 start_time = time.time() 449 samples = [] 450 451 # Resolve tools once for all eval items 452 tools, valid_names = self._resolve_tools_for_group() 453 454 for i, item in enumerate(eval_items): 455 task_id = str(uuid.uuid4()) 456 logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...") 457 458 try: 459 # Build messages 460 messages: List[Dict[str, Any]] = [] 461 if self.config.system_prompt: 462 messages.append({"role": "system", "content": self.config.system_prompt}) 463 messages.append({"role": "user", "content": self.format_prompt(item)}) 464 465 # Run the full agent loop with tools 466 agent = HermesAgentLoop( 467 server=self.server, 468 tool_schemas=tools, 469 valid_tool_names=valid_names, 470 max_turns=self.config.max_agent_turns, 471 task_id=task_id, 472 temperature=0.0, # Deterministic for eval 473 max_tokens=self.config.max_token_length, 474 extra_body=self.config.extra_body, 475 budget_config=self.config.build_budget_config(), 476 ) 477 result = await agent.run(messages) 478 479 # Extract final response and tool usage from messages 480 final_response = "" 481 tool_call_count = 0 482 for msg in reversed(result.messages): 483 if msg.get("role") == "assistant" and msg.get("content") and not final_response: 484 final_response = msg["content"] 485 if msg.get("role") == "assistant" and msg.get("tool_calls"): 486 tool_call_count += len(msg["tool_calls"]) 487 488 # Compute reward (includes LLM judge for correctness) 489 # Temporarily save buffer lengths so we can extract the 490 # correctness score 
without calling judge twice, and avoid 491 # polluting training metric buffers with eval data. 492 buf_len = len(self._correctness_buffer) 493 ctx = ToolContext(task_id) 494 try: 495 reward = await self.compute_reward(item, result, ctx) 496 finally: 497 ctx.cleanup() 498 499 # Extract correctness from the buffer (compute_reward appended it) 500 # then remove eval entries from training buffers 501 correctness = ( 502 self._correctness_buffer[buf_len] 503 if len(self._correctness_buffer) > buf_len 504 else 0.0 505 ) 506 # Roll back buffers to avoid polluting training metrics 507 for buf in ( 508 self._reward_buffer, self._correctness_buffer, 509 self._tool_usage_buffer, self._efficiency_buffer, 510 self._diversity_buffer, 511 ): 512 if len(buf) > buf_len: 513 buf.pop() 514 515 samples.append({ 516 "prompt": item["question"], 517 "response": final_response[:500], 518 "expected": item["answer"], 519 "correctness": correctness, 520 "reward": reward, 521 "tool_calls": tool_call_count, 522 "turns": result.turns_used, 523 }) 524 525 logger.info( 526 f" → correctness={correctness:.2f}, reward={reward:.3f}, " 527 f"tools={tool_call_count}, turns={result.turns_used}" 528 ) 529 530 except Exception as e: 531 logger.error(f"Eval error on item: {e}") 532 samples.append({ 533 "prompt": item["question"], 534 "response": f"ERROR: {e}", 535 "expected": item["answer"], 536 "correctness": 0.0, 537 "reward": 0.0, 538 "tool_calls": 0, 539 "turns": 0, 540 }) 541 542 end_time = time.time() 543 544 # Compute aggregate metrics 545 correctness_scores = [s["correctness"] for s in samples] 546 rewards = [s["reward"] for s in samples] 547 tool_counts = [s["tool_calls"] for s in samples] 548 n = len(samples) 549 550 eval_metrics = { 551 "eval/mean_correctness": sum(correctness_scores) / n if n else 0.0, 552 "eval/mean_reward": sum(rewards) / n if n else 0.0, 553 "eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0, 554 "eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n 
else 0.0, 555 "eval/n_items": n, 556 } 557 558 logger.info( 559 f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, " 560 f"reward={eval_metrics['eval/mean_reward']:.3f}, " 561 f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}" 562 ) 563 564 await self.evaluate_log( 565 metrics=eval_metrics, 566 samples=samples, 567 start_time=start_time, 568 end_time=end_time, 569 ) 570 571 # ------------------------------------------------------------------ 572 # 6. wandb_log — custom metrics 573 # ------------------------------------------------------------------ 574 575 async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None: 576 """Log reward breakdown metrics to wandb.""" 577 if wandb_metrics is None: 578 wandb_metrics = {} 579 580 if self._reward_buffer: 581 n = len(self._reward_buffer) 582 wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n 583 wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n 584 wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n 585 wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n 586 wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n 587 wandb_metrics["train/total_rollouts"] = n 588 589 # Accuracy buckets 590 wandb_metrics["train/correct_rate"] = ( 591 sum(1 for c in self._correctness_buffer if c >= 0.7) / n 592 ) 593 wandb_metrics["train/tool_usage_rate"] = ( 594 sum(1 for t in self._tool_usage_buffer if t > 0) / n 595 ) 596 597 # Clear buffers 598 self._reward_buffer.clear() 599 self._correctness_buffer.clear() 600 self._tool_usage_buffer.clear() 601 self._efficiency_buffer.clear() 602 self._diversity_buffer.clear() 603 604 await super().wandb_log(wandb_metrics) 605 606 # ------------------------------------------------------------------ 607 # Private helpers 608 # ------------------------------------------------------------------ 609 610 async def _llm_judge( 611 self, 612 question: str, 613 
expected: str, 614 model_answer: str, 615 ) -> float: 616 """ 617 Use the server's LLM to judge answer correctness. 618 Falls back to keyword heuristic if LLM call fails. 619 """ 620 if not model_answer or not model_answer.strip(): 621 return 0.0 622 623 judge_prompt = ( 624 "You are an impartial judge evaluating the quality of an AI research answer.\n\n" 625 f"Question: {question}\n\n" 626 f"Reference answer: {expected}\n\n" 627 f"Model answer: {model_answer}\n\n" 628 "Score the model answer on a scale from 0.0 to 1.0 where:\n" 629 " 1.0 = fully correct and complete\n" 630 " 0.7 = mostly correct with minor gaps\n" 631 " 0.4 = partially correct\n" 632 " 0.1 = mentions relevant topic but wrong or very incomplete\n" 633 " 0.0 = completely wrong or no answer\n\n" 634 "Consider: factual accuracy, completeness, and relevance.\n" 635 'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}' 636 ) 637 638 try: 639 response = await self.server.chat_completion( 640 messages=[{"role": "user", "content": judge_prompt}], 641 n=1, 642 max_tokens=150, 643 temperature=0.0, 644 split="eval", 645 ) 646 text = response.choices[0].message.content if response.choices else "" 647 parsed = self._parse_judge_json(text) 648 if parsed is not None: 649 return float(parsed) 650 except Exception as e: 651 logger.debug(f"LLM judge failed: {e}. 
Using heuristic.") 652 653 return self._heuristic_score(expected, model_answer) 654 655 @staticmethod 656 def _parse_judge_json(text: str) -> Optional[float]: 657 """Extract the score float from LLM judge JSON response.""" 658 try: 659 clean = re.sub(r"```(?:json)?|```", "", text).strip() 660 data = json.loads(clean) 661 score = float(data.get("score", -1)) 662 if 0.0 <= score <= 1.0: 663 return score 664 except Exception: 665 match = re.search(r'"score"\s*:\s*([0-9.]+)', text) 666 if match: 667 score = float(match.group(1)) 668 if 0.0 <= score <= 1.0: 669 return score 670 return None 671 672 @staticmethod 673 def _heuristic_score(expected: str, model_answer: str) -> float: 674 """Lightweight keyword overlap score as fallback.""" 675 stopwords = { 676 "the", "a", "an", "is", "are", "was", "were", "of", "in", "on", 677 "at", "to", "for", "with", "and", "or", "but", "it", "its", 678 "this", "that", "as", "by", "from", "be", "has", "have", "had", 679 } 680 681 def tokenize(text: str) -> set: 682 tokens = re.findall(r'\b\w+\b', text.lower()) 683 return {t for t in tokens if t not in stopwords and len(t) > 2} 684 685 expected_tokens = tokenize(expected) 686 answer_tokens = tokenize(model_answer) 687 688 if not expected_tokens: 689 return 0.5 690 691 overlap = len(expected_tokens & answer_tokens) 692 union = len(expected_tokens | answer_tokens) 693 694 jaccard = overlap / union if union > 0 else 0.0 695 recall = overlap / len(expected_tokens) 696 return min(1.0, 0.4 * jaccard + 0.6 * recall) 697 698 @staticmethod 699 def _extract_domains(text: str) -> set: 700 """Extract unique domains from URLs cited in the response.""" 701 urls = re.findall(r'https?://[^\s\)>\]"\']+', text) 702 domains = set() 703 for url in urls: 704 try: 705 parsed = urlparse(url) 706 domain = parsed.netloc.lower().lstrip("www.") 707 if domain: 708 domains.add(domain) 709 except Exception: 710 pass 711 return domains 712 713 714 # 
--------------------------------------------------------------------------- 715 # Entry point 716 # --------------------------------------------------------------------------- 717 718 if __name__ == "__main__": 719 WebResearchEnv.cli()