# environments/web_research_env.py
  1  """
  2  WebResearchEnv — RL Environment for Multi-Step Web Research
  3  ============================================================
  4  
  5  Trains models to do accurate, efficient, multi-source web research.
  6  
  7  Reward signals:
  8    - Answer correctness  (LLM judge, 0.0–1.0)
  9    - Source diversity    (used ≥2 distinct domains)
 10    - Efficiency          (penalizes excessive tool calls)
 11    - Tool usage          (bonus for actually using web tools)
 12  
 13  Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
 14    HuggingFace: google/frames-benchmark
 15    Fallback:    built-in sample questions (no HF token needed)
 16  
 17  Usage:
 18      # Phase 1 (OpenAI-compatible server)
 19      python environments/web_research_env.py serve \\
 20          --openai.base_url http://localhost:8000/v1 \\
 21          --openai.model_name YourModel \\
 22          --openai.server_type openai
 23  
 24      # Process mode (offline data generation)
 25      python environments/web_research_env.py process \\
 26          --env.data_path_to_save_groups data/web_research.jsonl
 27  
 28      # Standalone eval
 29      python environments/web_research_env.py evaluate \\
 30          --openai.base_url http://localhost:8000/v1 \\
 31          --openai.model_name YourModel
 32  
 33  Built by: github.com/jackx707
 34  Inspired by: GroceryMind — production Hermes agent doing live web research
 35               across German grocery stores (firecrawl + hermes-agent)
 36  """
 37  
 38  from __future__ import annotations
 39  
 40  import asyncio
 41  import json
 42  import logging
 43  import os
 44  import random
 45  import re
 46  import sys
 47  from pathlib import Path
 48  from typing import Any, Dict, List, Optional, Tuple
 49  from urllib.parse import urlparse
 50  
 51  from pydantic import Field
 52  
# Ensure hermes-agent root is on path so the `environments.*` absolute imports
# below resolve when this file is executed directly as a script.
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

# ---------------------------------------------------------------------------
# Optional HuggingFace datasets import
# ---------------------------------------------------------------------------
# `datasets` is optional: when it is missing, setup() falls back to the
# built-in SAMPLE_QUESTIONS below instead of the FRAMES benchmark.
try:
    from datasets import load_dataset
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
 66  
 67  from atroposlib.envs.base import ScoredDataGroup
 68  from atroposlib.envs.server_handling.server_manager import APIServerConfig
 69  from atroposlib.type_definitions import Item
 70  
 71  from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
 72  from environments.agent_loop import AgentResult
 73  from environments.tool_context import ToolContext
 74  
 75  logger = logging.getLogger(__name__)
 76  
 77  # ---------------------------------------------------------------------------
 78  # Fallback sample dataset (used when HuggingFace is unavailable)
 79  # Multi-hop questions requiring real web search to answer.
 80  # ---------------------------------------------------------------------------
 81  SAMPLE_QUESTIONS = [
 82      {
 83          "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
 84          "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
 85          "difficulty": "medium",
 86          "hops": 2,
 87      },
 88      {
 89          "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
 90          "answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
 91          "difficulty": "medium",
 92          "hops": 2,
 93      },
 94      {
 95          "question": "What programming language was used to write the original version of the web framework used by Instagram?",
 96          "answer": "Django, which Instagram was built on, is written in Python.",
 97          "difficulty": "easy",
 98          "hops": 2,
 99      },
100      {
101          "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
102          "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
103          "difficulty": "hard",
104          "hops": 3,
105      },
106      {
107          "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
108          "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
109          "difficulty": "medium",
110          "hops": 2,
111      },
112      {
113          "question": "How many employees does the parent company of Instagram have?",
114          "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
115          "difficulty": "medium",
116          "hops": 2,
117      },
118      {
119          "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
120          "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
121          "difficulty": "hard",
122          "hops": 2,
123      },
124      {
125          "question": "Which company acquired the startup founded by the creator of Oculus VR?",
126          "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
127          "difficulty": "medium",
128          "hops": 2,
129      },
130      {
131          "question": "What is the market cap of the company that owns the most popular search engine in Russia?",
132          "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
133          "difficulty": "hard",
134          "hops": 2,
135      },
136      {
137          "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
138          "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
139          "difficulty": "hard",
140          "hops": 2,
141      },
142  ]
143  
144  
145  # ---------------------------------------------------------------------------
146  # Configuration
147  # ---------------------------------------------------------------------------
148  
class WebResearchEnvConfig(HermesAgentEnvConfig):
    """Configuration for the web research RL environment.

    The three weights are combined linearly in ``compute_reward`` and the
    total (plus ``diversity_bonus``) is clamped to [0, 1].
    """

    # Reward weights — correctness_weight + tool_usage_weight +
    # efficiency_weight sum to 1.0 by default; diversity_bonus is added
    # on top of the weighted sum before clamping.
    correctness_weight: float = Field(
        default=0.6,
        description="Weight for answer correctness in reward (LLM judge score).",
    )
    tool_usage_weight: float = Field(
        default=0.2,
        description="Weight for tool usage signal (did the model actually use web tools?).",
    )
    efficiency_weight: float = Field(
        default=0.2,
        description="Weight for efficiency signal (penalizes excessive tool calls).",
    )
    diversity_bonus: float = Field(
        default=0.1,
        description="Bonus reward for citing ≥2 distinct domains.",
    )

    # Efficiency thresholds — below efficient_max_calls the efficiency
    # signal is 1.0; the penalty slope steepens past heavy_penalty_calls.
    efficient_max_calls: int = Field(
        default=5,
        description="Maximum tool calls before efficiency penalty begins.",
    )
    heavy_penalty_calls: int = Field(
        default=10,
        description="Tool call count where efficiency penalty steepens.",
    )

    # Eval — the held-out size is max(eval_size, ratio * dataset size)
    # when loading from HuggingFace.
    eval_size: int = Field(
        default=20,
        description="Number of held-out items for evaluation.",
    )
    eval_split_ratio: float = Field(
        default=0.1,
        description="Fraction of dataset to hold out for evaluation (0.0–1.0).",
    )

    # Dataset
    dataset_name: str = Field(
        default="google/frames-benchmark",
        description="HuggingFace dataset name for research questions.",
    )
195  
196  
197  # ---------------------------------------------------------------------------
198  # Environment
199  # ---------------------------------------------------------------------------
200  
class WebResearchEnv(HermesAgentBaseEnv):
    """
    RL environment for training multi-step web research skills.

    The model is given a factual question requiring 2-3 hops of web research
    and must use web_search / web_extract tools to find and synthesize the answer.

    Reward is multi-signal:
      60% — answer correctness (LLM judge)
      20% — tool usage (did the model actually search the web?)
      20% — efficiency (penalizes >5 tool calls)

    Bonus +0.1 for source diversity (≥2 distinct domains cited).
    """

    name = "web-research"
    env_config_cls = WebResearchEnvConfig

    # Default toolsets for this environment — web + file for saving notes
    default_toolsets = ["web", "file"]

    @classmethod
    def config_init(cls) -> Tuple[WebResearchEnvConfig, List[APIServerConfig]]:
        """Default configuration for the web research environment.

        Returns:
            Tuple of (env config, API server configs). The default server
            points at OpenRouter and reads OPENROUTER_API_KEY from the env.
        """
        env_config = WebResearchEnvConfig(
            enabled_toolsets=["web", "file"],
            max_agent_turns=15,
            agent_temperature=1.0,
            system_prompt=(
                "You are a highly capable research agent. When asked a factual question, "
                "always use web_search to find current, accurate information before answering. "
                "Cite at least 2 sources. Be concise and accurate."
            ),
            group_size=4,
            total_steps=1000,
            steps_per_eval=100,
            use_wandb=True,
            wandb_name="web-research",
        )

        server_configs = [
            APIServerConfig(
                base_url="https://openrouter.ai/api/v1",
                model_name="anthropic/claude-sonnet-4.5",
                server_type="openai",
                api_key=os.getenv("OPENROUTER_API_KEY", ""),
                health_check=False,
            )
        ]

        return env_config, server_configs

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._items: list[dict] = []       # training questions (filled by setup)
        self._eval_items: list[dict] = []  # held-out eval questions
        self._index: int = 0               # round-robin cursor into _items

        # Metrics tracking for wandb — appended by compute_reward,
        # drained (cleared) by wandb_log.
        self._reward_buffer: list[float] = []
        self._correctness_buffer: list[float] = []
        self._tool_usage_buffer: list[float] = []
        self._efficiency_buffer: list[float] = []
        self._diversity_buffer: list[float] = []

    # ------------------------------------------------------------------
    # 1. Setup — load dataset
    # ------------------------------------------------------------------

    async def setup(self) -> None:
        """Load the FRAMES benchmark or fall back to built-in samples.

        Populates self._items (train) and self._eval_items (held out).
        """
        if HF_AVAILABLE:
            try:
                logger.info("Loading FRAMES benchmark from HuggingFace...")
                ds = load_dataset(self.config.dataset_name, split="test")
                self._items = [
                    {
                        "question": row["Prompt"],
                        "answer": row["Answer"],
                        "difficulty": row.get("reasoning_types", "unknown"),
                        "hops": 2,
                    }
                    for row in ds
                ]
                # Hold out for eval: the larger of the configured absolute
                # size and the configured fraction of the dataset.
                eval_size = max(
                    self.config.eval_size,
                    int(len(self._items) * self.config.eval_split_ratio),
                )
                random.shuffle(self._items)
                self._eval_items = self._items[:eval_size]
                self._items = self._items[eval_size:]
                logger.info(
                    f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
                    f"from FRAMES benchmark."
                )
                return
            except Exception as e:
                logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")

        # Fallback: shuffle a *copy* of the built-in samples. The original
        # code shuffled SAMPLE_QUESTIONS in place, mutating the module-level
        # constant and leaking shuffled state across instances / repeated
        # setup() calls.
        pool = list(SAMPLE_QUESTIONS)
        random.shuffle(pool)
        split = max(1, len(pool) * 8 // 10)  # ~80/20 train/eval split
        self._items = pool[:split]
        self._eval_items = pool[split:]
        logger.info(
            f"Using built-in sample dataset: {len(self._items)} train / "
            f"{len(self._eval_items)} eval items."
        )

    # ------------------------------------------------------------------
    # 2. get_next_item — return the next question
    # ------------------------------------------------------------------

    async def get_next_item(self) -> dict:
        """Return the next item, cycling through the dataset.

        Raises:
            RuntimeError: if setup() has not populated the dataset.
        """
        if not self._items:
            raise RuntimeError("Dataset is empty. Did you call setup()?")
        item = self._items[self._index % len(self._items)]
        self._index += 1
        return item

    # ------------------------------------------------------------------
    # 3. format_prompt — build the user-facing prompt
    # ------------------------------------------------------------------

    def format_prompt(self, item: dict) -> str:
        """Format the research question as a task prompt."""
        return (
            f"Research the following question thoroughly using web search. "
            f"You MUST search the web to find current, accurate information — "
            f"do not rely solely on your training data.\n\n"
            f"Question: {item['question']}\n\n"
            f"Requirements:\n"
            f"- Use web_search and/or web_extract tools to find information\n"
            f"- Search at least 2 different sources\n"
            f"- Provide a concise, accurate answer (2-4 sentences)\n"
            f"- Cite the sources you used"
        )

    # ------------------------------------------------------------------
    # 4. compute_reward — multi-signal scoring
    # ------------------------------------------------------------------

    async def compute_reward(
        self,
        item: dict,
        result: AgentResult,
        ctx: ToolContext,
    ) -> float:
        """
        Multi-signal reward function:

          correctness_weight * correctness  — LLM judge comparing answer to ground truth
          tool_usage_weight  * tool_used    — binary: did the model use web tools?
          efficiency_weight  * efficiency   — penalizes wasteful tool usage
          + diversity_bonus                 — source diversity (≥2 distinct domains)

        The combined reward is clamped to [0, 1].
        """
        # Extract final response from messages (last assistant message with content)
        final_response = ""
        tools_used: list[str] = []
        for msg in reversed(result.messages):
            if msg.get("role") == "assistant" and msg.get("content") and not final_response:
                final_response = msg["content"]
            # Collect tool names from tool call messages
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    fn = tc.get("function", {}) if isinstance(tc, dict) else {}
                    name = fn.get("name", "")
                    if name:
                        tools_used.append(name)
        # NOTE(review): turns_used counts agent turns, not individual tool
        # calls — this is a proxy; confirm against AgentResult semantics.
        tool_call_count: int = result.turns_used or len(tools_used)

        cfg = self.config

        # ---- Signal 1: Answer correctness (LLM judge) ----------------
        correctness = await self._llm_judge(
            question=item["question"],
            expected=item["answer"],
            model_answer=final_response,
        )

        # ---- Signal 2: Web tool usage --------------------------------
        web_tools = {"web_search", "web_extract", "search", "firecrawl"}
        tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0

        # ---- Signal 3: Efficiency ------------------------------------
        # 1.0 up to efficient_max_calls, then a gentle linear penalty,
        # then a steeper one past heavy_penalty_calls (floored at 0).
        if tool_call_count <= cfg.efficient_max_calls:
            efficiency = 1.0
        elif tool_call_count <= cfg.heavy_penalty_calls:
            efficiency = 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.08
        else:
            efficiency = max(0.0, 1.0 - (tool_call_count - cfg.efficient_max_calls) * 0.12)

        # ---- Bonus: Source diversity ---------------------------------
        domains = self._extract_domains(final_response)
        diversity = cfg.diversity_bonus if len(domains) >= 2 else 0.0

        # ---- Combine ------------------------------------------------
        reward = (
            cfg.correctness_weight * correctness
            + cfg.tool_usage_weight * tool_used
            + cfg.efficiency_weight * efficiency
            + diversity
        )
        reward = min(1.0, max(0.0, reward))  # clamp to [0, 1]

        # Track for wandb
        self._reward_buffer.append(reward)
        self._correctness_buffer.append(correctness)
        self._tool_usage_buffer.append(tool_used)
        self._efficiency_buffer.append(efficiency)
        self._diversity_buffer.append(diversity)

        logger.debug(
            f"Reward breakdown — correctness={correctness:.2f}, "
            f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
            f"diversity={diversity:.1f} → total={reward:.3f}"
        )

        return reward

    # ------------------------------------------------------------------
    # 5. evaluate — run on held-out eval split
    # ------------------------------------------------------------------

    async def evaluate(self, *args, **kwargs) -> None:
        """Run evaluation on the held-out split using the full agent loop with tools.

        Each eval item runs through the same agent loop as training —
        the model can use web_search, web_extract, etc. to research answers.
        This measures actual agentic research capability, not just knowledge.
        """
        import time
        import uuid
        from environments.agent_loop import HermesAgentLoop
        from environments.tool_context import ToolContext

        items = self._eval_items
        if not items:
            logger.warning("No eval items available.")
            return

        eval_size = min(self.config.eval_size, len(items))
        eval_items = items[:eval_size]

        logger.info(f"Running eval on {len(eval_items)} questions (with agent loop + tools)...")
        start_time = time.time()
        samples = []

        # Resolve tools once for all eval items
        tools, valid_names = self._resolve_tools_for_group()

        for i, item in enumerate(eval_items):
            task_id = str(uuid.uuid4())
            logger.info(f"Eval [{i+1}/{len(eval_items)}]: {item['question'][:80]}...")

            try:
                # Build messages
                messages: List[Dict[str, Any]] = []
                if self.config.system_prompt:
                    messages.append({"role": "system", "content": self.config.system_prompt})
                messages.append({"role": "user", "content": self.format_prompt(item)})

                # Run the full agent loop with tools
                agent = HermesAgentLoop(
                    server=self.server,
                    tool_schemas=tools,
                    valid_tool_names=valid_names,
                    max_turns=self.config.max_agent_turns,
                    task_id=task_id,
                    temperature=0.0,  # Deterministic for eval
                    max_tokens=self.config.max_token_length,
                    extra_body=self.config.extra_body,
                    budget_config=self.config.build_budget_config(),
                )
                result = await agent.run(messages)

                # Extract final response and tool usage from messages
                final_response = ""
                tool_call_count = 0
                for msg in reversed(result.messages):
                    if msg.get("role") == "assistant" and msg.get("content") and not final_response:
                        final_response = msg["content"]
                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
                        tool_call_count += len(msg["tool_calls"])

                # Compute reward (includes LLM judge for correctness).
                # Record buffer length first so the correctness score can be
                # read back without calling the judge twice, and so the eval
                # entries can be rolled back from the training buffers.
                buf_len = len(self._correctness_buffer)
                ctx = ToolContext(task_id)
                try:
                    reward = await self.compute_reward(item, result, ctx)
                finally:
                    ctx.cleanup()

                # Extract correctness from the buffer (compute_reward appended it)
                correctness = (
                    self._correctness_buffer[buf_len]
                    if len(self._correctness_buffer) > buf_len
                    else 0.0
                )
                # Roll back buffers to avoid polluting training metrics
                # (compute_reward appends exactly one entry per buffer)
                for buf in (
                    self._reward_buffer, self._correctness_buffer,
                    self._tool_usage_buffer, self._efficiency_buffer,
                    self._diversity_buffer,
                ):
                    if len(buf) > buf_len:
                        buf.pop()

                samples.append({
                    "prompt": item["question"],
                    "response": final_response[:500],
                    "expected": item["answer"],
                    "correctness": correctness,
                    "reward": reward,
                    "tool_calls": tool_call_count,
                    "turns": result.turns_used,
                })

                logger.info(
                    f"  → correctness={correctness:.2f}, reward={reward:.3f}, "
                    f"tools={tool_call_count}, turns={result.turns_used}"
                )

            except Exception as e:
                # Best-effort: record the failure as a zero-score sample so
                # aggregate metrics still reflect the full eval set.
                logger.error(f"Eval error on item: {e}")
                samples.append({
                    "prompt": item["question"],
                    "response": f"ERROR: {e}",
                    "expected": item["answer"],
                    "correctness": 0.0,
                    "reward": 0.0,
                    "tool_calls": 0,
                    "turns": 0,
                })

        end_time = time.time()

        # Compute aggregate metrics
        correctness_scores = [s["correctness"] for s in samples]
        rewards = [s["reward"] for s in samples]
        tool_counts = [s["tool_calls"] for s in samples]
        n = len(samples)

        eval_metrics = {
            "eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
            "eval/mean_reward": sum(rewards) / n if n else 0.0,
            "eval/mean_tool_calls": sum(tool_counts) / n if n else 0.0,
            "eval/tool_usage_rate": sum(1 for t in tool_counts if t > 0) / n if n else 0.0,
            "eval/n_items": n,
        }

        logger.info(
            f"Eval complete — correctness={eval_metrics['eval/mean_correctness']:.3f}, "
            f"reward={eval_metrics['eval/mean_reward']:.3f}, "
            f"tool_usage={eval_metrics['eval/tool_usage_rate']:.0%}"
        )

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    # ------------------------------------------------------------------
    # 6. wandb_log — custom metrics
    # ------------------------------------------------------------------

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
        """Log reward breakdown metrics to wandb and clear the buffers."""
        if wandb_metrics is None:
            wandb_metrics = {}

        if self._reward_buffer:
            n = len(self._reward_buffer)
            wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
            wandb_metrics["train/mean_correctness"] = sum(self._correctness_buffer) / n
            wandb_metrics["train/mean_tool_usage"] = sum(self._tool_usage_buffer) / n
            wandb_metrics["train/mean_efficiency"] = sum(self._efficiency_buffer) / n
            wandb_metrics["train/mean_diversity"] = sum(self._diversity_buffer) / n
            wandb_metrics["train/total_rollouts"] = n

            # Accuracy buckets (correctness ≥ 0.7 counts as "correct")
            wandb_metrics["train/correct_rate"] = (
                sum(1 for c in self._correctness_buffer if c >= 0.7) / n
            )
            wandb_metrics["train/tool_usage_rate"] = (
                sum(1 for t in self._tool_usage_buffer if t > 0) / n
            )

            # Clear buffers so each log window reports fresh statistics
            self._reward_buffer.clear()
            self._correctness_buffer.clear()
            self._tool_usage_buffer.clear()
            self._efficiency_buffer.clear()
            self._diversity_buffer.clear()

        await super().wandb_log(wandb_metrics)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    async def _llm_judge(
        self,
        question: str,
        expected: str,
        model_answer: str,
    ) -> float:
        """
        Use the server's LLM to judge answer correctness.
        Falls back to keyword heuristic if LLM call fails.

        Returns a score in [0.0, 1.0]; 0.0 for an empty answer.
        """
        if not model_answer or not model_answer.strip():
            return 0.0

        judge_prompt = (
            "You are an impartial judge evaluating the quality of an AI research answer.\n\n"
            f"Question: {question}\n\n"
            f"Reference answer: {expected}\n\n"
            f"Model answer: {model_answer}\n\n"
            "Score the model answer on a scale from 0.0 to 1.0 where:\n"
            "  1.0 = fully correct and complete\n"
            "  0.7 = mostly correct with minor gaps\n"
            "  0.4 = partially correct\n"
            "  0.1 = mentions relevant topic but wrong or very incomplete\n"
            "  0.0 = completely wrong or no answer\n\n"
            "Consider: factual accuracy, completeness, and relevance.\n"
            'Respond with ONLY a JSON object: {"score": <float>, "reason": "<one sentence>"}'
        )

        try:
            response = await self.server.chat_completion(
                messages=[{"role": "user", "content": judge_prompt}],
                n=1,
                max_tokens=150,
                temperature=0.0,
                split="eval",
            )
            text = response.choices[0].message.content if response.choices else ""
            parsed = self._parse_judge_json(text)
            if parsed is not None:
                return float(parsed)
        except Exception as e:
            logger.debug(f"LLM judge failed: {e}. Using heuristic.")

        return self._heuristic_score(expected, model_answer)

    @staticmethod
    def _parse_judge_json(text: str) -> Optional[float]:
        """Extract the score float from LLM judge JSON response.

        Strips markdown code fences, then tries strict JSON; on failure,
        falls back to a regex scrape for the "score" field. Returns None
        when no in-range score can be recovered.
        """
        try:
            clean = re.sub(r"```(?:json)?|```", "", text).strip()
            data = json.loads(clean)
            score = float(data.get("score", -1))
            if 0.0 <= score <= 1.0:
                return score
        except Exception:
            match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
            if match:
                score = float(match.group(1))
                if 0.0 <= score <= 1.0:
                    return score
        return None

    @staticmethod
    def _heuristic_score(expected: str, model_answer: str) -> float:
        """Lightweight keyword overlap score as fallback.

        Blends Jaccard similarity (40%) and recall of the expected tokens
        (60%), capped at 1.0. Returns a neutral 0.5 when the expected
        answer has no content tokens to compare against.
        """
        stopwords = {
            "the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
            "at", "to", "for", "with", "and", "or", "but", "it", "its",
            "this", "that", "as", "by", "from", "be", "has", "have", "had",
        }

        def tokenize(text: str) -> set:
            tokens = re.findall(r'\b\w+\b', text.lower())
            return {t for t in tokens if t not in stopwords and len(t) > 2}

        expected_tokens = tokenize(expected)
        answer_tokens = tokenize(model_answer)

        if not expected_tokens:
            return 0.5

        overlap = len(expected_tokens & answer_tokens)
        union = len(expected_tokens | answer_tokens)

        jaccard = overlap / union if union > 0 else 0.0
        recall = overlap / len(expected_tokens)
        return min(1.0, 0.4 * jaccard + 0.6 * recall)

    @staticmethod
    def _extract_domains(text: str) -> set:
        """Extract unique domains from URLs cited in the response."""
        urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
        domains = set()
        for url in urls:
            try:
                parsed = urlparse(url)
                # BUG FIX: the original used lstrip("www."), which strips any
                # leading run of 'w'/'.' characters (e.g. "web.archive.org"
                # -> "eb.archive.org"). removeprefix removes exactly the
                # "www." prefix when present.
                domain = parsed.netloc.lower().removeprefix("www.")
                if domain:
                    domains.add(domain)
            except Exception:
                pass
        return domains
712  
713  
714  # ---------------------------------------------------------------------------
715  # Entry point
716  # ---------------------------------------------------------------------------
717  
718  if __name__ == "__main__":
719      WebResearchEnv.cli()