# web_search_tools.py
"""
Web Search Tools for Solace Agent Mesh
Provides web search capabilities using Google Custom Search API.

For other search providers (e.g., Exa, Brave, Tavily), please use the corresponding
plugins from the solace-agent-mesh-plugins repository.
"""

import logging
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Union
from urllib.parse import quote, urlparse

from google.adk.tools import ToolContext

from ...tools.web_search import GoogleSearchTool, SearchResult
from .tool_definition import BuiltinTool
from .registry import tool_registry
from ...common.rag_dto import create_rag_source, create_rag_search_result

log = logging.getLogger(__name__)

CATEGORY_NAME = "web_search"
CATEGORY_DESCRIPTION = "Tools for searching the web and retrieving current information"

# State key for tracking search turns within a task/session
_SEARCH_TURN_STATE_KEY = "web_search_turn_counter"


def _get_next_search_turn(tool_context: Optional[ToolContext]) -> int:
    """
    Return the turn number for the current search and advance the counter.

    The counter is kept in ``tool_context.state`` under
    ``_SEARCH_TURN_STATE_KEY`` rather than in a module-level global, so it is
    scoped to whatever that state object is scoped to.

    Each search gets its own turn number, so citation IDs from different
    searches never collide (e.g., s0r0, s0r1 for the first search and
    s1r0, s1r1 for the second).

    Args:
        tool_context: ADK tool context whose ``state`` mapping holds the counter.

    Returns:
        The turn number to use for this search (0 for the first search, or
        when no tool context is available).
    """
    if not tool_context:
        # Fallback: return 0 if no context (shouldn't happen in practice)
        log.warning("[web_search] No tool_context provided, using turn=0")
        return 0

    # Read the current turn (0 when unset), then store the value for the next search.
    current_turn = tool_context.state.get(_SEARCH_TURN_STATE_KEY, 0)
    tool_context.state[_SEARCH_TURN_STATE_KEY] = current_turn + 1

    return current_turn


def _favicon_url(link: Optional[str]) -> str:
    """
    Build the favicon lookup URL for a search result link.

    Only the host name is sent as the ``domain`` parameter, and it is
    percent-encoded. Interpolating the raw result URL would let any ``&``,
    ``?`` or ``#`` in the link leak into (and corrupt) the favicon query string.

    Args:
        link: The result URL; may be empty or None.

    Returns:
        The favicon URL, or an empty string when there is no link.
    """
    if not link:
        return ""

    try:
        host = urlparse(link).hostname or ""
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets).
        host = ""

    # If no host can be extracted (e.g. scheme-less link), fall back to the
    # whole link, still percent-encoded so it stays a single parameter value.
    domain = quote(host or link, safe="")
    return f"https://www.google.com/s2/favicons?domain={domain}&sz=32"


def _format_search_results(
    query: str,
    search_turn: int,
    organic: Any,
    citation_ids: List[str],
) -> str:
    """
    Render the organic results as a text block that ties each citation ID to its content.

    This layout helps the LLM match citations to the result they came from.

    Args:
        query: The search query that produced the results.
        search_turn: Turn number of this search.
        organic: Iterable of organic results exposing ``title``, ``link`` and ``snippet``.
        citation_ids: Citation IDs in the same order as ``organic``.

    Returns:
        The newline-joined text block.
    """
    lines = [
        f"=== SEARCH RESULTS (Turn {search_turn}) ===",
        f"Query: {query}",
        f"Valid citation IDs: {', '.join(citation_ids)}",
        "",
    ]

    for position, (citation_id, source) in enumerate(zip(citation_ids, organic), start=1):
        lines.extend(
            [
                f"--- RESULT {position} ---",
                f"CITATION ID: [[cite:{citation_id}]]",
                f"TITLE: {source.title}",
                f"URL: {source.link}",
                f"CONTENT: {source.snippet}",
                f"USE [[cite:{citation_id}]] to cite facts from THIS result only",
                "",
            ]
        )

    lines.extend(
        [
            "=== END SEARCH RESULTS ===",
            "",
            "IMPORTANT: Each citation ID is UNIQUE to its result.",
            "Only use a citation ID for facts that appear in THAT specific result's CONTENT.",
        ]
    )
    return "\n".join(lines)


async def web_search_google(
    query: str,
    max_results: int = 5,
    search_type: Optional[str] = None,
    date_restrict: Optional[str] = None,
    safe_search: Optional[str] = None,
    tool_context: Optional[ToolContext] = None,
    tool_config: Optional[Dict[str, Any]] = None,
    **kwargs
) -> Union[str, Dict[str, Any]]:
    """
    Search the web using Google Custom Search API.

    Args:
        query: The search query string
        max_results: Maximum number of results to return (1-10)
        search_type: Set to 'image' for image search
        date_restrict: Restrict results by recency (e.g., 'd7' for last 7 days)
        safe_search: Safe search level - 'off', 'medium', or 'high'
        tool_context: ADK tool context; its state holds the search turn counter
        tool_config: Tool configuration containing ``google_search_api_key``
            and ``google_cse_id``
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``GoogleSearchTool.search``

    Returns:
        On success, a dict with the keys ``result`` (JSON dump of the raw
        search result), ``formatted_results`` (text block for the LLM),
        ``rag_metadata``, ``valid_citation_ids``, ``num_results`` and
        ``search_turn``.
        On failure (missing configuration, unsuccessful search, or an
        unexpected exception), a string starting with ``"Error"``.
    """
    log_identifier = "[web_search_google]"

    try:
        config = tool_config or {}
        api_key = config.get("google_search_api_key")
        search_engine_id = config.get("google_cse_id")

        if not api_key or not search_engine_id:
            error_msg = "google_search_api_key or google_cse_id not configured in tool_config"
            log.error("%s %s", log_identifier, error_msg)
            return f"Error: {error_msg}"

        tool = GoogleSearchTool(
            api_key=api_key,
            search_engine_id=search_engine_id
        )

        result: SearchResult = await tool.search(
            query=query,
            max_results=max_results,
            search_type=search_type,
            date_restrict=date_restrict,
            safe_search=safe_search,
            **kwargs
        )

        if not result.success:
            log.error("%s Search failed: %s", log_identifier, result.error)
            return f"Error: {result.error}"

        # Get unique search turn for this search to prevent citation ID collisions.
        # The turn is only consumed once the search has succeeded.
        search_turn = _get_next_search_turn(tool_context)
        citation_prefix = f"s{search_turn}r"  # e.g., s0r0, s0r1 for first search; s1r0, s1r1 for second

        log.info(
            "%s Search successful: %d results, %d images (turn=%d, citation_prefix=%s)",
            log_identifier,
            len(result.organic),
            len(result.images),
            search_turn,
            citation_prefix
        )

        # One timestamp for the whole search so every source and the summary agree.
        retrieved_at = datetime.now(timezone.utc).isoformat()

        rag_sources = []
        valid_citation_ids = []

        # Log citation-to-source mapping for debugging
        log.debug("%s === CITATION TO SOURCE MAPPING (turn %d) ===", log_identifier, search_turn)

        for i, source in enumerate(result.organic):
            citation_id = f"{citation_prefix}{i}"
            valid_citation_ids.append(citation_id)

            # Log each citation mapping at debug level
            log.debug(
                "%s Citation [[cite:%s]] -> URL: %s | Title: %s",
                log_identifier,
                citation_id,
                source.link,
                source.title[:50] if source.title else "N/A"
            )

            rag_sources.append(
                create_rag_source(
                    citation_id=citation_id,
                    file_id=f"web_search_{search_turn}_{i}",
                    filename=source.attribution or source.title,
                    title=source.title,
                    source_url=source.link,
                    url=source.link,
                    content_preview=source.snippet,
                    relevance_score=1.0,
                    source_type="web",
                    retrieved_at=retrieved_at,
                    metadata={
                        "title": source.title,
                        "link": source.link,
                        "type": "web_search",
                        "favicon": _favicon_url(source.link),
                    }
                )
            )

        log.debug("%s === END CITATION MAPPING ===", log_identifier)
        log.debug("%s Valid citation IDs for this search: %s", log_identifier, valid_citation_ids)

        # Image sources use their own "img" ID namespace and are deliberately
        # not added to valid_citation_ids.
        for i, image in enumerate(result.images):
            rag_sources.append(
                create_rag_source(
                    citation_id=f"img{search_turn}r{i}",
                    file_id=f"web_search_image_{search_turn}_{i}",
                    filename=image.title or f"Image {i+1}",
                    title=image.title,
                    source_url=image.link,
                    url=image.link,
                    content_preview=image.title or "",
                    relevance_score=1.0,
                    source_type="image",
                    retrieved_at=retrieved_at,
                    metadata={
                        "title": image.title,
                        "link": image.link,
                        "imageUrl": image.imageUrl,
                        "type": "image",
                    }
                )
            )

        rag_metadata = create_rag_search_result(
            query=query,
            search_type="web_search",
            timestamp=retrieved_at,
            sources=rag_sources
        )

        # Build a formatted result string that clearly associates each citation ID with its content
        formatted_results = _format_search_results(
            query, search_turn, result.organic, valid_citation_ids
        )

        return {
            "result": result.model_dump_json(),
            "formatted_results": formatted_results,
            "rag_metadata": rag_metadata,
            "valid_citation_ids": valid_citation_ids,
            "num_results": len(result.organic),
            "search_turn": search_turn
        }

    except Exception as e:
        # Top-level tool boundary: report the failure to the caller as text
        # instead of letting the exception escape.
        log.exception("%s Unexpected error in Google search: %s", log_identifier, e)
        return f"Error executing Google search: {str(e)}"


# Tool definition exposed to the agent: name, JSON-schema parameters and required scope.
web_search_google_tool_def = BuiltinTool(
    name="web_search_google",
    implementation=web_search_google,
    description=(
        "Search the web using Google Custom Search API. "
        "Use this when you need up-to-date information from Google. "
        "Always cite text sources using the citation format provided in your instructions. "
        "IMPORTANT: Image results will be displayed automatically in the UI - do NOT cite images, do NOT mention image URLs, and do NOT use citation markers like [[cite:imageX]] for images in your response text."
    ),
    category=CATEGORY_NAME,
    category_description=CATEGORY_DESCRIPTION,
    required_scopes=["tool:web_search:execute"],
    parameters={
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The search query"
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results (1-10)",
                "minimum": 1,
                "maximum": 10,
                "default": 5
            },
            "search_type": {
                "type": "string",
                "enum": ["image"],
                "description": "Set to 'image' for image search"
            },
            "date_restrict": {
                "type": "string",
                "description": "Restrict results by recency (e.g., 'd7' for last 7 days)"
            },
            "safe_search": {
                "type": "string",
                "enum": ["off", "medium", "high"],
                "description": "Safe search level"
            }
        },
        "required": ["query"]
    },
)

tool_registry.register(web_search_google_tool_def)

log.info("Web search tools registered: web_search_google")
log.info("Note: For Exa, Brave, and Tavily search, use plugins from solace-agent-mesh-plugins")