google_search.py
"""Google Custom Search API implementation."""

import html
import httpx
import logging
import re
from typing import Literal, Optional
from urllib.parse import quote
from .base import WebSearchTool
from .models import ImageResult, SearchResult, SearchSource

logger = logging.getLogger(__name__)

# Default number of search results to return
DEFAULT_MAX_RESULTS = 5

# Strips HTML tags from `htmlSnippet` values; compiled once rather than
# per result item.
_TAG_RE = re.compile(r'<[^>]+>')


class GoogleSearchTool(WebSearchTool):
    """Google Custom Search API implementation."""

    def __init__(self, api_key: str, search_engine_id: str, **kwargs):
        """Initialize Google Custom Search tool.

        Args:
            api_key: Google API key
            search_engine_id: Google Custom Search Engine ID (CSE ID)
            **kwargs: Additional configuration

        Raises:
            ValueError: If the API key or search engine ID is empty.
        """
        super().__init__(api_key=api_key, **kwargs)
        self.search_engine_id = search_engine_id
        self.base_url = "https://www.googleapis.com/customsearch/v1"

        if not self.api_key:
            raise ValueError("Google API key is required")
        if not self.search_engine_id:
            raise ValueError("Google Custom Search Engine ID is required")

    async def search(
        self,
        query: str,
        max_results: int = DEFAULT_MAX_RESULTS,
        search_depth: Literal["basic", "advanced"] = "basic",
        search_type: Optional[Literal["image"]] = None,
        date_restrict: Optional[str] = None,
        safe_search: Optional[Literal["off", "medium", "high"]] = None,
        **kwargs
    ) -> SearchResult:
        """Execute Google Custom Search.

        Args:
            query: Search query string
            max_results: Maximum number of results (1-10)
            search_depth: Not used for Google (kept for interface compatibility)
            search_type: Set to "image" for image search
            date_restrict: Restricts results based on recency (e.g., "d[number]" for days)
            safe_search: Safe search level
            **kwargs: Additional Google CSE parameters

        Returns:
            SearchResult object; on any failure a SearchResult with
            success=False and an error message (this method never raises).
        """
        try:
            # Ensure max_results is an integer (LLM may pass string)
            try:
                max_results = int(max_results)
            except (TypeError, ValueError):
                max_results = DEFAULT_MAX_RESULTS

            # Google CSE allows max 10 results per request
            num_results = min(max(max_results, 1), 10)

            params = self._build_params(
                query, num_results, search_type, date_restrict, safe_search, kwargs
            )

            logger.info(f"Executing Google search: query='{query}', num={num_results}")

            async with httpx.AsyncClient() as client:
                response = await client.get(
                    self.base_url,
                    params=params,
                    timeout=30.0
                )

            if response.status_code != 200:
                error_msg = f"Google API error: {response.status_code}"
                try:
                    error_data = response.json()
                    error_msg += f" - {error_data.get('error', {}).get('message', '')}"
                except Exception:
                    # Error body was not JSON; fall back to the raw text.
                    error_msg += f" - {response.text}"

                logger.error(error_msg)
                return SearchResult(
                    success=False,
                    query=query,
                    error=error_msg
                )

            data = response.json()
            items = data.get("items", [])

            # Transform results to our format. Image and web searches
            # populate disjoint result lists.
            organic = []
            images = []
            if search_type == "image":
                images = self._parse_image_results(items)
            else:
                organic = self._parse_organic_results(items)

            logger.info(f"Google search successful: {len(organic)} results")

            return SearchResult(
                success=True,
                query=query,
                organic=organic,
                images=images,
                metadata={
                    "search_engine": "google",
                    "total_results": data.get("searchInformation", {}).get("totalResults"),
                    "search_time": data.get("searchInformation", {}).get("searchTime"),
                }
            )

        except httpx.TimeoutException:
            error_msg = "Google search timed out"
            logger.error(error_msg)
            return SearchResult(
                success=False,
                query=query,
                error=error_msg
            )
        except Exception as e:
            error_msg = f"Google search failed: {str(e)}"
            logger.error(error_msg, exc_info=True)
            return SearchResult(
                success=False,
                query=query,
                error=error_msg
            )

    def _build_params(
        self,
        query: str,
        num_results: int,
        search_type: Optional[str],
        date_restrict: Optional[str],
        safe_search: Optional[str],
        extra: dict,
    ) -> dict:
        """Assemble the query-string parameters for the CSE endpoint."""
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": num_results,
        }
        # Add optional parameters only when supplied.
        if search_type:
            params["searchType"] = search_type
        if date_restrict:
            params["dateRestrict"] = date_restrict
        if safe_search:
            params["safe"] = safe_search
        # Caller-provided extras override/extend the defaults.
        params.update(extra)
        return params

    @staticmethod
    def _parse_image_results(items: list) -> list:
        """Convert raw CSE image items to ImageResult objects.

        Malformed items are logged and skipped rather than failing the
        whole search.
        """
        images = []
        for item in items:
            try:
                images.append(ImageResult(
                    imageUrl=item["link"],
                    title=item.get("title", ""),
                    link=item.get("image", {}).get("contextLink", item["link"])
                ))
            except Exception as e:
                logger.warning(f"Failed to parse image result: {e}")
                continue
        return images

    @staticmethod
    def _parse_organic_results(items: list) -> list:
        """Convert raw CSE web items to SearchSource objects.

        Malformed items are logged and skipped rather than failing the
        whole search.
        """
        organic = []
        for item in items:
            try:
                # Extract snippet (description)
                snippet = item.get("snippet", "")

                # Prefer the HTML snippet: unescape entities, then strip tags.
                if "htmlSnippet" in item:
                    snippet = _TAG_RE.sub('', html.unescape(item["htmlSnippet"]))

                # `cse_thumbnail` may be present but empty; guard the [0]
                # access so such items are not dropped with an IndexError.
                thumbnails = item.get("pagemap", {}).get("cse_thumbnail") or [{}]

                organic.append(SearchSource(
                    link=item["link"],
                    title=item["title"],
                    snippet=snippet,
                    attribution=GoogleSearchTool._extract_domain(item["link"]),
                    imageUrl=thumbnails[0].get("src")
                ))
            except Exception as e:
                logger.warning(f"Failed to parse search result: {e}")
                continue
        return organic

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Extract clean domain from URL.

        Args:
            url: Full URL

        Returns:
            The host with scheme, path, and a leading "www." removed;
            the original string unchanged if extraction fails.
        """
        try:
            # Remove protocol
            domain = url.replace("https://", "").replace("http://", "")
            # Get first part (domain)
            domain = domain.split("/")[0]
            # Remove www.
            if domain.startswith("www."):
                domain = domain[4:]
            return domain
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit etc. propagate.
            return url

    def get_tool_definition(self) -> dict:
        """Get the tool definition for LLM function calling.

        Returns:
            Dictionary containing the tool definition
        """
        return {
            "type": "function",
            "function": {
                "name": "web_search",
                "description": (
                    "Search the web using Google Custom Search. Use this when you need "
                    "up-to-date information, facts, news, or data that may not be in "
                    "your training data. Always cite sources using the citation format."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "The search query"
                        },
                        "max_results": {
                            "type": "integer",
                            "description": "Maximum number of results (1-10)",
                            "minimum": 1,
                            "maximum": 10,
                            "default": DEFAULT_MAX_RESULTS
                        },
                        "search_type": {
                            "type": "string",
                            "enum": ["image"],
                            "description": "Set to 'image' for image search"
                        },
                        "date_restrict": {
                            "type": "string",
                            "description": "Restrict results by recency (e.g., 'd7' for last 7 days)"
                        }
                    },
                    "required": ["query"]
                }
            }
        }