google_search.py
"""Google Custom Search API implementation."""

import html
import httpx
import logging
import re
from typing import Literal, Optional
from urllib.parse import quote
from .base import WebSearchTool
from .models import ImageResult, SearchResult, SearchSource

logger = logging.getLogger(__name__)

# Default number of search results to return
DEFAULT_MAX_RESULTS = 5

# Strips HTML tags from `htmlSnippet` values; compiled once rather than
# per result item.
_TAG_RE = re.compile(r'<[^>]+>')


class GoogleSearchTool(WebSearchTool):
    """Google Custom Search API implementation."""

    def __init__(self, api_key: str, search_engine_id: str, **kwargs):
        """Initialize Google Custom Search tool.

        Args:
            api_key: Google API key
            search_engine_id: Google Custom Search Engine ID (CSE ID)
            **kwargs: Additional configuration

        Raises:
            ValueError: If the API key or search engine ID is empty.
        """
        super().__init__(api_key=api_key, **kwargs)
        self.search_engine_id = search_engine_id
        self.base_url = "https://www.googleapis.com/customsearch/v1"

        if not self.api_key:
            raise ValueError("Google API key is required")
        if not self.search_engine_id:
            raise ValueError("Google Custom Search Engine ID is required")

    async def search(
        self,
        query: str,
        max_results: int = DEFAULT_MAX_RESULTS,
        search_depth: Literal["basic", "advanced"] = "basic",
        search_type: Optional[Literal["image"]] = None,
        date_restrict: Optional[str] = None,
        safe_search: Optional[Literal["off", "medium", "high"]] = None,
        **kwargs
    ) -> SearchResult:
        """Execute Google Custom Search.

        Args:
            query: Search query string
            max_results: Maximum number of results (1-10)
            search_depth: Not used for Google (kept for interface compatibility)
            search_type: Set to "image" for image search
            date_restrict: Restricts results based on recency (e.g., "d[number]" for days)
            safe_search: Safe search level
            **kwargs: Additional Google CSE parameters

        Returns:
            SearchResult object; on any failure a SearchResult with
            success=False and an error message (this method never raises).
        """
        try:
            # Ensure max_results is an integer (LLM may pass string)
            try:
                max_results = int(max_results)
            except (TypeError, ValueError):
                max_results = DEFAULT_MAX_RESULTS

            # Google CSE allows max 10 results per request
            num_results = min(max(max_results, 1), 10)

            params = self._build_params(
                query, num_results, search_type, date_restrict, safe_search, kwargs
            )

            logger.info(f"Executing Google search: query='{query}', num={num_results}")

            async with httpx.AsyncClient() as client:
                response = await client.get(
                    self.base_url,
                    params=params,
                    timeout=30.0
                )

            if response.status_code != 200:
                error_msg = f"Google API error: {response.status_code}"
                try:
                    error_data = response.json()
                    error_msg += f" - {error_data.get('error', {}).get('message', '')}"
                except Exception:
                    # Error body was not JSON; fall back to the raw text.
                    error_msg += f" - {response.text}"

                logger.error(error_msg)
                return SearchResult(
                    success=False,
                    query=query,
                    error=error_msg
                )

            data = response.json()
            items = data.get("items", [])

            # Transform results to our format. Image and web searches
            # populate disjoint result lists.
            organic = []
            images = []
            if search_type == "image":
                images = self._parse_image_results(items)
            else:
                organic = self._parse_organic_results(items)

            logger.info(f"Google search successful: {len(organic)} results")

            return SearchResult(
                success=True,
                query=query,
                organic=organic,
                images=images,
                metadata={
                    "search_engine": "google",
                    "total_results": data.get("searchInformation", {}).get("totalResults"),
                    "search_time": data.get("searchInformation", {}).get("searchTime"),
                }
            )

        except httpx.TimeoutException:
            error_msg = "Google search timed out"
            logger.error(error_msg)
            return SearchResult(
                success=False,
                query=query,
                error=error_msg
            )
        except Exception as e:
            error_msg = f"Google search failed: {str(e)}"
            logger.error(error_msg, exc_info=True)
            return SearchResult(
                success=False,
                query=query,
                error=error_msg
            )

    def _build_params(
        self,
        query: str,
        num_results: int,
        search_type: Optional[str],
        date_restrict: Optional[str],
        safe_search: Optional[str],
        extra: dict,
    ) -> dict:
        """Assemble the query-string parameters for the CSE endpoint."""
        params = {
            "key": self.api_key,
            "cx": self.search_engine_id,
            "q": query,
            "num": num_results,
        }
        # Add optional parameters only when supplied.
        if search_type:
            params["searchType"] = search_type
        if date_restrict:
            params["dateRestrict"] = date_restrict
        if safe_search:
            params["safe"] = safe_search
        # Caller-provided extras override/extend the defaults.
        params.update(extra)
        return params

    @staticmethod
    def _parse_image_results(items: list) -> list:
        """Convert raw CSE image items to ImageResult objects.

        Malformed items are logged and skipped rather than failing the
        whole search.
        """
        images = []
        for item in items:
            try:
                images.append(ImageResult(
                    imageUrl=item["link"],
                    title=item.get("title", ""),
                    link=item.get("image", {}).get("contextLink", item["link"])
                ))
            except Exception as e:
                logger.warning(f"Failed to parse image result: {e}")
                continue
        return images

    @staticmethod
    def _parse_organic_results(items: list) -> list:
        """Convert raw CSE web items to SearchSource objects.

        Malformed items are logged and skipped rather than failing the
        whole search.
        """
        organic = []
        for item in items:
            try:
                # Extract snippet (description)
                snippet = item.get("snippet", "")

                # Prefer the HTML snippet: unescape entities, then strip tags.
                if "htmlSnippet" in item:
                    snippet = _TAG_RE.sub('', html.unescape(item["htmlSnippet"]))

                # `cse_thumbnail` may be present but empty; guard the [0]
                # access so such items are not dropped with an IndexError.
                thumbnails = item.get("pagemap", {}).get("cse_thumbnail") or [{}]

                organic.append(SearchSource(
                    link=item["link"],
                    title=item["title"],
                    snippet=snippet,
                    attribution=GoogleSearchTool._extract_domain(item["link"]),
                    imageUrl=thumbnails[0].get("src")
                ))
            except Exception as e:
                logger.warning(f"Failed to parse search result: {e}")
                continue
        return organic

    @staticmethod
    def _extract_domain(url: str) -> str:
        """Extract clean domain from URL.

        Args:
            url: Full URL

        Returns:
            The host with scheme, path, and a leading "www." removed;
            the original string unchanged if extraction fails.
        """
        try:
            # Remove protocol
            domain = url.replace("https://", "").replace("http://", "")
            # Get first part (domain)
            domain = domain.split("/")[0]
            # Remove www.
            if domain.startswith("www."):
                domain = domain[4:]
            return domain
        except Exception:
            # Was a bare `except:`; narrowed so SystemExit etc. propagate.
            return url

    def get_tool_definition(self) -> dict:
        """Get the tool definition for LLM function calling.

        Returns:
            Dictionary containing the tool definition
        """
        return {
            "type": "function",
            "function": {
                "name": "web_search",
                "description": (
                    "Search the web using Google Custom Search. Use this when you need "
                    "up-to-date information, facts, news, or data that may not be in "
                    "your training data. Always cite sources using the citation format."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": "The search query"
                        },
                        "max_results": {
                            "type": "integer",
                            "description": "Maximum number of results (1-10)",
                            "minimum": 1,
                            "maximum": 10,
                            "default": DEFAULT_MAX_RESULTS
                        },
                        "search_type": {
                            "type": "string",
                            "enum": ["image"],
                            "description": "Set to 'image' for image search"
                        },
                        "date_restrict": {
                            "type": "string",
                            "description": "Restrict results by recency (e.g., 'd7' for last 7 days)"
                        }
                    },
                    "required": ["query"]
                }
            }
        }