# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any

import httpx

from haystack import ComponentError, Document, component, default_from_dict, default_to_dict, logging
from haystack.utils import Secret

logger = logging.getLogger(__name__)


SEARCHAPI_BASE_URL = "https://www.searchapi.io/api/v1/search"


class SearchApiError(ComponentError): ...


@component
class SearchApiWebSearch:
    """
    Uses [SearchApi](https://www.searchapi.io/) to search the web for relevant documents.

    Usage example:
    <!-- test-ignore -->
    ```python
    from haystack.components.websearch import SearchApiWebSearch
    from haystack.utils import Secret

    websearch = SearchApiWebSearch(top_k=10, api_key=Secret.from_env_var("SEARCHAPI_API_KEY"))
    results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")

    assert results["documents"]
    assert results["links"]
    ```
    """

    def __init__(
        self,
        api_key: Secret = Secret.from_env_var("SEARCHAPI_API_KEY"),
        top_k: int | None = 10,
        allowed_domains: list[str] | None = None,
        search_params: dict[str, Any] | None = None,
    ) -> None:
        """
        Initialize the SearchApiWebSearch component.

        :param api_key: API key for the SearchApi API
        :param top_k: Number of documents to return.
        :param allowed_domains: List of domains to limit the search to.
        :param search_params: Additional parameters passed to the SearchApi API.
            For example, you can set 'num' to 100 to increase the number of search results.
            See the [SearchApi website](https://www.searchapi.io/) for more details.

            The default search engine is Google, however, users can change it by setting the `engine`
            parameter in the `search_params`.
        """

        self.api_key = api_key
        self.top_k = top_k
        self.allowed_domains = allowed_domains
        self.search_params = search_params or {}
        # Default to Google unless the caller explicitly selected another engine.
        if "engine" not in self.search_params:
            self.search_params["engine"] = "google"

        # Fail fast at construction time if the API key cannot be resolved
        # (e.g. the environment variable is not set).
        _ = self.api_key.resolve_value()

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the component to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        # NOTE(review): api_key is passed as a raw Secret object — confirm that
        # default_to_dict serializes Secrets (other components call api_key.to_dict()).
        return default_to_dict(
            self,
            top_k=self.top_k,
            allowed_domains=self.allowed_domains,
            search_params=self.search_params,
            api_key=self.api_key,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SearchApiWebSearch":
        """
        Deserializes the component from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized component.
        """
        return default_from_dict(cls, data)

    @component.output_types(documents=list[Document], links=list[str])
    def run(self, query: str) -> dict[str, list[Document] | list[str]]:
        """
        Uses [SearchApi](https://www.searchapi.io/) to search the web.

        :param query: Search query.
        :returns: A dictionary with the following keys:
            - "documents": List of documents returned by the search engine.
            - "links": List of links returned by the search engine.
        :raises TimeoutError: If the request to the SearchApi API times out.
        :raises SearchApiError: If an error occurs while querying the SearchApi API.
        """
        payload, headers = self._prepare_request(query)
        try:
            response = httpx.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
            response.raise_for_status()  # Will raise an HTTPStatusError for bad responses
        # httpx.TimeoutException is the base of all httpx timeout errors (connect,
        # read, write, pool). Catching only ConnectTimeout would misreport read
        # timeouts as SearchApiError, contradicting the documented contract above.
        except httpx.TimeoutException as error:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

        except httpx.HTTPStatusError as e:
            raise SearchApiError(
                f"An error occurred while querying {self.__class__.__name__}. Error: {e}, Response: {e.response.text}"
            ) from e

        except httpx.HTTPError as e:
            raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        documents, links = self._parse_response(response)

        logger.debug(
            "SearchApi returned {number_documents} documents for the query '{query}'",
            number_documents=len(documents),
            query=query,
        )
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}

    @component.output_types(documents=list[Document], links=list[str])
    async def run_async(self, query: str) -> dict[str, list[Document] | list[str]]:
        """
        Asynchronously uses [SearchApi](https://www.searchapi.io/) to search the web.

        This is the asynchronous version of the `run` method with the same parameters and return values.

        :param query: Search query.
        :returns: A dictionary with the following keys:
            - "documents": List of documents returned by the search engine.
            - "links": List of links returned by the search engine.
        :raises TimeoutError: If the request to the SearchApi API times out.
        :raises SearchApiError: If an error occurs while querying the SearchApi API.
        """
        payload, headers = self._prepare_request(query)
        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
                response.raise_for_status()  # Will raise an HTTPStatusError for bad responses
        # See run(): TimeoutException covers connect/read/write/pool timeouts.
        except httpx.TimeoutException as error:
            raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error

        except httpx.HTTPStatusError as e:
            raise SearchApiError(
                f"An error occurred while querying {self.__class__.__name__}. Error: {e}, Response: {e.response.text}"
            ) from e

        except httpx.HTTPError as e:
            raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e

        documents, links = self._parse_response(response)

        logger.debug(
            "SearchApi returned {number_documents} documents for the query '{query}'",
            number_documents=len(documents),
            query=query,
        )
        return {"documents": documents[: self.top_k], "links": links[: self.top_k]}

    def _prepare_request(self, query: str) -> tuple[dict[str, Any], dict[str, str]]:
        """Build the query parameters and authorization headers for a SearchApi call."""
        # Restrict results to the allowed domains (if any) via the `site:` search operator.
        query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""
        payload = {"q": query_prepend + " " + query, **self.search_params}
        headers = {"Authorization": f"Bearer {self.api_key.resolve_value()}", "X-SearchApi-Source": "Haystack"}
        return payload, headers

    @staticmethod
    def _parse_response(response: httpx.Response) -> tuple[list[Document], list[str]]:
        """Convert a successful SearchApi JSON response into (documents, links)."""
        # Request succeeded
        json_result = response.json()

        # organic results are the main results from the search engine
        organic_results = []
        if "organic_results" in json_result:
            for result in json_result["organic_results"]:
                organic_results.append(
                    Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
                )

        # answer box has a direct answer to the query
        answer_box = []
        if "answer_box" in json_result:
            answer_box = [
                Document.from_dict(
                    {
                        "title": json_result["answer_box"].get("title", ""),
                        "content": json_result["answer_box"].get("answer", ""),
                        "link": json_result["answer_box"].get("link", ""),
                    }
                )
            ]

        knowledge_graph = []
        if "knowledge_graph" in json_result:
            knowledge_graph = [
                Document.from_dict(
                    {
                        "title": json_result["knowledge_graph"].get("title", ""),
                        "content": json_result["knowledge_graph"].get("description", ""),
                    }
                )
            ]

        related_questions = []
        if "related_questions" in json_result:
            for result in json_result["related_questions"]:
                related_questions.append(
                    Document.from_dict(
                        {
                            "title": result["question"],
                            "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
                            "link": result.get("source", {}).get("link", ""),
                        }
                    )
                )

        documents = answer_box + knowledge_graph + organic_results + related_questions

        # Guard the lookup: a response may legitimately contain no organic results
        # (e.g. answer-box-only), and the loop above already treats the key as optional.
        links = [result["link"] for result in json_result.get("organic_results", [])]
        return documents, links