# haystack/components/websearch/searchapi.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from typing import Any
  6  
  7  import httpx
  8  
  9  from haystack import ComponentError, Document, component, default_from_dict, default_to_dict, logging
 10  from haystack.utils import Secret
 11  
# Module-level logger, namespaced by this module's import path.
logger = logging.getLogger(__name__)


# Single endpoint for SearchApi's unified search API; the engine (Google by
# default) is selected via the request's query parameters, not the URL.
SEARCHAPI_BASE_URL = "https://www.searchapi.io/api/v1/search"
 16  
 17  
 18  class SearchApiError(ComponentError): ...
 19  
 20  
 21  @component
 22  class SearchApiWebSearch:
 23      """
 24      Uses [SearchApi](https://www.searchapi.io/) to search the web for relevant documents.
 25  
 26      Usage example:
 27      <!-- test-ignore -->
 28      ```python
 29      from haystack.components.websearch import SearchApiWebSearch
 30      from haystack.utils import Secret
 31  
 32      websearch = SearchApiWebSearch(top_k=10, api_key=Secret.from_env_var("SERPERDEV_API_KEY"))
 33      results = websearch.run(query="Who is the boyfriend of Olivia Wilde?")
 34  
 35      assert results["documents"]
 36      assert results["links"]
 37      ```
 38      """
 39  
 40      def __init__(
 41          self,
 42          api_key: Secret = Secret.from_env_var("SEARCHAPI_API_KEY"),
 43          top_k: int | None = 10,
 44          allowed_domains: list[str] | None = None,
 45          search_params: dict[str, Any] | None = None,
 46      ) -> None:
 47          """
 48          Initialize the SearchApiWebSearch component.
 49  
 50          :param api_key: API key for the SearchApi API
 51          :param top_k: Number of documents to return.
 52          :param allowed_domains: List of domains to limit the search to.
 53          :param search_params: Additional parameters passed to the SearchApi API.
 54              For example, you can set 'num' to 100 to increase the number of search results.
 55              See the [SearchApi website](https://www.searchapi.io/) for more details.
 56  
 57              The default search engine is Google, however, users can change it by setting the `engine`
 58              parameter in the `search_params`.
 59          """
 60  
 61          self.api_key = api_key
 62          self.top_k = top_k
 63          self.allowed_domains = allowed_domains
 64          self.search_params = search_params or {}
 65          if "engine" not in self.search_params:
 66              self.search_params["engine"] = "google"
 67  
 68          # Ensure that the API key is resolved.
 69          _ = self.api_key.resolve_value()
 70  
 71      def to_dict(self) -> dict[str, Any]:
 72          """
 73          Serializes the component to a dictionary.
 74  
 75          :returns:
 76                Dictionary with serialized data.
 77          """
 78          return default_to_dict(
 79              self,
 80              top_k=self.top_k,
 81              allowed_domains=self.allowed_domains,
 82              search_params=self.search_params,
 83              api_key=self.api_key,
 84          )
 85  
 86      @classmethod
 87      def from_dict(cls, data: dict[str, Any]) -> "SearchApiWebSearch":
 88          """
 89          Deserializes the component from a dictionary.
 90  
 91          :param data:
 92              The dictionary to deserialize from.
 93          :returns:
 94              The deserialized component.
 95          """
 96          return default_from_dict(cls, data)
 97  
 98      @component.output_types(documents=list[Document], links=list[str])
 99      def run(self, query: str) -> dict[str, list[Document] | list[str]]:
100          """
101          Uses [SearchApi](https://www.searchapi.io/) to search the web.
102  
103          :param query: Search query.
104          :returns: A dictionary with the following keys:
105              - "documents": List of documents returned by the search engine.
106              - "links": List of links returned by the search engine.
107          :raises TimeoutError: If the request to the SearchApi API times out.
108          :raises SearchApiError: If an error occurs while querying the SearchApi API.
109          """
110          payload, headers = self._prepare_request(query)
111          try:
112              response = httpx.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
113              response.raise_for_status()  # Will raise an HTTPError for bad responses
114          except httpx.ConnectTimeout as error:
115              raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
116  
117          except httpx.HTTPStatusError as e:
118              raise SearchApiError(
119                  f"An error occurred while querying {self.__class__.__name__}. Error: {e}, Response: {e.response.text}"
120              ) from e
121  
122          except httpx.HTTPError as e:
123              raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e
124  
125          documents, links = self._parse_response(response)
126  
127          logger.debug(
128              "SearchApi returned {number_documents} documents for the query '{query}'",
129              number_documents=len(documents),
130              query=query,
131          )
132          return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
133  
134      @component.output_types(documents=list[Document], links=list[str])
135      async def run_async(self, query: str) -> dict[str, list[Document] | list[str]]:
136          """
137          Asynchronously uses [SearchApi](https://www.searchapi.io/) to search the web.
138  
139          This is the asynchronous version of the `run` method with the same parameters and return values.
140  
141  
142          :param query: Search query.
143          :returns: A dictionary with the following keys:
144              - "documents": List of documents returned by the search engine.
145              - "links": List of links returned by the search engine.
146          :raises TimeoutError: If the request to the SearchApi API times out.
147          :raises SearchApiError: If an error occurs while querying the SearchApi API.
148          """
149          payload, headers = self._prepare_request(query)
150          try:
151              async with httpx.AsyncClient() as client:
152                  response = await client.get(SEARCHAPI_BASE_URL, headers=headers, params=payload, timeout=90)
153                  response.raise_for_status()  # Will raise an HTTPError for bad responses
154          except httpx.ConnectTimeout as error:
155              raise TimeoutError(f"Request to {self.__class__.__name__} timed out.") from error
156  
157          except httpx.HTTPStatusError as e:
158              raise SearchApiError(
159                  f"An error occurred while querying {self.__class__.__name__}. Error: {e}, Response: {e.response.text}"
160              ) from e
161  
162          except httpx.HTTPError as e:
163              raise SearchApiError(f"An error occurred while querying {self.__class__.__name__}. Error: {e}") from e
164  
165          documents, links = self._parse_response(response)
166  
167          logger.debug(
168              "SearchApi returned {number_documents} documents for the query '{query}'",
169              number_documents=len(documents),
170              query=query,
171          )
172          return {"documents": documents[: self.top_k], "links": links[: self.top_k]}
173  
174      def _prepare_request(self, query: str) -> tuple[httpx._types.QueryParamTypes, httpx._types.HeaderTypes]:
175          query_prepend = "OR ".join(f"site:{domain} " for domain in self.allowed_domains) if self.allowed_domains else ""
176          payload = {"q": query_prepend + " " + query, **self.search_params}
177          headers = {"Authorization": f"Bearer {self.api_key.resolve_value()}", "X-SearchApi-Source": "Haystack"}
178          return payload, headers
179  
180      @staticmethod
181      def _parse_response(response: httpx.Response) -> tuple[list[Document], list[str]]:
182          # Request succeeded
183          json_result = response.json()
184  
185          # organic results are the main results from the search engine
186          organic_results = []
187          if "organic_results" in json_result:
188              for result in json_result["organic_results"]:
189                  organic_results.append(
190                      Document.from_dict({"title": result["title"], "content": result["snippet"], "link": result["link"]})
191                  )
192  
193          # answer box has a direct answer to the query
194          answer_box = []
195          if "answer_box" in json_result:
196              answer_box = [
197                  Document.from_dict(
198                      {
199                          "title": json_result["answer_box"].get("title", ""),
200                          "content": json_result["answer_box"].get("answer", ""),
201                          "link": json_result["answer_box"].get("link", ""),
202                      }
203                  )
204              ]
205  
206          knowledge_graph = []
207          if "knowledge_graph" in json_result:
208              knowledge_graph = [
209                  Document.from_dict(
210                      {
211                          "title": json_result["knowledge_graph"].get("title", ""),
212                          "content": json_result["knowledge_graph"].get("description", ""),
213                      }
214                  )
215              ]
216  
217          related_questions = []
218          if "related_questions" in json_result:
219              for result in json_result["related_questions"]:
220                  related_questions.append(
221                      Document.from_dict(
222                          {
223                              "title": result["question"],
224                              "content": result["answer"] if result.get("answer") else result.get("answer_highlight", ""),
225                              "link": result.get("source", {}).get("link", ""),
226                          }
227                      )
228                  )
229  
230          documents = answer_box + knowledge_graph + organic_results + related_questions
231  
232          links = [result["link"] for result in json_result["organic_results"]]
233          return documents, links