/ haystack / tools / searchable_toolset.py
searchable_toolset.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from collections.abc import Iterator
  6  from typing import TYPE_CHECKING, Annotated, Any
  7  
  8  from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
  9  from haystack.dataclasses import Document
 10  from haystack.document_stores.in_memory import InMemoryDocumentStore
 11  from haystack.document_stores.types import DuplicatePolicy
 12  from haystack.tools.from_function import create_tool_from_function
 13  from haystack.tools.tool import Tool
 14  from haystack.tools.toolset import Toolset
 15  from haystack.tools.utils import flatten_tools_or_toolsets, warm_up_tools
 16  
 17  if TYPE_CHECKING:
 18      from haystack.tools import ToolsType
 19  
 20  
 21  class SearchableToolset(Toolset):
 22      """
 23      Dynamic tool discovery from large catalogs using BM25 search.
 24  
 25      This Toolset enables LLMs to discover and use tools from large catalogs through
 26      BM25-based search. Instead of exposing all tools at once (which can overwhelm the
 27      LLM context), it provides a `search_tools` bootstrap tool that allows the LLM to
 28      find and load specific tools as needed.
 29  
 30      For very small catalogs (below `search_threshold`), acts as a simple passthrough
 31      exposing all tools directly without any discovery mechanism.
 32  
 33      ### Usage Example
 34  
 35      ```python
 36      from haystack.components.agents import Agent
 37      from haystack.components.generators.chat import OpenAIChatGenerator
 38      from haystack.dataclasses import ChatMessage
 39      from haystack.tools import Tool, SearchableToolset
 40  
 41      # Create a catalog of tools
 42      catalog = [
 43          Tool(name="get_weather", description="Get weather for a city",
 44               parameters={}, function=lambda: None),
 45          Tool(name="search_web", description="Search the web",
 46               parameters={}, function=lambda: None),
 47          # ... 100s more tools
 48      ]
 49      toolset = SearchableToolset(catalog=catalog)
 50  
 51      agent = Agent(chat_generator=OpenAIChatGenerator(), tools=toolset)
 52  
 53      # The agent is initially provided only with the search_tools tool and will use it to find relevant tools.
 54      result = agent.run(messages=[ChatMessage.from_user("What's the weather in Milan?")])
 55      ```
 56      """
 57  
 58      _VALID_SEARCH_TOOL_PARAMS = {"tool_keywords", "k"}
 59  
 60      def __init__(
 61          self,
 62          catalog: "ToolsType",
 63          *,
 64          top_k: int = 3,
 65          search_threshold: int = 8,
 66          search_tool_name: str = "search_tools",
 67          search_tool_description: str | None = None,
 68          search_tool_parameters_description: dict[str, str] | None = None,
 69      ) -> None:
 70          """
 71          Initialize the SearchableToolset.
 72  
 73          :param catalog: Source of tools - a list of Tools, list of Toolsets, or a single Toolset.
 74          :param top_k: Default number of results for search_tools.
 75          :param search_threshold: Minimum catalog size to activate search.
 76              If catalog has fewer tools, acts as passthrough (all tools visible).
 77              Default is 8.
 78          :param search_tool_name: Custom name for the bootstrap search tool. Default is "search_tools".
 79          :param search_tool_description: Custom description for the bootstrap search tool.
 80              If not provided, uses a default description.
 81          :param search_tool_parameters_description: Custom descriptions for the bootstrap search tool's parameters.
 82              Keys must be a subset of `{"tool_keywords", "k"}`.
 83              Example: `{"tool_keywords": "Keywords to find tools, e.g. 'email send'"}`
 84          """
 85          valid_catalog = isinstance(catalog, Toolset) or (
 86              isinstance(catalog, list) and all(isinstance(item, (Tool, Toolset)) for item in catalog)
 87          )
 88          if not valid_catalog:
 89              raise TypeError(
 90                  f"Invalid catalog type: {type(catalog)}. Expected Tool, Toolset, or list of Tools and/or Toolsets."
 91              )
 92  
 93          if search_tool_parameters_description is not None:
 94              invalid_keys = set(search_tool_parameters_description.keys()) - self._VALID_SEARCH_TOOL_PARAMS
 95              if invalid_keys:
 96                  raise ValueError(
 97                      f"Invalid search_tool_parameters_description keys: {invalid_keys}. "
 98                      f"Valid keys are: {self._VALID_SEARCH_TOOL_PARAMS}"
 99                  )
100  
101          # Store raw catalog; flattening is deferred to warm_up() so that lazy
102          # toolsets (e.g. MCPToolset with eager_connect=False) can connect first.
103          self._raw_catalog: "ToolsType" = catalog
104          self._catalog: list[Tool] = []
105  
106          self._top_k = top_k
107          self._search_threshold = search_threshold
108          self._search_tool_name = search_tool_name
109          self._search_tool_description = search_tool_description
110          self._search_tool_parameters_description = search_tool_parameters_description
111  
112          # Runtime state (initialized in warm_up)
113          self._discovered_tools: dict[str, Tool] = {}
114          self._bootstrap_tool: Tool | None = None
115          self._document_store: InMemoryDocumentStore | None = None
116          self._warmed_up = False
117  
118          # Initialize parent with empty tools list - we manage tools dynamically
119          super().__init__(tools=[])
120  
121      def __add__(self, other: Tool | Toolset | list[Tool]) -> "Toolset":
122          """Concatenation is not supported for SearchableToolset."""
123          raise NotImplementedError("SearchableToolset does not support concatenation.")
124  
125      def add(self, tool: Tool | Toolset) -> None:
126          """Adding new tools after initialization is not supported for SearchableToolset."""
127          raise NotImplementedError("SearchableToolset does not support adding new tools after initialization.")
128  
129      def _is_passthrough(self) -> bool:
130          """
131          Internal method to check if operating in passthrough mode (small catalog). Must be called after warm_up().
132          """
133          return len(self._catalog) < self._search_threshold
134  
135      def warm_up(self) -> None:
136          """
137          Prepare the toolset for use.
138  
139          Warms up child toolsets first (so lazy toolsets like MCPToolset can connect),
140          then flattens the catalog, indexes it, and creates the search_tools bootstrap tool.
141          In passthrough mode, it warms up all catalog tools directly.
142          Must be called before using the toolset with an Agent.
143          """
144          if self._warmed_up:
145              return
146  
147          # Warm up child toolsets first (triggers lazy connections like MCPToolset)
148          warm_up_tools(self._raw_catalog)
149          # Now flatten — lazy toolsets will have their real tools available
150          self._catalog = flatten_tools_or_toolsets(self._raw_catalog)
151  
152          if self._is_passthrough():
153              for tool in self._catalog:
154                  tool.warm_up()
155          else:
156              self._document_store = InMemoryDocumentStore()
157              self._tool_by_name = {tool.name: tool for tool in self._catalog}
158              documents = [
159                  Document(content=f"{tool.name} {tool.description}", meta={"tool_name": tool.name})
160                  for tool in self._catalog
161              ]
162              self._document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE)
163              self._bootstrap_tool = self._create_search_tool()
164  
165          self._warmed_up = True
166  
167      def clear(self) -> None:
168          """
169          Clear all discovered tools.
170  
171          This method allows resetting the toolset's discovered tools between agent runs
172          when the same toolset instance is reused. This can be useful for long-running
173          applications to control memory usage or to start fresh searches.
174          """
175          self._discovered_tools.clear()
176  
177      def _create_search_tool(self) -> Tool:
178          """Create the search_tools bootstrap tool."""
179  
180          tool_by_name = {tool.name: tool for tool in self._catalog}
181  
182          def search_tools(
183              tool_keywords: Annotated[
184                  str,
185                  "Space-separated words from tool names/descriptions (e.g. 'route weather search')."
186                  " NOT the user's question or task—use vocabulary from the tools you need.",
187              ],
188              k: Annotated[int | None, f"Number of results to return (default: {self._top_k})"] = None,
189          ) -> str:
190              """
191              ALWAYS use this tool FIRST when you need to invoke some tools but don't have the right one loaded yet.
192  
193              Provide space separated tool keywords likely to appear in tool names/descriptions
194              (e.g. 'route distance weather', 'search email'). Do NOT pass the user's request or task (e.g.
195              'things to do in X', 'user question'); matching is keyword-based. Returns loaded
196              tool names; they become available immediately.
197              """
198              num_results = k if k is not None else self._top_k
199  
200              if not tool_keywords.strip():
201                  return (
202                      "No tool keywords provided. Please provide space-separated words likely to appear in tool "
203                      "names/descriptions (e.g. 'route weather search')."
204                  )
205  
206              # at this point, the toolset has been warmed up, so self._document_store is not None
207              results = self._document_store.bm25_retrieval(query=tool_keywords, top_k=num_results)  # type: ignore[union-attr]
208  
209              if not results:
210                  return "No tools found matching these keywords. Try different keywords."
211  
212              # Add found tools to _discovered_tools. These become available to the LLM
213              # on the next agent iteration when __iter__ is called again - the Agent
214              # re-iterates over the toolset each loop, picking up newly discovered tools.
215              # The return message here just confirms what was found; actual tool availability
216              # comes through the dynamic iteration mechanism. This way we also save tokens
217              # by not returning the full tool definitions.
218              tool_names = []
219              for doc in results:
220                  tool = tool_by_name[doc.meta["tool_name"]]
221                  tool.warm_up()
222                  self._discovered_tools[tool.name] = tool
223                  tool_names.append(tool.name)
224  
225              return f"Found and loaded {len(tool_names)} tool(s): {', '.join(tool_names)}. Use them directly as tools."
226  
227          bootstrap_tool = create_tool_from_function(
228              function=search_tools, name=self._search_tool_name, description=self._search_tool_description
229          )
230  
231          # Override parameter descriptions if custom ones were provided
232          if self._search_tool_parameters_description:
233              for param_name, desc in self._search_tool_parameters_description.items():
234                  if param_name in bootstrap_tool.parameters.get("properties", {}):
235                      bootstrap_tool.parameters["properties"][param_name]["description"] = desc
236  
237          return bootstrap_tool
238  
239      def __iter__(self) -> Iterator[Tool]:
240          """
241          Iterate over available tools.
242  
243          In passthrough mode, yields all catalog tools.
244          Otherwise, yields bootstrap tool + discovered tools.
245          Automatically calls warm_up() if needed to ensure bootstrap tool is available.
246          """
247          if not self._warmed_up:
248              self.warm_up()
249          if self._is_passthrough():
250              yield from self._catalog
251          else:
252              if self._bootstrap_tool is not None:
253                  yield self._bootstrap_tool
254              yield from self._discovered_tools.values()
255  
256      def __len__(self) -> int:
257          """Return the number of currently available tools."""
258          # the number of tools is computed by invoking __iter__ on the toolset
259          return sum(1 for _ in self)
260  
261      def __contains__(self, item: str | Tool) -> bool:
262          """
263          Check if a tool is available by Tool instance or tool name string.
264  
265          :param item: Tool instance or tool name string.
266          :returns: True if the tool is available, False otherwise.
267          """
268          if isinstance(item, str):
269              return any(tool.name == item for tool in self)
270          if isinstance(item, Tool):
271              return any(tool == item for tool in self)
272          raise TypeError(f"Invalid item type: {type(item)}. Must be Tool or str.")
273  
274      def __getitem__(self, index: int) -> Tool:
275          """
276          Get a tool by index.
277  
278          :param index: Index of the tool to retrieve.
279          :returns: The tool at the given index.
280          :raises IndexError: If the index is out of range.
281          """
282          return list(self)[index]
283  
284      def to_dict(self) -> dict[str, Any]:
285          """
286          Serialize the toolset to a dictionary.
287  
288          :returns: Dictionary representation of the toolset.
289          """
290          catalog_items: list[Tool | Toolset] = (
291              [self._raw_catalog] if isinstance(self._raw_catalog, Toolset) else list(self._raw_catalog)
292          )
293  
294          data: dict[str, Any] = {
295              "catalog": [item.to_dict() for item in catalog_items],
296              "top_k": self._top_k,
297              "search_threshold": self._search_threshold,
298              "search_tool_name": self._search_tool_name,
299              "search_tool_description": self._search_tool_description,
300              "search_tool_parameters_description": self._search_tool_parameters_description,
301          }
302  
303          return {"type": generate_qualified_class_name(type(self)), "data": data}
304  
305      @classmethod
306      def from_dict(cls, data: dict[str, Any]) -> "SearchableToolset":
307          """
308          Deserialize a toolset from a dictionary.
309  
310          :param data: Dictionary representation of the toolset.
311          :returns: New SearchableToolset instance.
312          """
313          inner_data = data["data"]
314  
315          # Deserialize catalog items (may be Tool or Toolset instances)
316          catalog_data = inner_data.get("catalog", [])
317          catalog: list[Tool | Toolset] = []
318          for item_data in catalog_data:
319              item_class = import_class_by_name(item_data["type"])
320              if not issubclass(item_class, (Tool, Toolset)):
321                  raise TypeError(f"Class '{item_class}' is not a subclass of Tool or Toolset")
322              catalog.append(item_class.from_dict(item_data))
323  
324          optional_keys = (
325              "top_k",
326              "search_threshold",
327              "search_tool_name",
328              "search_tool_description",
329              "search_tool_parameters_description",
330          )
331          return cls(catalog=catalog, **{k: inner_data[k] for k in optional_keys if k in inner_data})