searchable_toolset.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from collections.abc import Iterator
from typing import TYPE_CHECKING, Annotated, Any

from haystack.core.serialization import generate_qualified_class_name, import_class_by_name
from haystack.dataclasses import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.tools.from_function import create_tool_from_function
from haystack.tools.tool import Tool
from haystack.tools.toolset import Toolset
from haystack.tools.utils import flatten_tools_or_toolsets, warm_up_tools

if TYPE_CHECKING:
    from haystack.tools import ToolsType


class SearchableToolset(Toolset):
    """
    Dynamic tool discovery from large catalogs using BM25 search.

    This Toolset enables LLMs to discover and use tools from large catalogs through
    BM25-based search. Instead of exposing all tools at once (which can overwhelm the
    LLM context), it provides a `search_tools` bootstrap tool that allows the LLM to
    find and load specific tools as needed.

    For very small catalogs (below `search_threshold`), acts as a simple passthrough
    exposing all tools directly without any discovery mechanism.

    ### Usage Example

    ```python
    from haystack.components.agents import Agent
    from haystack.components.generators.chat import OpenAIChatGenerator
    from haystack.dataclasses import ChatMessage
    from haystack.tools import Tool, SearchableToolset

    # Create a catalog of tools
    catalog = [
        Tool(name="get_weather", description="Get weather for a city",
             parameters={}, function=lambda: None),
        Tool(name="search_web", description="Search the web",
             parameters={}, function=lambda: None),
        # ... 100s more tools
    ]
    toolset = SearchableToolset(catalog=catalog)

    agent = Agent(chat_generator=OpenAIChatGenerator(), tools=toolset)

    # The agent is initially provided only with the search_tools tool and will use it to find relevant tools.
    result = agent.run(messages=[ChatMessage.from_user("What's the weather in Milan?")])
    ```
    """

    # Parameter names of the bootstrap search tool whose descriptions may be customized.
    _VALID_SEARCH_TOOL_PARAMS = {"tool_keywords", "k"}

    def __init__(
        self,
        catalog: "ToolsType",
        *,
        top_k: int = 3,
        search_threshold: int = 8,
        search_tool_name: str = "search_tools",
        search_tool_description: str | None = None,
        search_tool_parameters_description: dict[str, str] | None = None,
    ) -> None:
        """
        Initialize the SearchableToolset.

        :param catalog: Source of tools - a list of Tools, list of Toolsets, or a single Toolset.
        :param top_k: Default number of results for search_tools.
        :param search_threshold: Minimum catalog size to activate search.
            If catalog has fewer tools, acts as passthrough (all tools visible).
            Default is 8.
        :param search_tool_name: Custom name for the bootstrap search tool. Default is "search_tools".
        :param search_tool_description: Custom description for the bootstrap search tool.
            If not provided, uses a default description.
        :param search_tool_parameters_description: Custom descriptions for the bootstrap search tool's parameters.
            Keys must be a subset of `{"tool_keywords", "k"}`.
            Example: `{"tool_keywords": "Keywords to find tools, e.g. 'email send'"}`
        :raises TypeError: If `catalog` is neither a Toolset nor a list containing only Tools and/or Toolsets.
        :raises ValueError: If `search_tool_parameters_description` contains keys other than
            `"tool_keywords"` and `"k"`.
        """
        valid_catalog = isinstance(catalog, Toolset) or (
            isinstance(catalog, list) and all(isinstance(item, (Tool, Toolset)) for item in catalog)
        )
        if not valid_catalog:
            # A bare Tool is rejected by the check above, so the message must not advertise it.
            raise TypeError(
                f"Invalid catalog type: {type(catalog)}. Expected a Toolset or a list of Tools and/or Toolsets."
            )

        if search_tool_parameters_description is not None:
            invalid_keys = set(search_tool_parameters_description.keys()) - self._VALID_SEARCH_TOOL_PARAMS
            if invalid_keys:
                raise ValueError(
                    f"Invalid search_tool_parameters_description keys: {invalid_keys}. "
                    f"Valid keys are: {self._VALID_SEARCH_TOOL_PARAMS}"
                )

        # Store raw catalog; flattening is deferred to warm_up() so that lazy
        # toolsets (e.g. MCPToolset with eager_connect=False) can connect first.
        self._raw_catalog: "ToolsType" = catalog
        self._catalog: list[Tool] = []

        self._top_k = top_k
        self._search_threshold = search_threshold
        self._search_tool_name = search_tool_name
        self._search_tool_description = search_tool_description
        self._search_tool_parameters_description = search_tool_parameters_description

        # Runtime state (initialized in warm_up)
        self._discovered_tools: dict[str, Tool] = {}
        self._tool_by_name: dict[str, Tool] = {}
        self._bootstrap_tool: Tool | None = None
        self._document_store: InMemoryDocumentStore | None = None
        self._warmed_up = False

        # Initialize parent with empty tools list - we manage tools dynamically
        super().__init__(tools=[])

    def __add__(self, other: Tool | Toolset | list[Tool]) -> "Toolset":
        """
        Concatenation is not supported for SearchableToolset.

        :param other: The tool(s) or toolset that would be concatenated; ignored.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("SearchableToolset does not support concatenation.")

    def add(self, tool: Tool | Toolset) -> None:
        """
        Adding new tools after initialization is not supported for SearchableToolset.

        :param tool: The tool or toolset that would be added; ignored.
        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("SearchableToolset does not support adding new tools after initialization.")

    def _is_passthrough(self) -> bool:
        """
        Internal method to check if operating in passthrough mode (small catalog). Must be called after warm_up().

        :returns: True if the flattened catalog holds fewer tools than `search_threshold`.
        """
        return len(self._catalog) < self._search_threshold

    def warm_up(self) -> None:
        """
        Prepare the toolset for use.

        Warms up child toolsets first (so lazy toolsets like MCPToolset can connect),
        then flattens the catalog, indexes it, and creates the search_tools bootstrap tool.
        In passthrough mode, it warms up all catalog tools directly.
        Must be called before using the toolset with an Agent.

        Calling this method again after a successful warm-up is a no-op.
        """
        if self._warmed_up:
            return

        # Warm up child toolsets first (triggers lazy connections like MCPToolset)
        warm_up_tools(self._raw_catalog)
        # Now flatten — lazy toolsets will have their real tools available
        self._catalog = flatten_tools_or_toolsets(self._raw_catalog)

        if self._is_passthrough():
            for tool in self._catalog:
                tool.warm_up()
        else:
            # One document per tool: name and description are searchable, and the name kept in
            # the metadata is what maps a search hit back to its Tool instance.
            self._document_store = InMemoryDocumentStore()
            documents = [
                Document(content=f"{tool.name} {tool.description}", meta={"tool_name": tool.name})
                for tool in self._catalog
            ]
            self._document_store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE)
            self._bootstrap_tool = self._create_search_tool()

        self._warmed_up = True

    def clear(self) -> None:
        """
        Clear all discovered tools.

        This method allows resetting the toolset's discovered tools between agent runs
        when the same toolset instance is reused. This can be useful for long-running
        applications to control memory usage or to start fresh searches.
        """
        self._discovered_tools.clear()

    def _create_search_tool(self) -> Tool:
        """
        Create the search_tools bootstrap tool.

        Also (re)builds the name-to-tool lookup from the current flattened catalog; the returned
        tool uses that lookup to resolve search hits.

        :returns: The bootstrap Tool wrapping the keyword search over the indexed catalog.
        """
        # Single source of truth for resolving a search hit to its Tool; the closure below
        # captures this exact mapping.
        tool_by_name = {tool.name: tool for tool in self._catalog}
        self._tool_by_name = tool_by_name

        # NOTE: the docstring and the Annotated strings of `search_tools` are passed to the LLM as the
        # tool/parameter descriptions (unless overridden), so they are runtime text, not just comments.
        def search_tools(
            tool_keywords: Annotated[
                str,
                "Space-separated words from tool names/descriptions (e.g. 'route weather search')."
                " NOT the user's question or task—use vocabulary from the tools you need.",
            ],
            k: Annotated[int | None, f"Number of results to return (default: {self._top_k})"] = None,
        ) -> str:
            """
            ALWAYS use this tool FIRST when you need to invoke some tools but don't have the right one loaded yet.

            Provide space separated tool keywords likely to appear in tool names/descriptions
            (e.g. 'route distance weather', 'search email'). Do NOT pass the user's request or task (e.g.
            'things to do in X', 'user question'); matching is keyword-based. Returns loaded
            tool names; they become available immediately.
            """
            # `k` is chosen by the LLM; zero or a negative number is not a meaningful result count,
            # so fall back to the configured default instead of forwarding it to the retrieval.
            num_results = k if k is not None and k > 0 else self._top_k

            if not tool_keywords.strip():
                return (
                    "No tool keywords provided. Please provide space-separated words likely to appear in tool "
                    "names/descriptions (e.g. 'route weather search')."
                )

            # at this point, the toolset has been warmed up, so self._document_store is not None
            results = self._document_store.bm25_retrieval(query=tool_keywords, top_k=num_results)  # type: ignore[union-attr]

            if not results:
                return "No tools found matching these keywords. Try different keywords."

            # Add found tools to _discovered_tools. These become available to the LLM
            # on the next agent iteration when __iter__ is called again - the Agent
            # re-iterates over the toolset each loop, picking up newly discovered tools.
            # The return message here just confirms what was found; actual tool availability
            # comes through the dynamic iteration mechanism. This way we also save tokens
            # by not returning the full tool definitions.
            tool_names = []
            for doc in results:
                tool = tool_by_name[doc.meta["tool_name"]]
                tool.warm_up()
                self._discovered_tools[tool.name] = tool
                tool_names.append(tool.name)

            return f"Found and loaded {len(tool_names)} tool(s): {', '.join(tool_names)}. Use them directly as tools."

        bootstrap_tool = create_tool_from_function(
            function=search_tools, name=self._search_tool_name, description=self._search_tool_description
        )

        # Override parameter descriptions if custom ones were provided
        if self._search_tool_parameters_description:
            for param_name, desc in self._search_tool_parameters_description.items():
                if param_name in bootstrap_tool.parameters.get("properties", {}):
                    bootstrap_tool.parameters["properties"][param_name]["description"] = desc

        return bootstrap_tool

    def __iter__(self) -> Iterator[Tool]:
        """
        Iterate over available tools.

        In passthrough mode, yields all catalog tools.
        Otherwise, yields bootstrap tool + discovered tools.
        Automatically calls warm_up() if needed to ensure bootstrap tool is available.

        :returns: Iterator over the tools that are currently exposed.
        """
        if not self._warmed_up:
            self.warm_up()
        if self._is_passthrough():
            yield from self._catalog
        else:
            if self._bootstrap_tool is not None:
                yield self._bootstrap_tool
            yield from self._discovered_tools.values()

    def __len__(self) -> int:
        """
        Return the number of currently available tools.

        :returns: Count of the tools yielded by iterating over the toolset.
        """
        # the number of tools is computed by invoking __iter__ on the toolset
        return sum(1 for _ in self)

    def __contains__(self, item: str | Tool) -> bool:
        """
        Check if a tool is available by Tool instance or tool name string.

        :param item: Tool instance or tool name string.
        :returns: True if the tool is available, False otherwise.
        :raises TypeError: If `item` is neither a string nor a Tool.
        """
        if isinstance(item, str):
            return any(tool.name == item for tool in self)
        if isinstance(item, Tool):
            return any(tool == item for tool in self)
        raise TypeError(f"Invalid item type: {type(item)}. Must be Tool or str.")

    def __getitem__(self, index: int) -> Tool:
        """
        Get a tool by index.

        :param index: Index of the tool to retrieve.
        :returns: The tool at the given index.
        :raises IndexError: If the index is out of range.
        """
        # Indexing is defined over the currently exposed tools, i.e. the iteration order.
        return list(self)[index]

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize the toolset to a dictionary.

        Only the raw catalog and the configuration are serialized; discovered tools and other
        runtime state are not part of the output.

        :returns: Dictionary representation of the toolset.
        """
        # A single-Toolset catalog is wrapped so that the serialized catalog is always a list.
        catalog_items: list[Tool | Toolset] = (
            [self._raw_catalog] if isinstance(self._raw_catalog, Toolset) else list(self._raw_catalog)
        )

        data: dict[str, Any] = {
            "catalog": [item.to_dict() for item in catalog_items],
            "top_k": self._top_k,
            "search_threshold": self._search_threshold,
            "search_tool_name": self._search_tool_name,
            "search_tool_description": self._search_tool_description,
            "search_tool_parameters_description": self._search_tool_parameters_description,
        }

        return {"type": generate_qualified_class_name(type(self)), "data": data}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SearchableToolset":
        """
        Deserialize a toolset from a dictionary.

        :param data: Dictionary representation of the toolset.
        :returns: New SearchableToolset instance.
        :raises TypeError: If a serialized catalog item refers to a class that is not a Tool or Toolset subclass.
        """
        inner_data = data["data"]

        # Deserialize catalog items (may be Tool or Toolset instances)
        catalog_data = inner_data.get("catalog", [])
        catalog: list[Tool | Toolset] = []
        for item_data in catalog_data:
            item_class = import_class_by_name(item_data["type"])
            if not issubclass(item_class, (Tool, Toolset)):
                raise TypeError(f"Class '{item_class}' is not a subclass of Tool or Toolset")
            catalog.append(item_class.from_dict(item_data))

        # Forward only the settings present in the payload so that missing ones keep the
        # constructor defaults.
        optional_keys = (
            "top_k",
            "search_threshold",
            "search_tool_name",
            "search_tool_description",
            "search_tool_parameters_description",
        )
        return cls(catalog=catalog, **{k: inner_data[k] for k in optional_keys if k in inner_data})