protocol.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Protocol

from haystack.dataclasses import Document
from haystack.document_stores.types.policy import DuplicatePolicy


class DocumentStore(Protocol):
    """
    Stores Documents to be used by the components of a Pipeline.

    Classes implementing this protocol often store the documents permanently and allow specialized components to
    perform retrieval on them, either by embedding, by keyword, hybrid, and so on, depending on the backend used.

    In order to retrieve documents, consider using a Retriever that supports the DocumentStore implementation that
    you're using.
    """

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes this store to a dictionary.

        :returns: a dictionary representation of this store.
        """
        ...

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DocumentStore":
        """
        Deserializes the store from a dictionary.

        :param data: the dictionary to deserialize the store from.
        :returns: the deserialized store.
        """
        ...

    def count_documents(self) -> int:
        """
        Returns the number of documents stored.

        :returns: the number of documents stored.
        """
        ...

    def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
        """
        Returns the documents that match the filters provided.

        Filters are defined as nested dictionaries that can be of two types:

        - Comparison
        - Logic

        Comparison dictionaries must contain the keys:

        - `field`
        - `operator`
        - `value`

        Logic dictionaries must contain the keys:

        - `operator`
        - `conditions`

        The `conditions` key must be a list of dictionaries, either of type Comparison or Logic.

        The `operator` value in Comparison dictionaries must be one of:

        - `==`
        - `!=`
        - `>`
        - `>=`
        - `<`
        - `<=`
        - `in`
        - `not in`

        The `operator` values in Logic dictionaries must be one of:

        - `NOT`
        - `OR`
        - `AND`

        A simple filter:

        ```python
        filters = {"field": "meta.type", "operator": "==", "value": "article"}
        ```

        A more complex filter:

        ```python
        filters = {
            "operator": "AND",
            "conditions": [
                {"field": "meta.type", "operator": "==", "value": "article"},
                {"field": "meta.date", "operator": ">=", "value": 1420066800},
                {"field": "meta.date", "operator": "<", "value": 1609455600},
                {"field": "meta.rating", "operator": ">=", "value": 3},
                {
                    "operator": "OR",
                    "conditions": [
                        {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
                        {"field": "meta.publisher", "operator": "==", "value": "nytimes"},
                    ],
                },
            ],
        }
        ```

        :param filters: the filters to apply to the document list.
        :returns: a list of Documents that match the given filters.
        """
        ...

    def write_documents(self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
        """
        Writes Documents into the DocumentStore.

        :param documents: a list of Document objects.
        :param policy: the policy to apply when a Document with the same id already exists in the DocumentStore.
            - `DuplicatePolicy.NONE`: Default policy, behaviour depends on the Document Store.
            - `DuplicatePolicy.SKIP`: If a Document with the same id already exists, it is skipped and not written.
            - `DuplicatePolicy.OVERWRITE`: If a Document with the same id already exists, it is overwritten.
            - `DuplicatePolicy.FAIL`: If a Document with the same id already exists, an error is raised.
        :raises DuplicateError: If `policy` is set to `DuplicatePolicy.FAIL` and a Document with the same id already
            exists.
        :returns: The number of Documents written.
            If `DuplicatePolicy.OVERWRITE` is used, this number is always equal to the number of documents in input.
            If `DuplicatePolicy.SKIP` is used, this number can be lower than the number of documents in the input list.
        """
        ...

    def delete_documents(self, document_ids: list[str]) -> None:
        """
        Deletes all documents with matching document_ids from the DocumentStore.

        Fails with `MissingDocumentError` if no document with this id is present in the DocumentStore.

        :param document_ids: the ids of the documents to delete.
        """
        ...