# haystack/document_stores/types/protocol.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
  4  
  5  from typing import Any, Protocol
  6  
  7  from haystack.dataclasses import Document
  8  from haystack.document_stores.types.policy import DuplicatePolicy
  9  
 10  
 11  class DocumentStore(Protocol):
 12      """
 13      Stores Documents to be used by the components of a Pipeline.
 14  
 15      Classes implementing this protocol often store the documents permanently and allow specialized components to
 16      perform retrieval on them, either by embedding, by keyword, hybrid, and so on, depending on the backend used.
 17  
 18      In order to retrieve documents, consider using a Retriever that supports the DocumentStore implementation that
 19      you're using.
 20      """
 21  
 22      def to_dict(self) -> dict[str, Any]:
 23          """
 24          Serializes this store to a dictionary.
 25          """
 26          ...
 27  
 28      @classmethod
 29      def from_dict(cls, data: dict[str, Any]) -> "DocumentStore":
 30          """
 31          Deserializes the store from a dictionary.
 32          """
 33          ...
 34  
 35      def count_documents(self) -> int:
 36          """
 37          Returns the number of documents stored.
 38          """
 39          ...
 40  
 41      def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
 42          """
 43          Returns the documents that match the filters provided.
 44  
 45          Filters are defined as nested dictionaries that can be of two types:
 46          - Comparison
 47          - Logic
 48  
 49          Comparison dictionaries must contain the keys:
 50  
 51          - `field`
 52          - `operator`
 53          - `value`
 54  
 55          Logic dictionaries must contain the keys:
 56  
 57          - `operator`
 58          - `conditions`
 59  
 60          The `conditions` key must be a list of dictionaries, either of type Comparison or Logic.
 61  
 62          The `operator` value in Comparison dictionaries must be one of:
 63  
 64          - `==`
 65          - `!=`
 66          - `>`
 67          - `>=`
 68          - `<`
 69          - `<=`
 70          - `in`
 71          - `not in`
 72  
 73          The `operator` values in Logic dictionaries must be one of:
 74  
 75          - `NOT`
 76          - `OR`
 77          - `AND`
 78  
 79  
 80          A simple filter:
 81          ```python
 82          filters = {"field": "meta.type", "operator": "==", "value": "article"}
 83          ```
 84  
 85          A more complex filter:
 86          ```python
 87          filters = {
 88              "operator": "AND",
 89              "conditions": [
 90                  {"field": "meta.type", "operator": "==", "value": "article"},
 91                  {"field": "meta.date", "operator": ">=", "value": 1420066800},
 92                  {"field": "meta.date", "operator": "<", "value": 1609455600},
 93                  {"field": "meta.rating", "operator": ">=", "value": 3},
 94                  {
 95                      "operator": "OR",
 96                      "conditions": [
 97                          {"field": "meta.genre", "operator": "in", "value": ["economy", "politics"]},
 98                          {"field": "meta.publisher", "operator": "==", "value": "nytimes"},
 99                      ],
100                  },
101              ],
102          }
103  
104          :param filters: the filters to apply to the document list.
105          :returns: a list of Documents that match the given filters.
106          """
107          ...
108  
109      def write_documents(self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.NONE) -> int:
110          """
111          Writes Documents into the DocumentStore.
112  
113          :param documents: a list of Document objects.
114          :param policy: the policy to apply when a Document with the same id already exists in the DocumentStore.
115              - `DuplicatePolicy.NONE`: Default policy, behaviour depends on the Document Store.
116              - `DuplicatePolicy.SKIP`: If a Document with the same id already exists, it is skipped and not written.
117              - `DuplicatePolicy.OVERWRITE`: If a Document with the same id already exists, it is overwritten.
118              - `DuplicatePolicy.FAIL`: If a Document with the same id already exists, an error is raised.
119          :raises DuplicateError: If `policy` is set to `DuplicatePolicy.FAIL` and a Document with the same id already
120              exists.
121          :returns: The number of Documents written.
122              If `DuplicatePolicy.OVERWRITE` is used, this number is always equal to the number of documents in input.
123              If `DuplicatePolicy.SKIP` is used, this number can be lower than the number of documents in the input list.
124          """
125          ...
126  
127      def delete_documents(self, document_ids: list[str]) -> None:
128          """
129          Deletes all documents with a matching document_ids from the DocumentStore.
130  
131          Fails with `MissingDocumentError` if no document with this id is present in the DocumentStore.
132  
133          :param document_ids: the object_ids to delete
134          """
135          ...