# haystack/components/preprocessors/document_preprocessor.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from collections.abc import Callable
  6  from typing import TYPE_CHECKING, Any, Literal
  7  
  8  from haystack import Document, Pipeline, default_from_dict, default_to_dict, super_component
  9  from haystack.components.preprocessors.document_cleaner import DocumentCleaner
 10  from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
 11  from haystack.utils import deserialize_callable, serialize_callable
 12  
 13  
 14  @super_component
 15  class DocumentPreprocessor:
 16      """
 17      A SuperComponent that first splits and then cleans documents.
 18  
 19      This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
 20      It takes a list of documents as input and returns a processed list of documents.
 21  
 22      Usage example:
 23      ```python
 24      from haystack import Document
 25      from haystack.components.preprocessors import DocumentPreprocessor
 26  
 27      doc = Document(content="I love pizza!")
 28      preprocessor = DocumentPreprocessor()
 29      result = preprocessor.run(documents=[doc])
 30      print(result["documents"])
 31      ```
 32      """
 33  
 34      def __init__(  # noqa: PLR0913 (too-many-arguments)
 35          self,
 36          *,
 37          # --- DocumentSplitter arguments ---
 38          split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
 39          split_length: int = 250,
 40          split_overlap: int = 0,
 41          split_threshold: int = 0,
 42          splitting_function: Callable[[str], list[str]] | None = None,
 43          respect_sentence_boundary: bool = False,
 44          language: Language = "en",
 45          use_split_rules: bool = True,
 46          extend_abbreviations: bool = True,
 47          # --- DocumentCleaner arguments ---
 48          remove_empty_lines: bool = True,
 49          remove_extra_whitespaces: bool = True,
 50          remove_repeated_substrings: bool = False,
 51          keep_id: bool = False,
 52          remove_substrings: list[str] | None = None,
 53          remove_regex: str | None = None,
 54          unicode_normalization: Literal["NFC", "NFKC", "NFD", "NFKD"] | None = None,
 55          ascii_only: bool = False,
 56      ) -> None:
 57          """
 58          Initialize a DocumentPreProcessor that first splits and then cleans documents.
 59  
 60          **Splitter Parameters**:
 61          :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
 62          :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
 63          :param split_overlap: The number of overlapping units between consecutive splits.
 64          :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
 65              with the previous split.
 66          :param splitting_function: A custom function for splitting if `split_by="function"`.
 67          :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
 68          :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
 69              `respect_sentence_boundary=True`.
 70          :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
 71          :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
 72              languages.
 73  
 74          **Cleaner Parameters**:
 75          :param remove_empty_lines: If `True`, removes empty lines.
 76          :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
 77          :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
 78          :param keep_id: If `True`, keeps the original document IDs.
 79          :param remove_substrings: A list of strings to remove from the document content.
 80          :param remove_regex: A regex pattern whose matches will be removed from the document content.
 81          :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
 82          :param ascii_only: If `True`, converts text to ASCII only.
 83          """
 84          # Store arguments for serialization
 85          self.remove_empty_lines = remove_empty_lines
 86          self.remove_extra_whitespaces = remove_extra_whitespaces
 87          self.remove_repeated_substrings = remove_repeated_substrings
 88          self.keep_id = keep_id
 89          self.remove_substrings = remove_substrings
 90          self.remove_regex = remove_regex
 91          self.unicode_normalization = unicode_normalization
 92          self.ascii_only = ascii_only
 93  
 94          self.split_by = split_by
 95          self.split_length = split_length
 96          self.split_overlap = split_overlap
 97          self.split_threshold = split_threshold
 98          self.splitting_function = splitting_function
 99          self.respect_sentence_boundary = respect_sentence_boundary
100          self.language = language
101          self.use_split_rules = use_split_rules
102          self.extend_abbreviations = extend_abbreviations
103  
104          # Instantiate sub-components
105          splitter = DocumentSplitter(
106              split_by=self.split_by,
107              split_length=self.split_length,
108              split_overlap=self.split_overlap,
109              split_threshold=self.split_threshold,
110              splitting_function=self.splitting_function,
111              respect_sentence_boundary=self.respect_sentence_boundary,
112              language=self.language,
113              use_split_rules=self.use_split_rules,
114              extend_abbreviations=self.extend_abbreviations,
115          )
116  
117          cleaner = DocumentCleaner(
118              remove_empty_lines=self.remove_empty_lines,
119              remove_extra_whitespaces=self.remove_extra_whitespaces,
120              remove_repeated_substrings=self.remove_repeated_substrings,
121              keep_id=self.keep_id,
122              remove_substrings=self.remove_substrings,
123              remove_regex=self.remove_regex,
124              unicode_normalization=self.unicode_normalization,
125              ascii_only=self.ascii_only,
126          )
127  
128          # Build the Pipeline
129          pp = Pipeline()
130  
131          pp.add_component("splitter", splitter)
132          pp.add_component("cleaner", cleaner)
133  
134          # Connect the splitter output to cleaner
135          pp.connect("splitter.documents", "cleaner.documents")
136          self.pipeline = pp
137  
138          # Define how pipeline inputs/outputs map to sub-component inputs/outputs
139          self.input_mapping = {
140              # The pipeline input "documents" feeds into "splitter.documents"
141              "documents": ["splitter.documents"]
142          }
143          # The pipeline output "documents" comes from "cleaner.documents"
144          self.output_mapping = {"cleaner.documents": "documents"}
145  
146      if TYPE_CHECKING:
147          # fake method, never executed, but static analyzers will not complain about missing method
148          def run(self, *, documents: list[Document]) -> dict[str, list[Document]]:  # noqa: D102
149              ...
150          def warm_up(self) -> None:  # noqa: D102
151              ...
152  
153      def to_dict(self) -> dict[str, Any]:
154          """
155          Serialize SuperComponent to a dictionary.
156  
157          :return:
158              Dictionary with serialized data.
159          """
160          splitting_function = None
161          if self.splitting_function is not None:
162              splitting_function = serialize_callable(self.splitting_function)
163  
164          return default_to_dict(
165              self,
166              remove_empty_lines=self.remove_empty_lines,
167              remove_extra_whitespaces=self.remove_extra_whitespaces,
168              remove_repeated_substrings=self.remove_repeated_substrings,
169              keep_id=self.keep_id,
170              remove_substrings=self.remove_substrings,
171              remove_regex=self.remove_regex,
172              unicode_normalization=self.unicode_normalization,
173              ascii_only=self.ascii_only,
174              split_by=self.split_by,
175              split_length=self.split_length,
176              split_overlap=self.split_overlap,
177              split_threshold=self.split_threshold,
178              splitting_function=splitting_function,
179              respect_sentence_boundary=self.respect_sentence_boundary,
180              language=self.language,
181              use_split_rules=self.use_split_rules,
182              extend_abbreviations=self.extend_abbreviations,
183          )
184  
185      @classmethod
186      def from_dict(cls, data: dict[str, Any]) -> "DocumentPreprocessor":
187          """
188          Deserializes the SuperComponent from a dictionary.
189  
190          :param data:
191              Dictionary to deserialize from.
192          :returns:
193              Deserialized SuperComponent.
194          """
195          splitting_function = data["init_parameters"].get("splitting_function", None)
196          if splitting_function:
197              data["init_parameters"]["splitting_function"] = deserialize_callable(splitting_function)
198          return default_from_dict(cls, data)