# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from collections.abc import Callable
from typing import TYPE_CHECKING, Any, Literal

from haystack import Document, Pipeline, default_from_dict, default_to_dict, super_component
from haystack.components.preprocessors.document_cleaner import DocumentCleaner
from haystack.components.preprocessors.document_splitter import DocumentSplitter, Language
from haystack.utils import deserialize_callable, serialize_callable


@super_component
class DocumentPreprocessor:
    """
    A SuperComponent that first splits and then cleans documents.

    This component consists of a DocumentSplitter followed by a DocumentCleaner in a single pipeline.
    It takes a list of documents as input and returns a processed list of documents.

    Usage example:
    ```python
    from haystack import Document
    from haystack.components.preprocessors import DocumentPreprocessor

    doc = Document(content="I love pizza!")
    preprocessor = DocumentPreprocessor()
    result = preprocessor.run(documents=[doc])
    print(result["documents"])
    ```
    """

    def __init__(  # noqa: PLR0913 (too-many-arguments)
        self,
        *,
        # --- DocumentSplitter arguments ---
        split_by: Literal["function", "page", "passage", "period", "word", "line", "sentence"] = "word",
        split_length: int = 250,
        split_overlap: int = 0,
        split_threshold: int = 0,
        splitting_function: Callable[[str], list[str]] | None = None,
        respect_sentence_boundary: bool = False,
        language: Language = "en",
        use_split_rules: bool = True,
        extend_abbreviations: bool = True,
        # --- DocumentCleaner arguments ---
        remove_empty_lines: bool = True,
        remove_extra_whitespaces: bool = True,
        remove_repeated_substrings: bool = False,
        keep_id: bool = False,
        remove_substrings: list[str] | None = None,
        remove_regex: str | None = None,
        unicode_normalization: Literal["NFC", "NFKC", "NFD", "NFKD"] | None = None,
        ascii_only: bool = False,
    ) -> None:
        """
        Initialize a DocumentPreprocessor that first splits and then cleans documents.

        **Splitter Parameters**:
        :param split_by: The unit of splitting: "function", "page", "passage", "period", "word", "line", or "sentence".
        :param split_length: The maximum number of units (words, lines, pages, and so on) in each split.
        :param split_overlap: The number of overlapping units between consecutive splits.
        :param split_threshold: The minimum number of units per split. If a split is smaller than this, it's merged
            with the previous split.
        :param splitting_function: A custom function for splitting if `split_by="function"`.
        :param respect_sentence_boundary: If `True`, splits by words but tries not to break inside a sentence.
        :param language: Language used by the sentence tokenizer if `split_by="sentence"` or
            `respect_sentence_boundary=True`.
        :param use_split_rules: Whether to apply additional splitting heuristics for the sentence splitter.
        :param extend_abbreviations: Whether to extend the sentence splitter with curated abbreviations for certain
            languages.

        **Cleaner Parameters**:
        :param remove_empty_lines: If `True`, removes empty lines.
        :param remove_extra_whitespaces: If `True`, removes extra whitespaces.
        :param remove_repeated_substrings: If `True`, removes repeated substrings like headers/footers across pages.
        :param keep_id: If `True`, keeps the original document IDs.
        :param remove_substrings: A list of strings to remove from the document content.
        :param remove_regex: A regex pattern whose matches will be removed from the document content.
        :param unicode_normalization: Unicode normalization form to apply to the text, for example `"NFC"`.
        :param ascii_only: If `True`, converts text to ASCII only.
        """
        # Store arguments for serialization (mirrored 1:1 in to_dict).
        self.remove_empty_lines = remove_empty_lines
        self.remove_extra_whitespaces = remove_extra_whitespaces
        self.remove_repeated_substrings = remove_repeated_substrings
        self.keep_id = keep_id
        self.remove_substrings = remove_substrings
        self.remove_regex = remove_regex
        self.unicode_normalization = unicode_normalization
        self.ascii_only = ascii_only

        self.split_by = split_by
        self.split_length = split_length
        self.split_overlap = split_overlap
        self.split_threshold = split_threshold
        self.splitting_function = splitting_function
        self.respect_sentence_boundary = respect_sentence_boundary
        self.language = language
        self.use_split_rules = use_split_rules
        self.extend_abbreviations = extend_abbreviations

        # Instantiate sub-components
        splitter = DocumentSplitter(
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=self.splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

        cleaner = DocumentCleaner(
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
        )

        # Build the split -> clean Pipeline consumed by @super_component.
        pp = Pipeline()

        pp.add_component("splitter", splitter)
        pp.add_component("cleaner", cleaner)

        # Connect the splitter output to cleaner
        pp.connect("splitter.documents", "cleaner.documents")
        self.pipeline = pp

        # Define how pipeline inputs/outputs map to sub-component inputs/outputs
        self.input_mapping = {
            # The pipeline input "documents" feeds into "splitter.documents"
            "documents": ["splitter.documents"]
        }
        # The pipeline output "documents" comes from "cleaner.documents"
        self.output_mapping = {"cleaner.documents": "documents"}

    if TYPE_CHECKING:
        # Fake methods, never executed; they only exist so static analyzers
        # see the run/warm_up interface that @super_component generates.
        def run(self, *, documents: list[Document]) -> dict[str, list[Document]]:  # noqa: D102
            ...

        def warm_up(self) -> None:  # noqa: D102
            ...

    def to_dict(self) -> dict[str, Any]:
        """
        Serialize SuperComponent to a dictionary.

        :return:
            Dictionary with serialized data.
        """
        # Callables can't be stored in a dict directly; serialize to a dotted path.
        splitting_function = None
        if self.splitting_function is not None:
            splitting_function = serialize_callable(self.splitting_function)

        return default_to_dict(
            self,
            remove_empty_lines=self.remove_empty_lines,
            remove_extra_whitespaces=self.remove_extra_whitespaces,
            remove_repeated_substrings=self.remove_repeated_substrings,
            keep_id=self.keep_id,
            remove_substrings=self.remove_substrings,
            remove_regex=self.remove_regex,
            unicode_normalization=self.unicode_normalization,
            ascii_only=self.ascii_only,
            split_by=self.split_by,
            split_length=self.split_length,
            split_overlap=self.split_overlap,
            split_threshold=self.split_threshold,
            splitting_function=splitting_function,
            respect_sentence_boundary=self.respect_sentence_boundary,
            language=self.language,
            use_split_rules=self.use_split_rules,
            extend_abbreviations=self.extend_abbreviations,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "DocumentPreprocessor":
        """
        Deserializes the SuperComponent from a dictionary.

        :param data:
            Dictionary to deserialize from.
        :returns:
            Deserialized SuperComponent.
        """
        # Tolerate payloads without "init_parameters" instead of raising KeyError;
        # default_from_dict falls back to the constructor defaults in that case.
        init_params = data.get("init_parameters", {})
        splitting_function = init_params.get("splitting_function")
        if splitting_function:
            # Restore the callable from its serialized dotted-path representation.
            init_params["splitting_function"] = deserialize_callable(splitting_function)
        return default_from_dict(cls, data)