multi_file_converter.py
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any

from haystack import Document, Pipeline, super_component
from haystack.components.converters import (
    CSVToDocument,
    DOCXToDocument,
    HTMLToDocument,
    JSONConverter,
    PPTXToDocument,
    PyPDFToDocument,
    TextFileToDocument,
    XLSXToDocument,
)
from haystack.components.joiners import DocumentJoiner
from haystack.components.routers import FileTypeRouter
from haystack.dataclasses import ByteStream


class ConverterMimeType(str, Enum):
    """
    MIME types that the MultiFileConverter routes to a dedicated converter.

    The lower-cased member name (for example `CSV` -> `"csv"`) is used as the name of the
    matching converter component inside the internal pipeline, so every member needs a
    component registered under that name.
    """

    CSV = "text/csv"
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"


@super_component
class MultiFileConverter:
    """
    A file converter that handles conversion of multiple file types.

    The MultiFileConverter handles the following file types:
    - CSV
    - DOCX
    - HTML
    - JSON
    - MD
    - TEXT
    - PDF (no OCR)
    - PPTX
    - XLSX

    The converter exposes three outputs: `documents` (the joined output of all converters),
    `unclassified` and `failed` (both forwarded from the internal file type router).

    Usage example:
    ```
    from haystack.super_components.converters import MultiFileConverter

    converter = MultiFileConverter()
    converter.run(sources=["test/test_files/txt/doc_1.txt", "test/test_files/pdf/sample_pdf_1.pdf"], meta={})
    ```
    """

    def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
        """
        Initialize the MultiFileConverter.

        :param encoding: The encoding to use when reading files.
        :param json_content_key: The key to use in a content field in a document when converting JSON files.
        """
        self.encoding = encoding
        self.json_content_key = json_content_key

        # initialize components
        router = FileTypeRouter(
            mime_types=[mime_type.value for mime_type in ConverterMimeType],
            # Ensure common extensions are registered. Tests on Windows fail otherwise.
            additional_mimetypes={
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
            },
        )

        # Create pipeline and add components
        pp = Pipeline()

        pp.add_component("router", router)
        pp.add_component("docx", DOCXToDocument(link_format="markdown"))
        pp.add_component(
            "html",
            HTMLToDocument(
                extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
            ),
        )
        pp.add_component("json", JSONConverter(content_key=self.json_content_key))
        pp.add_component("md", TextFileToDocument(encoding=self.encoding))
        pp.add_component("text", TextFileToDocument(encoding=self.encoding))
        pp.add_component("pdf", PyPDFToDocument())
        pp.add_component("pptx", PPTXToDocument())
        pp.add_component("xlsx", XLSXToDocument())
        pp.add_component("joiner", DocumentJoiner())
        pp.add_component("csv", CSVToDocument(encoding=self.encoding))

        # Route every supported MIME type to the converter named after the enum member.
        # `.name` is used instead of parsing `str(mime_type)`, whose format depends on the
        # enum's `__str__` implementation (it would return the MIME value for a `StrEnum`).
        for mime_type in ConverterMimeType:
            pp.connect(f"router.{mime_type.value}", mime_type.name.lower())

        # Feed every converter into the joiner; the connection order is kept as it was.
        for converter_name in ("docx", "html", "json", "md", "text", "pdf", "pptx", "csv", "xlsx"):
            pp.connect(f"{converter_name}.documents", "joiner.documents")

        self.pipeline = pp
        self.output_mapping = {
            "joiner.documents": "documents",
            "router.unclassified": "unclassified",
            "router.failed": "failed",
        }
        self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}

    if TYPE_CHECKING:
        # fake method, never executed, but static analyzers will not complain about missing method
        def run(  # noqa: D102
            self, *, sources: list[str | Path | ByteStream], meta: dict[str, Any] | list[dict[str, Any]] | None = None
        ) -> dict[str, list[Document]]:  # noqa: D102
            ...

        def warm_up(self) -> None:  # noqa: D102
            ...