Cradicle Explorer

/ haystack / components / converters / multi_file_converter.py
multi_file_converter.py
  1  # SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
  2  #
  3  # SPDX-License-Identifier: Apache-2.0
  4  
  5  from enum import Enum
  6  from pathlib import Path
  7  from typing import TYPE_CHECKING, Any
  8  
  9  from haystack import Document, Pipeline, super_component
 10  from haystack.components.converters import (
 11      CSVToDocument,
 12      DOCXToDocument,
 13      HTMLToDocument,
 14      JSONConverter,
 15      PPTXToDocument,
 16      PyPDFToDocument,
 17      TextFileToDocument,
 18      XLSXToDocument,
 19  )
 20  from haystack.components.joiners import DocumentJoiner
 21  from haystack.components.routers import FileTypeRouter
 22  from haystack.dataclasses import ByteStream
 23  
 24  
class ConverterMimeType(str, Enum):
    """
    MIME types that `MultiFileConverter` routes to a dedicated converter.

    The member values are the MIME type strings handed to the file type router. The lower-cased member
    names double as the names of the converter components in the internal pipeline, and the declaration
    order is the order in which the router outputs are connected, so renaming or reordering members
    affects how the pipeline is wired.
    """

    # Plain-text based formats.
    CSV = "text/csv"
    DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    HTML = "text/html"
    JSON = "application/json"
    MD = "text/markdown"
    TEXT = "text/plain"
    PDF = "application/pdf"
    # Office Open XML formats (together with DOCX above).
    PPTX = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
    XLSX = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
 35  
 36  
 37  @super_component
 38  class MultiFileConverter:
 39      """
 40      A file converter that handles conversion of multiple file types.
 41  
 42      The MultiFileConverter handles the following file types:
 43      - CSV
 44      - DOCX
 45      - HTML
 46      - JSON
 47      - MD
 48      - TEXT
 49      - PDF (no OCR)
 50      - PPTX
 51      - XLSX
 52  
 53      Usage example:
 54      ```
 55      from haystack.super_components.converters import MultiFileConverter
 56  
 57      converter = MultiFileConverter()
 58      converter.run(sources=["test/test_files/txt/doc_1.txt", "test/test_files/pdf/sample_pdf_1.pdf"], meta={})
 59      ```
 60      """
 61  
 62      def __init__(self, encoding: str = "utf-8", json_content_key: str = "content") -> None:
 63          """
 64          Initialize the MultiFileConverter.
 65  
 66          :param encoding: The encoding to use when reading files.
 67          :param json_content_key: The key to use in a content field in a document when converting JSON files.
 68          """
 69          self.encoding = encoding
 70          self.json_content_key = json_content_key
 71  
 72          # initialize components
 73          router = FileTypeRouter(
 74              mime_types=[mime_type.value for mime_type in ConverterMimeType],
 75              # Ensure common extensions are registered. Tests on Windows fail otherwise.
 76              additional_mimetypes={
 77                  "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
 78                  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
 79                  "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
 80              },
 81          )
 82  
 83          # Create pipeline and add components
 84          pp = Pipeline()
 85  
 86          pp.add_component("router", router)
 87          pp.add_component("docx", DOCXToDocument(link_format="markdown"))
 88          pp.add_component(
 89              "html",
 90              HTMLToDocument(
 91                  extraction_kwargs={"output_format": "markdown", "include_tables": True, "include_links": True}
 92              ),
 93          )
 94          pp.add_component("json", JSONConverter(content_key=self.json_content_key))
 95          pp.add_component("md", TextFileToDocument(encoding=self.encoding))
 96          pp.add_component("text", TextFileToDocument(encoding=self.encoding))
 97          pp.add_component("pdf", PyPDFToDocument())
 98          pp.add_component("pptx", PPTXToDocument())
 99          pp.add_component("xlsx", XLSXToDocument())
100          pp.add_component("joiner", DocumentJoiner())
101          pp.add_component("csv", CSVToDocument(encoding=self.encoding))
102  
103          for mime_type in ConverterMimeType:
104              pp.connect(f"router.{mime_type.value}", str(mime_type).lower().rsplit(".", maxsplit=1)[-1])
105  
106          pp.connect("docx.documents", "joiner.documents")
107          pp.connect("html.documents", "joiner.documents")
108          pp.connect("json.documents", "joiner.documents")
109          pp.connect("md.documents", "joiner.documents")
110          pp.connect("text.documents", "joiner.documents")
111          pp.connect("pdf.documents", "joiner.documents")
112          pp.connect("pptx.documents", "joiner.documents")
113  
114          pp.connect("csv.documents", "joiner.documents")
115          pp.connect("xlsx.documents", "joiner.documents")
116  
117          self.pipeline = pp
118          self.output_mapping = {
119              "joiner.documents": "documents",
120              "router.unclassified": "unclassified",
121              "router.failed": "failed",
122          }
123          self.input_mapping = {"sources": ["router.sources"], "meta": ["router.meta"]}
124  
125      if TYPE_CHECKING:
126          # fake method, never executed, but static analyzers will not complain about missing method
127          def run(  # noqa: D102
128              self, *, sources: list[str | Path | ByteStream], meta: dict[str, Any] | list[dict[str, Any]] | None = None
129          ) -> dict[str, list[Document]]:  # noqa: D102
130              ...
131  
132          def warm_up(self) -> None:  # noqa: D102
133              ...