# src/config/pipeline.py
 1  """Pipeline configuration for ingestion and query steps."""
 2  
 3  
 4  from pydantic import BaseModel, Field
 5  
 6  
 7  class IngestionPipelineConfig(BaseModel):
 8      """Configuration for ingestion pipeline steps.
 9  
10      Each step can be enabled or disabled independently.
11      """
12  
13      load_enabled: bool = Field(
14          default=True,
15          description="Enable/disable the load step (converts files to markdown)",
16      )
17      preprocess_enabled: bool = Field(
18          default=True,
19          description="Enable/disable the preprocessing step (removes duplicates, etc.)",
20      )
21      chunk_enabled: bool = Field(
22          default=True,
23          description="Enable/disable the chunking step",
24      )
25      save_enabled: bool = Field(
26          default=True,
27          description="Enable/disable the save step (includes embedding generation and vector database storage)",
28      )
29      parallel_enabled: bool = Field(
30          default=False,
31          description="Enable/disable parallel processing of multiple files using threading",
32      )
33      max_workers: int | None = Field(
34          default=None,
35          description="Maximum number of parallel workers (None = CPU count, 1 = sequential)",
36      )
37  
38  
class QueryPipelineConfig(BaseModel):
    """Switches and prompt settings for the query pipeline.

    Every stage has its own on/off flag. Retrieval and generation are on
    by default, reranking is off, and no prompt template is configured
    unless one is supplied.
    """

    # Stage toggles, listed in the order the fields are declared.
    retrieve_enabled: bool = Field(
        True,
        description="Enable/disable the retrieval step (includes query embedding)",
    )
    rerank_enabled: bool = Field(
        False,
        description="Enable/disable the reranking step (cross-encoder scoring)",
    )
    generation_enabled: bool = Field(
        True,
        description="Enable/disable the LLM generation step",
    )

    # Optional template; None means no template was provided in the config.
    generation_prompt: str | None = Field(
        None,
        description="Prompt template for answer generation (use {context} and {query} placeholders)",
    )
61  
62  
class PipelineConfig(BaseModel):
    """Top-level container for the ingestion and query pipeline settings.

    When a section is omitted, it is filled with a default-constructed
    config of the matching type.
    """

    # NOTE(review): both annotations also admit an explicit None, so code
    # reading these attributes may need to handle None -- confirm with callers.
    ingestion: IngestionPipelineConfig | None = Field(
        description="Ingestion pipeline configuration",
        default_factory=IngestionPipelineConfig,
    )
    query: QueryPipelineConfig | None = Field(
        description="Query pipeline configuration",
        default_factory=QueryPipelineConfig,
    )
74