pipeline.py
"""Pipeline configuration for ingestion and query steps."""


from pydantic import BaseModel, Field


class IngestionPipelineConfig(BaseModel):
    """Configuration for ingestion pipeline steps.

    Each step can be enabled or disabled independently.
    """

    # --- Step toggles: all four ingestion steps default to enabled. ---
    load_enabled: bool = Field(
        default=True,
        description="Enable/disable the load step (converts files to markdown)",
    )
    preprocess_enabled: bool = Field(
        default=True,
        description="Enable/disable the preprocessing step (removes duplicates, etc.)",
    )
    chunk_enabled: bool = Field(
        default=True,
        description="Enable/disable the chunking step",
    )
    save_enabled: bool = Field(
        default=True,
        description=(
            "Enable/disable the save step "
            "(includes embedding generation and vector database storage)"
        ),
    )

    # --- Concurrency knobs: parallelism is opt-in (defaults to False). ---
    parallel_enabled: bool = Field(
        default=False,
        description="Enable/disable parallel processing of multiple files using threading",
    )
    # No bounds are enforced here; the meaning of None and 1 is the one
    # spelled out in the description string below.
    max_workers: int | None = Field(
        default=None,
        description="Maximum number of parallel workers (None = CPU count, 1 = sequential)",
    )


class QueryPipelineConfig(BaseModel):
    """Configuration for query pipeline steps.

    Each step can be enabled or disabled independently.
    """

    # --- Step toggles: retrieval and generation default to enabled, ---
    # --- reranking defaults to disabled.                             ---
    retrieve_enabled: bool = Field(
        default=True,
        description="Enable/disable the retrieval step (includes query embedding)",
    )
    rerank_enabled: bool = Field(
        default=False,
        description="Enable/disable the reranking step (cross-encoder scoring)",
    )
    generation_enabled: bool = Field(
        default=True,
        description="Enable/disable the LLM generation step",
    )

    # Optional template; defaults to None. This model does not check that the
    # {context} / {query} placeholders named in the description are present.
    generation_prompt: str | None = Field(
        default=None,
        description=(
            "Prompt template for answer generation "
            "(use {context} and {query} placeholders)"
        ),
    )


class PipelineConfig(BaseModel):
    """Configuration for both ingestion and query pipelines."""

    # Each section is built from its own defaults when omitted
    # (default_factory); the annotations also admit an explicit None.
    ingestion: IngestionPipelineConfig | None = Field(
        default_factory=IngestionPipelineConfig,
        description="Ingestion pipeline configuration",
    )
    query: QueryPipelineConfig | None = Field(
        default_factory=QueryPipelineConfig,
        description="Query pipeline configuration",
    )