# config-example.yaml
  1  # Example configuration file for SMPTE-Copilot RAG system
  2  # Copy this file to config.yaml and modify as needed
  3  
  4  input_source:
  5    source_type: local           # Options: local, s3
  6    source_config: null          # For local sources, uses paths.input_path (see below)
  7    # For S3, use:
  8    # source_type: s3
  9    # source_config:
 10    #   bucket_name: "my-documents-bucket"  # Required: S3 bucket name
 11    #   prefix: "documents/"                # Optional: S3 key prefix to filter files
 12    #   aws_access_key_id: "YOUR_ACCESS_KEY"      # Optional: AWS credentials (or use IAM role/env vars)
 13    #   aws_secret_access_key: "YOUR_SECRET_KEY"  # Optional: AWS credentials
 14    #   aws_session_token: "YOUR_SESSION_TOKEN"   # Optional: AWS session token (required for temporary credentials)
 15    #   region_name: "us-east-1"                  # Optional: AWS region
 16    #   # endpoint_url: "http://custom-s3:9000"   # Optional: For S3-compatible services (MinIO, etc.)
 17  
 18  loader:
 19    file_type_mapping:         # Map file extensions to loader types (required)
 20      # Format: list of entries, each with a list of extensions that use the same loader
 21      # This avoids repeating loader configurations for multiple file types
 22      - extensions: [.pdf, .docx]  # Multiple extensions can share the same loader config
 23        loader_name: docling
 24        loader_config: 
 25          picture_description_enabled: false  # Set to true to enable image descriptions (requires llm credentials below)
 26                                              # llm_api_key, llm_endpoint, llm_model are required if picture_description_enabled is true
 27          llm_api_key: "YOUR_LLM_API_KEY"
 28          llm_endpoint: https://generativelanguage.googleapis.com/v1beta/openai/chat/completions # LLM endpoint for image description
 29          llm_model: gemini-2.5-flash-lite # Model for image description
 30          image_description_prompt: | 
 31            Describe the image in 2–4 bullet points focusing only on text, charts, and table headers. 
 32            If the image is decorative, output: DECORATIVE. Max 60 words.
 33          image_description_timeout: 30  # Timeout in seconds for image description generation
 34      - extensions: [.mp4]
 35        loader_name: whisper
 36        loader_config:
 37          model_name: base  # Whisper model: tiny, base, small, medium, large (default: base)
 38          device: cpu  # Device to run on: cpu or cuda (default: cpu)
 39          language: en  # Language code (e.g., "en", "es") or null for auto-detection (default: en)
 40          include_timestamps: true  # Include timestamps in markdown output (default: true)
 41  
 42  preprocessing:
 43    preprocessing_name: rapidfuzz  # Preprocessor type (rapidfuzz)
 44    preprocessing_config:      # Preprocessor-specific configuration
 45      min_repetitions: 3       # Minimum number of times a line must appear (or be similar) to be considered repeated (default: 3)
 46      similarity_threshold: 0.85  # Minimum similarity ratio for fuzzy matching (0.0 to 1.0, default: 0.85)
 47  
 48  chunking:
 49    chunker_name: langchain    # Chunker name (langchain, hybrid)
 50    chunker_config:            # Chunker-specific configuration (recommended)
 51      chunk_size: 1000         # Size of text chunks in characters (for langchain)
 52      chunk_overlap: 200       # Overlap between chunks in characters (for langchain)
 53      method: recursive         # Options: recursive, character, token (for langchain)
 54    # For hybrid chunking (semantic + token-based):
 55    # chunker_name: hybrid
 56    # chunker_config:
 57    #   max_tokens: 2000        # Maximum tokens per chunk (default: 2000)
 58    #   merge_peers: false      # Whether to merge peer chunks (default: false)
 59    #   tokenizer: simple       # Tokenizer type: simple (default) or gemini
 60    #   tokenizer_config:       # Tokenizer-specific configuration
 61    #     chars_per_token_ratio: 1.5  # Char-to-token ratio for threshold estimation (default: 1.5)
 62    #                                 # Lower = more conservative, triggers token check earlier
 63    #     split_buffer_size: 5       # Words to buffer before checking limits (default: 5)
 64    #     # For gemini tokenizer:
 65    #     # llm_api_key: "${GOOGLE_API_KEY}"  # Google API key for accurate token counting
 66    #     # llm_model: "gemini-embedding-001"        # Gemini model for token counting
 67    #     # For simple tokenizer: no additional config needed
 68    # Note: chunk_size and chunk_overlap are NOT used by hybrid chunker (it's token-based)
 69    
 70  embedding:
 71    embed_name: huggingface    # Options: huggingface, openai, gemini
 72    embed_config:              # Additional model-specific arguments (dict)
 73      # For HuggingFace models, you can specify the specific model name:
 74      model_name: "sentence-transformers/all-MiniLM-L6-v2"  # Default if not specified
 75      # For OpenAI, you would specify:
 76      # model_name: "text-embedding-3-small"  # or other OpenAI embedding model
 77      # For Gemini, you would specify:
 78      # model_name: "embedding-001"  # or other Gemini embedding model
 79      # google_api_key: 
 80  
 81  vector_store:
 82    store_name: chromadb        # Vector store name (options: chromadb, qdrant)
 83    store_config:               # Store-specific configuration (dict)
 84      persist_directory: ./vector_db  # Directory to persist vector store data (relative to current working directory)
 85      collection_name: rag_collection    # Collection name in the vector store
 86    # For Qdrant, include url in store_config:
 87    # store_name: qdrant
 88    # store_config:
 89    #   persist_directory: ./vector_db
 90    #   collection_name: rag_collection
 91    #   url: http://qdrant:6333  # Qdrant server URL
 92  
 93  retrieval:
 94    searcher_strategy: similarity  # Retrieval strategy
 95    k: 5                          # Number of results to retrieve
 96    searcher_config: null         # Additional searcher-specific configuration (dict)
 97  
 98  reranking:
 99    reranker_name: gemini         # Reranker type (currently only gemini supported)
100    reranker_config:              # Reranker-specific configuration
101      model: gemini-2.5-flash     # Gemini model for reranking
102      api_key:                    # Gemini API key (or use ${GOOGLE_API_KEY} environment variable)
103      # max_chars: 2000           # Optional: Maximum characters of document content to send for scoring (default: 2000)
104      scoring_prompt: |           # Prompt template for document relevance scoring (use {query} and {document} placeholders)
105        Rate document relevance from 0-10.
106  
107        Query: {query}
108  
109        Document:
110        {document}
111  
112        Instructions: Return ONLY a single number from 0 to 10.
113        - 10 = Directly answers with details
114        - 5 = Partially relevant
115        - 0 = Not relevant
116  
117        Output format: Just the number, nothing else.
118  
llm:
  llm_name: gemini
  llm_config:
    model: gemini-2.5-flash
    api_key: null                 # Gemini API key (or use ${GOOGLE_API_KEY} environment variable)
    temperature: 0.3              # Controls randomness (0.0-1.0, default: 0.3)
    max_output_tokens: 2048       # Maximum tokens in response (default: 2048)

paths:
  input_path: ./data                 # Base directory for local input sources
  markdown_dir: ./data/markdown      # Directory for markdown output

logging:
  level: INFO                    # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)

access_control:
  # Ingestion settings (applied to all ingested documents)
  default_access_tags: ["Public"]  # Default access tags for ingested documents (e.g., ["Finance", "Public"])

  # Query settings (applied to all queries)
  default_user_role: "Public"   # Default user role for query access control (e.g., "Finance_Manager")
  access_mapping_file: "./access_mapping.json"  # Path to unified folder-to-tags and role-to-tags mapping

  # Access denial notification behavior
  notify_on_denied_access: false  # If true, notify users about restricted documents instead of silent filtering
                                  # When false (default): uses efficient Vector Store filtering (silent)

user_resolver:
  # User role resolver configuration
  # Determines user roles dynamically from identity information (e.g., email from OpenWebUI)
  # Note: default_role is taken from access_control.default_user_role to avoid duplication
  resolver_name: json             # Options: json (future: database, ldap)
  resolver_config:                # Resolver-specific configuration
    mapping_file: "./user_mapping.json"  # Path to JSON file with user-to-role mappings

pipeline:
  # Configure which steps are enabled in the ingestion and query pipelines
  ingestion:
    load_enabled: true
    preprocess_enabled: true
    chunk_enabled: true
    save_enabled: true
    # Parallelization settings for processing multiple files (uses threading)
    parallel_enabled: false         # Enable parallel processing using threading (default: false)
    max_workers: null               # Max parallel workers (null = CPU count, 1 = sequential)

  query:
    retrieve_enabled: true          # Enable document retrieval step
    rerank_enabled: false           # Enable reranking step (improves precision but adds latency)
    generation_enabled: true        # Enable LLM answer generation step
    generation_prompt: |            # Prompt template for answer generation (use {context} and {query} placeholders)
      You are SMPTE-Copilot, an expert technical assistant.

      Your task is to answer the user's question based on the provided context documents.

      Guidelines:
      - Synthesize and integrate information from the context to provide a comprehensive answer
      - Be concise but thorough, using technical terminology when appropriate
      - Always cite sources using [1], [2], etc., referring to the context blocks
      - If the context contains relevant information but doesn't directly answer the question, explain what the context reveals about the topic
      - Only say "I don't know based on the provided documents" if the context is completely unrelated to the question
      - Do not fabricate information that isn't supported by the context

      Context Documents:
      {context}

      Question:
      {query}