# Example configuration file for SMPTE-Copilot RAG system
# Copy this file to config.yaml and modify as needed

input_source:
  source_type: local # Options: local, s3
  source_config: null # For local sources, uses paths.input_path (see below)
  # For S3, use:
  # source_type: s3
  # source_config:
  #   bucket_name: "my-documents-bucket" # Required: S3 bucket name
  #   prefix: "documents/" # Optional: S3 key prefix to filter files
  #   aws_access_key_id: "YOUR_ACCESS_KEY" # Optional: AWS credentials (or use IAM role/env vars)
  #   aws_secret_access_key: "YOUR_SECRET_KEY" # Optional: AWS credentials
  #   aws_session_token: "YOUR_SESSION_TOKEN" # Optional: AWS session token (required for temporary credentials)
  #   region_name: "us-east-1" # Optional: AWS region
  #   # endpoint_url: "http://custom-s3:9000" # Optional: For S3-compatible services (MinIO, etc.)

loader:
  file_type_mapping: # Map file extensions to loader types (required)
    # Format: list of entries, each with a list of extensions that use the same loader
    # This avoids repeating loader configurations for multiple file types
    - extensions: [.pdf, .docx] # Multiple extensions can share the same loader config
      loader_name: docling
      loader_config:
        picture_description_enabled: false # Set to true to enable image descriptions (requires LLM credentials below)
        # llm_api_key, llm_endpoint, and llm_model are required if picture_description_enabled is true
        llm_api_key: "YOUR_LLM_API_KEY"
        llm_endpoint: https://generativelanguage.googleapis.com/v1beta/openai/chat/completions # LLM endpoint for image description
        llm_model: gemini-2.5-flash-lite # Model for image description
        image_description_prompt: |
          Describe the image in 2–4 bullet points focusing only on text, charts, and table headers.
          If the image is decorative, output: DECORATIVE. Max 60 words.
        image_description_timeout: 30 # Timeout in seconds for image description generation
    - extensions: [.mp4]
      loader_name: whisper
      loader_config:
        model_name: base # Whisper model: tiny, base, small, medium, large (default: base)
        device: cpu # Device to run on: cpu or cuda (default: cpu)
        language: en # Language code (e.g., "en", "es") or null for auto-detection (default: en)
        include_timestamps: true # Include timestamps in markdown output (default: true)

preprocessing:
  preprocessing_name: rapidfuzz # Preprocessor type (rapidfuzz)
  preprocessing_config: # Preprocessor-specific configuration
    min_repetitions: 3 # Minimum number of times a line must appear (or be similar) to be considered repeated (default: 3)
    similarity_threshold: 0.85 # Minimum similarity ratio for fuzzy matching (0.0 to 1.0, default: 0.85)
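  # Illustration of the two thresholds above (hypothetical input; the exact removal behavior is
  # defined by the rapidfuzz preprocessor implementation): a running footer such as "Page 12 of 40"
  # that occurs on three or more pages satisfies min_repetitions: 3, and near-identical variants
  # like "Page 13 of 40" are grouped with it because their fuzzy similarity ratio exceeds 0.85.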

chunking:
  chunker_name: langchain # Chunker name (langchain, hybrid)
  chunker_config: # Chunker-specific configuration (recommended)
    chunk_size: 1000 # Size of text chunks in characters (for langchain)
    chunk_overlap: 200 # Overlap between chunks in characters (for langchain)
    method: recursive # Options: recursive, character, token (for langchain)
  # For hybrid chunking (semantic + token-based):
  # chunker_name: hybrid
  # chunker_config:
  #   max_tokens: 2000 # Maximum tokens per chunk (default: 2000)
  #   merge_peers: false # Whether to merge peer chunks (default: false)
  #   tokenizer: simple # Tokenizer type: simple (default) or gemini
  #   tokenizer_config: # Tokenizer-specific configuration
  #     chars_per_token_ratio: 1.5 # Char-to-token ratio for threshold estimation (default: 1.5)
  #     # Lower = more conservative, triggers token check earlier
  #     split_buffer_size: 5 # Words to buffer before checking limits (default: 5)
  #     # For gemini tokenizer:
  #     # llm_api_key: "${GOOGLE_API_KEY}" # Google API key for accurate token counting
  #     # llm_model: "gemini-embedding-001" # Gemini model for token counting
  #     # For simple tokenizer: no additional config needed
  # Note: chunk_size and chunk_overlap are NOT used by the hybrid chunker (it is token-based)
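  # Worked example for chars_per_token_ratio (arithmetic sketch; assumes tokens are estimated
  # as characters / chars_per_token_ratio): with ratio 1.5, a 3000-character span is estimated
  # at 3000 / 1.5 = 2000 tokens, hitting max_tokens and triggering the accurate token check.
  # A lower ratio of 1.2 estimates the same span at 2500 tokens, so the check fires earlier.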

embedding:
  embed_name: huggingface # Options: huggingface, openai, gemini
  embed_config: # Additional model-specific arguments (dict)
    # For HuggingFace, specify the model name:
    model_name: "sentence-transformers/all-MiniLM-L6-v2" # Default if not specified
    # For OpenAI, you would specify:
    # model_name: "text-embedding-3-small" # or another OpenAI embedding model
    # For Gemini, you would specify:
    # model_name: "embedding-001" # or another Gemini embedding model
    # google_api_key:

vector_store:
  store_name: chromadb # Vector store name (options: chromadb, qdrant)
  store_config: # Store-specific configuration (dict)
    persist_directory: ./vector_db # Directory to persist vector store data (relative to the current working directory)
    collection_name: rag_collection # Collection name in the vector store
  # For Qdrant, include url in store_config:
  # store_name: qdrant
  # store_config:
  #   persist_directory: ./vector_db
  #   collection_name: rag_collection
  #   url: http://qdrant:6333 # Qdrant server URL

retrieval:
  searcher_strategy: similarity # Retrieval strategy
  k: 5 # Number of results to retrieve
  searcher_config: null # Additional searcher-specific configuration (dict)

reranking:
  reranker_name: gemini # Reranker type (currently only gemini is supported)
  reranker_config: # Reranker-specific configuration
    model: gemini-2.5-flash # Gemini model for reranking
    api_key: # Gemini API key (or use the ${GOOGLE_API_KEY} environment variable)
    # max_chars: 2000 # Optional: Maximum characters of document content to send for scoring (default: 2000)
    scoring_prompt: | # Prompt template for document relevance scoring (use {query} and {document} placeholders)
      Rate document relevance from 0-10.

      Query: {query}

      Document:
      {document}

      Instructions: Return ONLY a single number from 0 to 10.
      - 10 = Directly answers with details
      - 5 = Partially relevant
      - 0 = Not relevant

      Output format: Just the number, nothing else.

llm:
  llm_name: gemini
  llm_config:
    model: gemini-2.5-flash
    api_key: # Gemini API key (or use the ${GOOGLE_API_KEY} environment variable)
    temperature: 0.3 # Controls randomness (0.0-1.0, default: 0.3)
    max_output_tokens: 2048 # Maximum tokens in response (default: 2048)

paths:
  input_path: ./data # Base directory for local input sources
  markdown_dir: ./data/markdown # Directory for markdown output

logging:
  level: INFO # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)

access_control:
  # Ingestion settings (applied to all ingested documents)
  default_access_tags: ["Public"] # Default access tags for ingested documents (e.g., ["Finance", "Public"])

  # Query settings (applied to all queries)
  default_user_role: "Public" # Default user role for query access control (e.g., "Finance_Manager")
  access_mapping_file: "./access_mapping.json" # Path to unified folder-to-tags and role-to-tags mapping (see the example sketch below)

  # Access denial notification behavior
  notify_on_denied_access: false # If true, notify users about restricted documents instead of silently filtering them out
  # When false (default): uses efficient vector store filtering (silent)

user_resolver:
  # User role resolver configuration
  # Determines user roles dynamically from identity information (e.g., email from OpenWebUI)
  # Note: default_role is taken from access_control.default_user_role to avoid duplication
  resolver_name: json # Options: json (future: database, ldap)
  resolver_config: # Resolver-specific configuration
    mapping_file: "./user_mapping.json" # Path to JSON file with user-to-role mappings (see the example sketch below)
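
# Hypothetical examples of the two mapping files referenced above. The exact schema is defined
# by the SMPTE-Copilot resolvers, so treat these shapes as assumptions to validate against the
# project documentation, not as the authoritative format.
#
# access_mapping.json (folder-to-tags and role-to-tags in one file):
# {
#   "folders": { "finance/": ["Finance"], "public/": ["Public"] },
#   "roles": { "Finance_Manager": ["Finance", "Public"], "Public": ["Public"] }
# }
#
# user_mapping.json (user identity to role):
# {
#   "alice@example.com": "Finance_Manager",
#   "bob@example.com": "Public"
# }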

pipeline:
  # Configure which steps are enabled in the ingestion and query pipelines
  ingestion:
    load_enabled: true
    preprocess_enabled: true
    chunk_enabled: true
    save_enabled: true
    # Parallelization settings for processing multiple files (uses threading)
    parallel_enabled: false # Enable parallel processing using threading (default: false)
    max_workers: null # Max parallel workers (null = CPU count, 1 = sequential)

  query:
    retrieve_enabled: true # Enable document retrieval step
    rerank_enabled: false # Enable reranking step (improves precision but adds latency)
    generation_enabled: true # Enable LLM answer generation step
    generation_prompt: | # Prompt template for answer generation (use {context} and {query} placeholders)
      You are SMPTE-Copilot, an expert technical assistant.

      Your task is to answer the user's question based on the provided context documents.

      Guidelines:
      - Synthesize and integrate information from the context to provide a comprehensive answer
      - Be concise but thorough, using technical terminology when appropriate
      - Always cite sources using [1], [2], etc., referring to the context blocks
      - If the context contains relevant information but doesn't directly answer the question, explain what the context reveals about the topic
      - Only say "I don't know based on the provided documents" if the context is completely unrelated to the question
      - Do not fabricate information that isn't supported by the context

      Context Documents:
      {context}

      Question:
      {query}