# chunking_strategies.py
"""
Chunking Strategies for RAG

This example demonstrates different approaches to splitting documents into
chunks for effective retrieval.

RAG Concept: Chunking is critical for RAG quality. Too large chunks dilute
relevance; too small chunks lose context. The right strategy depends on
your content type and query patterns.
"""

import re

try:
    from praisonaiagents import Agent
except ImportError:
    # The pure-Python chunkers below do not need the package; only the RAG
    # demo does, and it raises a clear error when Agent is unavailable.
    Agent = None

# Sample long document: Product documentation
PRODUCT_DOCUMENTATION = """
# CloudManager Pro - Complete User Guide

## Chapter 1: Getting Started

CloudManager Pro is an enterprise cloud management platform that helps organizations
monitor, optimize, and secure their multi-cloud infrastructure. This guide covers
installation, configuration, and daily operations.

### 1.1 System Requirements

Before installing CloudManager Pro, ensure your system meets these requirements:
- Operating System: Ubuntu 20.04+, RHEL 8+, or Windows Server 2019+
- Memory: Minimum 16GB RAM, recommended 32GB for production
- Storage: 100GB SSD for application, additional storage for logs
- Network: Outbound HTTPS access to cloud provider APIs

### 1.2 Installation Steps

1. Download the installer from the customer portal
2. Run the installation script with root privileges
3. Configure the database connection
4. Set up the initial admin account
5. Verify the installation with the health check command

## Chapter 2: Dashboard Overview

The main dashboard provides a unified view of your cloud resources across all
connected providers. Key metrics displayed include:

### 2.1 Resource Metrics

- Total compute instances across all clouds
- Storage utilization and growth trends
- Network bandwidth consumption
- Cost breakdown by provider and project

### 2.2 Alert Summary

The alert panel shows active incidents categorized by severity:
- Critical: Immediate action required (service outages)
- Warning: Attention needed within 24 hours
- Info: Informational notifications

## Chapter 3: Cost Optimization

CloudManager Pro includes powerful cost optimization features that can reduce
your cloud spending by 20-40%.

### 3.1 Right-sizing Recommendations

The platform analyzes resource utilization patterns and suggests optimal
instance sizes. Recommendations are based on:
- CPU utilization over 14-day periods
- Memory usage patterns
- Network I/O requirements

### 3.2 Reserved Instance Planning

Use the RI Planner to identify opportunities for reserved instance purchases.
The tool calculates potential savings based on your usage patterns and
commitment preferences (1-year vs 3-year terms).

## Chapter 4: Security Features

### 4.1 Compliance Monitoring

CloudManager Pro continuously monitors your infrastructure against common
compliance frameworks including SOC 2, HIPAA, and PCI-DSS. Non-compliant
resources are flagged with remediation guidance.

### 4.2 Access Control

Role-based access control (RBAC) allows fine-grained permissions:
- Admin: Full access to all features
- Operator: Can view and modify resources
- Viewer: Read-only access to dashboards
- Billing: Access to cost and billing features only
"""

# A section starts at a newline that is followed by a level-2+ markdown
# header ("## " or "### "); the lookahead keeps the header in the section.
_SECTION_BOUNDARY = re.compile(r'\n(?=##+ )')

# Sections at or below this many characters (e.g. a bare header) are skipped.
_MIN_SECTION_CHARS = 50

# Maximum number of answer characters echoed per query in the RAG demo.
_ANSWER_PREVIEW_CHARS = 250


def chunk_by_paragraphs(text: str, min_length: int = 100, max_length: int = 500) -> list:
    """Split text into paragraph-based chunks.

    Paragraphs (separated by blank lines) are merged greedily while the
    combined length stays below ``max_length``.

    Args:
        text: The document to split.
        min_length: Chunks shorter than this many characters are discarded.
        max_length: A paragraph is merged into the current chunk only while
            ``len(chunk) + len(paragraph)`` is below this value.

    Returns:
        A list of ``{"content": str}`` dicts in document order.
    """
    chunks = []
    current_chunk = ""

    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue

        if len(current_chunk) + len(para) < max_length:
            current_chunk = f"{current_chunk}\n\n{para}" if current_chunk else para
        else:
            # The current chunk is full: keep it only if it is long enough,
            # then start a new chunk with this paragraph.
            if current_chunk and len(current_chunk) >= min_length:
                chunks.append({"content": current_chunk})
            current_chunk = para

    if current_chunk and len(current_chunk) >= min_length:
        chunks.append({"content": current_chunk})

    return chunks


def chunk_by_sections(text: str) -> list:
    """Split text by markdown headers (sections).

    The text is cut before every ``##`` / ``###`` header. Sections of 50
    characters or fewer (after stripping) are skipped.

    Args:
        text: Markdown document to split.

    Returns:
        A list of ``{"id": str, "content": str}`` dicts. ``id`` is the
        lower-cased header title with spaces replaced by underscores,
        truncated to 30 characters, or ``"untitled"`` when the section does
        not start with a header.
    """
    chunks = []

    for section in _SECTION_BOUNDARY.split(text):
        section = section.strip()
        if len(section) <= _MIN_SECTION_CHARS:
            continue

        heading = section.split('\n', 1)[0]
        if heading.startswith('#'):
            # Strip only the leading header markers so a '#' inside the
            # title itself (e.g. "C# Basics") is preserved.
            title = heading.lstrip('#').strip()
        else:
            title = "Untitled"

        chunks.append({
            "id": title.lower().replace(' ', '_')[:30],
            "content": section,
        })

    return chunks


def chunk_by_fixed_size(text: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """Split text into fixed-size chunks with overlap.

    Each chunk spans at most ``chunk_size`` characters. When a chunk is not
    the last one and contains a period in its second half, it is shortened
    to end right after that period. Consecutive chunks share ``overlap``
    characters.

    Args:
        text: The document to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of ``{"content": str}`` dicts; contents are stripped and
        empty chunks are omitted.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    text_length = len(text)
    start = 0

    while start < text_length:
        end = start + chunk_size
        chunk_text = text[start:end]

        # Try to break at sentence boundary
        if end < text_length:
            last_period = chunk_text.rfind('.')
            if last_period > chunk_size * 0.5:
                end = start + last_period + 1
                chunk_text = text[start:end]

        content = chunk_text.strip()
        if content:
            chunks.append({"content": content})

        # Once the end of the text is reached, any further chunk would be a
        # pure suffix of the one just emitted.
        if end >= text_length:
            break

        # A sentence break can make the chunk shorter than the overlap; in
        # that case drop the overlap so the window always moves forward.
        next_start = end - overlap
        start = next_start if next_start > start else end

    return chunks


def _average_chunk_size(chunks: list) -> int:
    """Return the mean content length in characters, or 0 for no chunks."""
    if not chunks:
        return 0
    return sum(len(chunk['content']) for chunk in chunks) // len(chunks)


def demonstrate_chunking_strategies():
    """Show different chunking approaches and their effects.

    Runs the three chunkers on ``PRODUCT_DOCUMENTATION`` and prints the
    chunk count and average chunk size for each.
    """
    print("=" * 60)
    print("CHUNKING STRATEGIES COMPARISON")
    print("=" * 60)

    # Strategy 1: Paragraph-based chunking
    para_chunks = chunk_by_paragraphs(PRODUCT_DOCUMENTATION)
    print(f"\nParagraph Chunking: {len(para_chunks)} chunks")
    print(f"   Average chunk size: {_average_chunk_size(para_chunks)} chars")
    if para_chunks:
        print(f"   Sample chunk: {para_chunks[0]['content'][:100]}...")

    # Strategy 2: Section-based chunking
    section_chunks = chunk_by_sections(PRODUCT_DOCUMENTATION)
    print(f"\nSection Chunking: {len(section_chunks)} chunks")
    print(f"   Average chunk size: {_average_chunk_size(section_chunks)} chars")
    print(f"   Sections: {[c['id'] for c in section_chunks[:5]]}...")

    # Strategy 3: Fixed-size chunking
    fixed_chunks = chunk_by_fixed_size(PRODUCT_DOCUMENTATION, chunk_size=400, overlap=50)
    print(f"\nFixed-Size Chunking (400 chars, 50 overlap): {len(fixed_chunks)} chunks")
    print(f"   Average chunk size: {_average_chunk_size(fixed_chunks)} chars")


def rag_with_section_chunks():
    """Demonstrate RAG using section-based chunks.

    Builds one context string from the section chunks of
    ``PRODUCT_DOCUMENTATION``, embeds it in the agent instructions, and
    prints the agent's answer to three sample queries.

    Raises:
        ImportError: If the ``praisonaiagents`` package is not installed.
    """
    if Agent is None:
        raise ImportError(
            "praisonaiagents is required for the RAG demo "
            "(pip install praisonaiagents)"
        )

    # Create section-based chunks
    chunks = chunk_by_sections(PRODUCT_DOCUMENTATION)

    # Build context from chunks, labelling each one with its section id
    context = "\n\n".join(f"[{c['id']}]\n{c['content']}" for c in chunks)

    instructions = (
        "You are a CloudManager Pro product expert.\n"
        "Answer questions using the product documentation.\n"
        "Be specific and reference relevant sections when helpful.\n"
        "\n"
        "PRODUCT DOCUMENTATION:\n"
        f"{context}"
    )

    # Create agent with chunked knowledge in instructions
    agent = Agent(
        name="Product Expert",
        instructions=instructions,
        output="silent"
    )

    queries = [
        "What are the system requirements for CloudManager Pro?",
        "How can I reduce cloud costs?",
        "What compliance frameworks are supported?"
    ]

    print("\n" + "=" * 60)
    print("RAG WITH SECTION-BASED CHUNKS")
    print("=" * 60)

    for query in queries:
        print(f"\nQuery: {query}")
        # Normalize once: the length test and the slice must operate on the
        # same string, and the response is not guaranteed here to be a str.
        answer = str(agent.chat(query))
        if len(answer) > _ANSWER_PREVIEW_CHARS:
            print(f"Answer: {answer[:_ANSWER_PREVIEW_CHARS]}...")
        else:
            print(f"Answer: {answer}")
        print("-" * 40)


def semantic_chunking_concept():
    """Explain semantic chunking concept (agent-driven).

    Prints a short description of three semantic chunking approaches; no
    chunking is performed.
    """
    print("\n" + "=" * 60)
    print("SEMANTIC CHUNKING CONCEPT")
    print("=" * 60)

    print("""
Semantic chunking goes beyond fixed rules by considering meaning:

1. **Sentence Embedding Similarity**
   - Compute embeddings for each sentence
   - Group sentences with similar embeddings
   - Split when similarity drops below threshold

2. **Topic-Based Chunking**
   - Identify topic shifts in the document
   - Create chunks that maintain topical coherence

3. **Agent-Driven Chunking**
   - Use an LLM to identify logical boundaries
   - Preserve context and relationships

PraisonAI's Knowledge system handles chunking automatically,
but understanding these strategies helps optimize retrieval quality.
""")


def main():
    """Run all chunking strategy examples."""
    print("\nPraisonAI Chunking Strategies Examples\n")

    # Example 1: Compare chunking strategies
    demonstrate_chunking_strategies()

    # Example 2: RAG with section chunks
    rag_with_section_chunks()

    # Example 3: Semantic chunking concept
    semantic_chunking_concept()

    print("\nChunking strategy examples completed!")


if __name__ == "__main__":
    main()