Cradicle Explorer

/ examples / rag / chunking_strategies.py
chunking_strategies.py
  1  """
  2  Chunking Strategies for RAG
  3  
  4  This example demonstrates different approaches to splitting documents into
  5  chunks for effective retrieval.
  6  
  7  RAG Concept: Chunking is critical for RAG quality. Too large chunks dilute
  8  relevance; too small chunks lose context. The right strategy depends on
  9  your content type and query patterns.
 10  """
 11  
 12  from praisonaiagents import Agent
 13  
# Sample long document shared by the chunking examples in this file.
# It is a markdown-formatted product guide: one "#" title, "##" chapter
# headers and "###" sub-section headers, with short paragraphs and bullet
# lists separated by blank lines. That structure is what the paragraph-,
# section- and fixed-size chunkers below operate on.
PRODUCT_DOCUMENTATION = """
# CloudManager Pro - Complete User Guide

## Chapter 1: Getting Started

CloudManager Pro is an enterprise cloud management platform that helps organizations
monitor, optimize, and secure their multi-cloud infrastructure. This guide covers
installation, configuration, and daily operations.

### 1.1 System Requirements

Before installing CloudManager Pro, ensure your system meets these requirements:
- Operating System: Ubuntu 20.04+, RHEL 8+, or Windows Server 2019+
- Memory: Minimum 16GB RAM, recommended 32GB for production
- Storage: 100GB SSD for application, additional storage for logs
- Network: Outbound HTTPS access to cloud provider APIs

### 1.2 Installation Steps

1. Download the installer from the customer portal
2. Run the installation script with root privileges
3. Configure the database connection
4. Set up the initial admin account
5. Verify the installation with the health check command

## Chapter 2: Dashboard Overview

The main dashboard provides a unified view of your cloud resources across all
connected providers. Key metrics displayed include:

### 2.1 Resource Metrics

- Total compute instances across all clouds
- Storage utilization and growth trends
- Network bandwidth consumption
- Cost breakdown by provider and project

### 2.2 Alert Summary

The alert panel shows active incidents categorized by severity:
- Critical: Immediate action required (service outages)
- Warning: Attention needed within 24 hours
- Info: Informational notifications

## Chapter 3: Cost Optimization

CloudManager Pro includes powerful cost optimization features that can reduce
your cloud spending by 20-40%.

### 3.1 Right-sizing Recommendations

The platform analyzes resource utilization patterns and suggests optimal
instance sizes. Recommendations are based on:
- CPU utilization over 14-day periods
- Memory usage patterns
- Network I/O requirements

### 3.2 Reserved Instance Planning

Use the RI Planner to identify opportunities for reserved instance purchases.
The tool calculates potential savings based on your usage patterns and
commitment preferences (1-year vs 3-year terms).

## Chapter 4: Security Features

### 4.1 Compliance Monitoring

CloudManager Pro continuously monitors your infrastructure against common
compliance frameworks including SOC 2, HIPAA, and PCI-DSS. Non-compliant
resources are flagged with remediation guidance.

### 4.2 Access Control

Role-based access control (RBAC) allows fine-grained permissions:
- Admin: Full access to all features
- Operator: Can view and modify resources
- Viewer: Read-only access to dashboards
- Billing: Access to cost and billing features only
"""
 94  
 95  
 96  def chunk_by_paragraphs(text: str, min_length: int = 100) -> list:
 97      """Split text into paragraph-based chunks."""
 98      paragraphs = text.split("\n\n")
 99      chunks = []
100      current_chunk = ""
101      
102      for para in paragraphs:
103          para = para.strip()
104          if not para:
105              continue
106              
107          if len(current_chunk) + len(para) < 500:
108              current_chunk += "\n\n" + para if current_chunk else para
109          else:
110              if len(current_chunk) >= min_length:
111                  chunks.append({"content": current_chunk.strip()})
112              current_chunk = para
113      
114      if current_chunk and len(current_chunk) >= min_length:
115          chunks.append({"content": current_chunk.strip()})
116      
117      return chunks
118  
119  
def chunk_by_sections(text: str) -> list:
    """Split text by markdown headers (sections).

    The text is cut immediately before every line that starts with two or
    more ``#`` characters followed by a space (``##``, ``###``, ...). A
    single-``#`` title line does not start a new section. Sections whose
    stripped length is 50 characters or less are skipped, which also drops
    header-only sections.

    Args:
        text: Markdown document to split.

    Returns:
        A list of dicts in document order, each with:
        ``"id"``: the section title lower-cased, spaces replaced by ``_`` and
        truncated to 30 characters (``"untitled"`` when the section does not
        begin with a header line). Ids are not guaranteed to be unique.
        ``"content"``: the stripped section text, header line included.
    """
    import re

    chunks = []

    for section in re.split(r'\n(?=##+ )', text):
        section = section.strip()
        if len(section) <= 50:  # Skip very short sections
            continue

        first_line = section.split('\n', 1)[0]
        if first_line.startswith('#'):
            # Remove only the leading header markers and an optional closing
            # "##" sequence; a '#' inside the title itself (e.g. "C#") stays.
            title = re.sub(r'^#+\s*|\s+#+$', '', first_line).strip()
        else:
            title = "Untitled"

        chunks.append({
            "id": title.lower().replace(' ', '_')[:30],
            "content": section,
        })

    return chunks
140  
141  
def chunk_by_fixed_size(text: str, chunk_size: int = 500, overlap: int = 50) -> list:
    """Split text into fixed-size chunks with overlap.

    Each chunk spans up to ``chunk_size`` characters. When more text follows
    and the window contains a ``.`` in its second half, the chunk is cut just
    after that period so it ends on a sentence boundary. The next chunk starts
    ``overlap`` characters before the end of the previous one.

    Args:
        text: Document to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of ``{"content": str}`` dicts in document order. Each content
        string is stripped of surrounding whitespace.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (either would prevent the window
            from ever advancing).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    text_len = len(text)
    start = 0

    while start < text_len:
        end = start + chunk_size

        # Try to break at sentence boundary
        if end < text_len:
            last_period = text[start:end].rfind('.')
            # Only accept the boundary if it keeps more than half a chunk AND
            # the shortened chunk is still longer than the overlap; otherwise
            # the next start would not move forward.
            if last_period > chunk_size * 0.5 and last_period + 1 > overlap:
                end = start + last_period + 1

        chunks.append({"content": text[start:end].strip()})

        # Once the window has reached the end of the text, stop: stepping
        # back by `overlap` would only re-emit a tail already contained in
        # the chunk just appended.
        if end >= text_len:
            break
        start = end - overlap

    return chunks
162  
163  
def demonstrate_chunking_strategies():
    """Run the three chunkers on the sample document and print a summary.

    For each strategy the chunk count and the integer average chunk length
    are printed; the paragraph strategy also shows the first 100 characters
    of its first chunk, and the section strategy lists up to five section ids.
    """
    divider = "=" * 60
    print(divider)
    print("CHUNKING STRATEGIES COMPARISON")
    print(divider)

    # Strategy 1: Paragraph-based chunking
    by_paragraph = chunk_by_paragraphs(PRODUCT_DOCUMENTATION)
    print(f"\n📄 Paragraph Chunking: {len(by_paragraph)} chunks")
    paragraph_total = sum(len(item['content']) for item in by_paragraph)
    print(f"   Average chunk size: {paragraph_total // len(by_paragraph)} chars")
    preview = by_paragraph[0]['content'][:100]
    print(f"   Sample chunk: {preview}...")

    # Strategy 2: Section-based chunking
    by_section = chunk_by_sections(PRODUCT_DOCUMENTATION)
    print(f"\n📑 Section Chunking: {len(by_section)} chunks")
    section_total = sum(len(item['content']) for item in by_section)
    print(f"   Average chunk size: {section_total // len(by_section)} chars")
    first_ids = [item['id'] for item in by_section[:5]]
    print(f"   Sections: {first_ids}...")

    # Strategy 3: Fixed-size chunking
    by_size = chunk_by_fixed_size(PRODUCT_DOCUMENTATION, chunk_size=400, overlap=50)
    print(f"\n📏 Fixed-Size Chunking (400 chars, 50 overlap): {len(by_size)} chunks")
    size_total = sum(len(item['content']) for item in by_size)
    print(f"   Average chunk size: {size_total // len(by_size)} chars")
187  
188  
def rag_with_section_chunks():
    """Demonstrate RAG using section-based chunks.

    The sample documentation is split with ``chunk_by_sections`` and every
    resulting chunk, prefixed with its ``[id]``, is concatenated into the
    agent's instructions. No retrieval step selects among the chunks; the
    whole chunked document is given to the agent. Three fixed queries are
    then sent through ``agent.chat`` and each answer is printed, truncated
    to 250 characters when longer.
    """

    # Create section-based chunks
    chunks = chunk_by_sections(PRODUCT_DOCUMENTATION)

    # Build context from chunks
    context = "\n\n".join([f"[{c['id']}]\n{c['content']}" for c in chunks])

    # Create agent with chunked knowledge in instructions
    # NOTE(review): output="silent" presumably suppresses the agent's own
    # console output so only the prints below are shown; confirm against
    # the praisonaiagents documentation.
    agent = Agent(
        name="Product Expert",
        instructions=f"""You are a CloudManager Pro product expert.
        Answer questions using the product documentation.
        Be specific and reference relevant sections when helpful.
        
        PRODUCT DOCUMENTATION:
        {context}""",
        output="silent"
    )

    queries = [
        "What are the system requirements for CloudManager Pro?",
        "How can I reduce cloud costs?",
        "What compliance frameworks are supported?"
    ]

    print("\n" + "=" * 60)
    print("RAG WITH SECTION-BASED CHUNKS")
    print("=" * 60)

    for query in queries:
        print(f"\n📝 Query: {query}")
        response = agent.chat(query)
        # Normalise to text once so the length check and the slice operate on
        # the same value; slicing a non-string response would raise TypeError.
        answer = str(response)
        if len(answer) > 250:
            print(f"💡 Answer: {answer[:250]}...")
        else:
            print(f"💡 Answer: {answer}")
        print("-" * 40)
225  
226  
def semantic_chunking_concept():
    """Print a short, static overview of semantic chunking approaches.

    Nothing is computed here: the function writes a banner followed by a
    fixed explanatory text to standard output.
    """
    banner = "=" * 60
    overview = """
    Semantic chunking goes beyond fixed rules by considering meaning:
    
    1. **Sentence Embedding Similarity**
       - Compute embeddings for each sentence
       - Group sentences with similar embeddings
       - Split when similarity drops below threshold
    
    2. **Topic-Based Chunking**
       - Identify topic shifts in the document
       - Create chunks that maintain topical coherence
    
    3. **Agent-Driven Chunking**
       - Use an LLM to identify logical boundaries
       - Preserve context and relationships
    
    PraisonAI's Knowledge system handles chunking automatically,
    but understanding these strategies helps optimize retrieval quality.
    """

    print("\n" + banner)
    print("SEMANTIC CHUNKING CONCEPT")
    print(banner)
    print(overview)
253  
254  
def main():
    """Run every chunking example in order, framed by start and end banners."""
    print("\n🚀 PraisonAI Chunking Strategies Examples\n")

    examples = (
        demonstrate_chunking_strategies,  # Example 1: Compare chunking strategies
        rag_with_section_chunks,          # Example 2: RAG with section chunks
        semantic_chunking_concept,        # Example 3: Semantic chunking concept
    )
    for run_example in examples:
        run_example()

    print("\n✅ Chunking strategy examples completed!")
269  
270  
# Run the examples only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()