Cradicle Explorer

/ examples / knowledge / large_corpus_strategy_demo.py
large_corpus_strategy_demo.py
  1  #!/usr/bin/env python3
  2  """
  3  Example: Large Corpus Strategy Selection Demo
  4  
  5  This example demonstrates:
  6  1. Automatic strategy selection based on corpus size
  7  2. Different retrieval strategies (DIRECT, BASIC, HYBRID, RERANKED, COMPRESSED)
  8  3. Agent-centric knowledge retrieval with unique codes
  9  
 10  Requirements:
 11  - pip install praisonaiagents[knowledge]
 12  - OPENAI_API_KEY environment variable
 13  
 14  Usage:
 15      python large_corpus_strategy_demo.py
 16  """
 17  
 18  import os
 19  import tempfile
 20  import shutil
 21  
 22  from praisonaiagents import Agent
 23  
 24  
 25  # Unique verification codes that CANNOT be guessed
 26  VERIFICATION_CODES = {
 27      "project_alpha": "ALPHA-7X9K2",
 28      "project_beta": "BETA-3M8N1",
 29      "project_gamma": "GAMMA-5P2Q7",
 30      "budget_code": "BUDGET-9R4T6",
 31      "security_clearance": "CLEARANCE-2W8Y3",
 32  }
 33  
 34  
 35  def create_test_corpus(temp_dir: str, num_files: int = 10) -> list:
 36      """Create a test corpus with unique codes in each file."""
 37      files = []
 38      
 39      # Create main project files with unique codes
 40      for i, (project, code) in enumerate(VERIFICATION_CODES.items()):
 41          filepath = os.path.join(temp_dir, f"{project}_doc.txt")
 42          with open(filepath, 'w') as f:
 43              f.write(f"""
 44  {project.upper().replace('_', ' ')} Documentation
 45  {'=' * 50}
 46  
 47  Project Overview:
 48  This document contains confidential information about {project}.
 49  The verification code for this project is: {code}
 50  
 51  Key Details:
 52  - Project started: 2024-0{i+1}-15
 53  - Team size: {10 + i * 5} members
 54  - Status: Active
 55  - Priority: {'High' if i < 2 else 'Medium'}
 56  
 57  Access Requirements:
 58  To access this project, use verification code: {code}
 59  All access attempts are logged for security purposes.
 60  
 61  Last updated: 2024-12-01
 62              """)
 63          files.append(filepath)
 64      
 65      # Create additional filler files to simulate larger corpus
 66      for i in range(num_files - len(VERIFICATION_CODES)):
 67          filepath = os.path.join(temp_dir, f"general_doc_{i}.txt")
 68          with open(filepath, 'w') as f:
 69              f.write(f"""
 70  General Documentation File {i}
 71  {'=' * 40}
 72  
 73  This is a general documentation file containing various information
 74  about company processes and procedures.
 75  
 76  Section {i}.1: Overview
 77  Lorem ipsum dolor sit amet, consectetur adipiscing elit.
 78  Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
 79  
 80  Section {i}.2: Procedures
 81  Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.
 82  Duis aute irure dolor in reprehenderit in voluptate velit esse.
 83  
 84  Section {i}.3: Guidelines
 85  Excepteur sint occaecat cupidatat non proident, sunt in culpa.
 86  Qui officia deserunt mollit anim id est laborum.
 87  
 88  Document ID: DOC-{1000 + i}
 89              """)
 90          files.append(filepath)
 91      
 92      return files
 93  
 94  
 95  def main():
 96      temp_dir = tempfile.mkdtemp(prefix='praison_strategy_')
 97      
 98      try:
 99          print("=" * 60)
100          print("Example: Large Corpus Strategy Selection Demo")
101          print("=" * 60)
102          
103          # Create test corpus
104          files = create_test_corpus(temp_dir, num_files=15)
105          print(f"\nCreated {len(files)} test documents in: {temp_dir}")
106          print(f"Unique verification codes embedded: {len(VERIFICATION_CODES)}")
107          
108          # Show strategy selection based on corpus size
109          print("\n" + "-" * 40)
110          print("Strategy Selection Demo")
111          print("-" * 40)
112          
113          try:
114              from praisonaiagents.rag import select_strategy, RetrievalStrategy
115              from praisonaiagents.knowledge.indexing import CorpusStats
116              
117              # Get corpus stats
118              stats = CorpusStats.from_directory(temp_dir)
119              print(f"\nCorpus Statistics:")
120              print(f"  Files: {stats.file_count}")
121              print(f"  Estimated tokens: {stats.total_tokens}")
122              print(f"  Recommended strategy: {stats.strategy_recommendation}")
123              
124              # Show strategy selection for different corpus sizes
125              print(f"\nStrategy selection by corpus size:")
126              for size in [100, 1000, 10000, 50000, 100000]:
127                  strategy = select_strategy(corpus_tokens=size)
128                  print(f"  {size:>6} tokens -> {strategy.value}")
129                  
130          except ImportError as e:
131              print(f"Note: Strategy module not available: {e}")
132          
133          # Create agent with knowledge
134          print("\n" + "-" * 40)
135          print("Agent Knowledge Retrieval Test")
136          print("-" * 40)
137          
138          agent = Agent(
139              name="ProjectExpert",
140              instructions="""You are a project documentation expert.
141  Answer questions based ONLY on the provided knowledge context.
142  When asked about verification codes, provide the EXACT code from the documents.
143  If information is not in the context, say 'Information not found in documents.'""",
144              knowledge=[temp_dir],
145              user_id="strategy_demo_user",
146              output="verbose",  # Use new consolidated param
147          )
148          
149          # Test retrieval with unique codes
150          test_questions = [
151              ("What is the verification code for Project Alpha?", "ALPHA-7X9K2"),
152              ("What is the budget code?", "BUDGET-9R4T6"),
153              ("What is the security clearance code?", "CLEARANCE-2W8Y3"),
154          ]
155          
156          print("\nTesting retrieval of unique codes:\n")
157          
158          for question, expected_code in test_questions:
159              print(f"Q: {question}")
160              response = agent.chat(question)
161              print(f"A: {response[:200]}...")
162              
163              if expected_code in response.upper():
164                  print(f"✅ VERIFIED: Found code {expected_code}\n")
165              else:
166                  print(f"❌ WARNING: Expected code {expected_code} not found\n")
167          
168          print("=" * 60)
169          print("Demo Complete")
170          print("=" * 60)
171          
172      finally:
173          shutil.rmtree(temp_dir, ignore_errors=True)
174  
175  
176  if __name__ == "__main__":
177      main()