#!/usr/bin/env python3
# large_corpus_strategy_demo.py
"""
Example: Large Corpus Strategy Selection Demo

This example demonstrates:
1. Automatic strategy selection based on corpus size
2. Different retrieval strategies (DIRECT, BASIC, HYBRID, RERANKED, COMPRESSED)
3. Agent-centric knowledge retrieval with unique codes

Requirements:
- pip install praisonaiagents[knowledge]
- OPENAI_API_KEY environment variable

Usage:
    python large_corpus_strategy_demo.py
"""

import os
import shutil
import tempfile


# Unique verification codes that CANNOT be guessed
VERIFICATION_CODES = {
    "project_alpha": "ALPHA-7X9K2",
    "project_beta": "BETA-3M8N1",
    "project_gamma": "GAMMA-5P2Q7",
    "budget_code": "BUDGET-9R4T6",
    "security_clearance": "CLEARANCE-2W8Y3",
}


def create_test_corpus(temp_dir: str, num_files: int = 10) -> list:
    """Write the demo corpus into ``temp_dir`` and return the created paths.

    One ``<key>_doc.txt`` file is written for every entry in
    ``VERIFICATION_CODES`` (each file embeds its code twice), followed by
    ``general_doc_<n>.txt`` filler files until ``num_files`` files exist.

    Args:
        temp_dir: Existing directory to write the files into.
        num_files: Total number of files wanted. Values smaller than
            ``len(VERIFICATION_CODES)`` still produce every project file and
            simply add no filler.

    Returns:
        Paths (``str``) of all files written, project files first.
    """
    files = []

    # Create main project files with unique codes
    for i, (project, code) in enumerate(VERIFICATION_CODES.items()):
        # Wrap to 1..12 and zero-pad so the start date stays a valid
        # calendar date however many codes are configured.
        month = (i % 12) + 1
        filepath = os.path.join(temp_dir, f"{project}_doc.txt")
        # Explicit encoding: the default depends on the platform locale.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"""
{project.upper().replace('_', ' ')} Documentation
{'=' * 50}

Project Overview:
This document contains confidential information about {project}.
The verification code for this project is: {code}

Key Details:
- Project started: 2024-{month:02d}-15
- Team size: {10 + i * 5} members
- Status: Active
- Priority: {'High' if i < 2 else 'Medium'}

Access Requirements:
To access this project, use verification code: {code}
All access attempts are logged for security purposes.

Last updated: 2024-12-01
""")
        files.append(filepath)

    # Create additional filler files to simulate larger corpus
    filler_count = max(0, num_files - len(VERIFICATION_CODES))
    for i in range(filler_count):
        filepath = os.path.join(temp_dir, f"general_doc_{i}.txt")
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(f"""
General Documentation File {i}
{'=' * 40}

This is a general documentation file containing various information
about company processes and procedures.

Section {i}.1: Overview
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

Section {i}.2: Procedures
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.
Duis aute irure dolor in reprehenderit in voluptate velit esse.

Section {i}.3: Guidelines
Excepteur sint occaecat cupidatat non proident, sunt in culpa.
Qui officia deserunt mollit anim id est laborum.

Document ID: DOC-{1000 + i}
""")
        files.append(filepath)

    return files


def main():
    """Run the demo end to end inside a throwaway temporary directory.

    Steps: build a 15-file corpus, print corpus statistics and the strategy
    chosen for several corpus sizes (skipped with a note when the strategy
    modules cannot be imported), then ask an agent three questions and report
    whether each reply contains the expected verification code. The temporary
    directory is removed on exit, even if a step raises.
    """
    # Imported here, like the optional rag modules below, so that
    # create_test_corpus() can be imported without the third-party package.
    # Done before any work so a missing install fails immediately.
    from praisonaiagents import Agent

    temp_dir = tempfile.mkdtemp(prefix='praison_strategy_')

    try:
        print("=" * 60)
        print("Example: Large Corpus Strategy Selection Demo")
        print("=" * 60)

        # Create test corpus
        files = create_test_corpus(temp_dir, num_files=15)
        print(f"\nCreated {len(files)} test documents in: {temp_dir}")
        print(f"Unique verification codes embedded: {len(VERIFICATION_CODES)}")

        # Show strategy selection based on corpus size
        print("\n" + "-" * 40)
        print("Strategy Selection Demo")
        print("-" * 40)

        try:
            from praisonaiagents.rag import select_strategy
            from praisonaiagents.knowledge.indexing import CorpusStats

            # Get corpus stats
            stats = CorpusStats.from_directory(temp_dir)
            print("\nCorpus Statistics:")
            print(f"  Files: {stats.file_count}")
            print(f"  Estimated tokens: {stats.total_tokens}")
            print(f"  Recommended strategy: {stats.strategy_recommendation}")

            # Show strategy selection for different corpus sizes
            print("\nStrategy selection by corpus size:")
            for size in [100, 1000, 10000, 50000, 100000]:
                strategy = select_strategy(corpus_tokens=size)
                print(f"  {size:>6} tokens -> {strategy.value}")

        except ImportError as e:
            # The strategy preview is optional; the retrieval test still runs.
            print(f"Note: Strategy module not available: {e}")

        # Create agent with knowledge
        print("\n" + "-" * 40)
        print("Agent Knowledge Retrieval Test")
        print("-" * 40)

        agent = Agent(
            name="ProjectExpert",
            instructions="""You are a project documentation expert.
            Answer questions based ONLY on the provided knowledge context.
            When asked about verification codes, provide the EXACT code from the documents.
            If information is not in the context, say 'Information not found in documents.'""",
            knowledge=[temp_dir],
            user_id="strategy_demo_user",
            output="verbose",  # Use new consolidated param
        )

        # Test retrieval with unique codes. Expected values are read from
        # VERIFICATION_CODES so they cannot drift from what the corpus holds.
        test_questions = [
            (
                "What is the verification code for Project Alpha?",
                VERIFICATION_CODES["project_alpha"],
            ),
            ("What is the budget code?", VERIFICATION_CODES["budget_code"]),
            (
                "What is the security clearance code?",
                VERIFICATION_CODES["security_clearance"],
            ),
        ]

        print("\nTesting retrieval of unique codes:\n")

        preview_chars = 200
        for question, expected_code in test_questions:
            print(f"Q: {question}")
            response = agent.chat(question)
            # Normalise before slicing/upper-casing: nothing in this script
            # guarantees chat() hands back a str (a None reply would
            # otherwise raise here).
            answer = "" if response is None else str(response)
            # Only show an ellipsis when the preview was actually cut short.
            ellipsis = "..." if len(answer) > preview_chars else ""
            print(f"A: {answer[:preview_chars]}{ellipsis}")

            # Case-insensitive match on the code.
            if expected_code.upper() in answer.upper():
                print(f"✅ VERIFIED: Found code {expected_code}\n")
            else:
                print(f"❌ WARNING: Expected code {expected_code} not found\n")

        print("=" * 60)
        print("Demo Complete")
        print("=" * 60)

    finally:
        # Best-effort cleanup; never mask an error raised above.
        shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == "__main__":
    main()