# rag_tools.py
"""
RAG tools for document retrieval in interactive modes.
"""

import logging
from pathlib import Path
from typing import Callable, List, Optional

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

logger = logging.getLogger(__name__)


class SimpleRetriever:
    """A simple keyword-based retriever.

    Each document is scored by how many distinct query terms occur in its
    text. Matching is a case-insensitive *substring* test on
    whitespace-separated query terms, so "cat" also matches "category" and
    punctuation attached to a term (e.g. "rag?") is part of the term.
    """

    def __init__(self, documents: List[Document]):
        self.documents = documents

    def retrieve(self, query: str, k: int = 3) -> List[Document]:
        """Retrieve the most relevant documents using keyword matching.

        Args:
            query: Free-text search query.
            k: Maximum number of documents to return.

        Returns:
            Up to ``k`` documents that contain at least one query term,
            ordered by descending match count. Ties keep the original
            document order. The list is empty when ``k`` is not positive,
            the query has no terms, or nothing matches.
        """
        # A negative k would otherwise slice "all but the last |k|" results.
        if k <= 0:
            return []

        query_terms = set(query.lower().split())
        if not query_terms:
            return []

        scored = []
        for doc in self.documents:
            content = doc.page_content.lower()
            score = sum(1 for term in query_terms if term in content)
            # Zero-score documents are dropped so callers can tell
            # "nothing matched" apart from "here are k matching chunks".
            if score > 0:
                scored.append((score, doc))

        # list.sort is stable (also with reverse=True), so equally scored
        # documents stay in their original order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in scored[:k]]


class DocumentStore:
    """Stores and retrieves documents for RAG."""

    def __init__(self):
        # Full documents exactly as read from disk.
        self.documents: List[Document] = []
        # Chunks produced by the text splitter; this is what gets searched.
        self.splits: List[Document] = []
        # Stays None until at least one document has been loaded.
        self.retriever: Optional[SimpleRetriever] = None
        # String paths of the successfully loaded files, in load order.
        self.loaded_files: List[str] = []

    def load_documents(self, file_paths: List[str]) -> int:
        """
        Load documents from files.

        Loading is best-effort: paths that are not existing regular files,
        or that cannot be read as UTF-8 text, are skipped with a logged
        warning instead of aborting the whole batch. Documents accumulate
        across calls, and the index is rebuilt from all of them whenever a
        call loads at least one new file.

        Args:
            file_paths: Paths of the files to load.

        Returns:
            Number of documents successfully loaded
        """
        loaded_count = 0

        for file_path in file_paths:
            path = Path(file_path)

            # is_file() is already False for a missing path, so one check
            # covers both "does not exist" and "is a directory".
            if not path.is_file():
                logger.warning("Skipping %s: not an existing regular file", path)
                continue

            try:
                content = path.read_text(encoding="utf-8")
            except (OSError, UnicodeError) as exc:
                # Unreadable or non-UTF-8 file: skip it, but say why.
                logger.warning("Skipping %s: %s", path, exc)
                continue

            self.documents.append(Document(
                page_content=content,
                metadata={"source": str(path), "filename": path.name},
            ))
            self.loaded_files.append(str(path))
            loaded_count += 1

        if loaded_count > 0:
            self._build_index()

        return loaded_count

    def _build_index(self):
        """Build search index from loaded documents."""
        # Split every loaded document into overlapping chunks, then search
        # over the chunks rather than over whole files.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        self.splits = text_splitter.split_documents(self.documents)
        self.retriever = SimpleRetriever(self.splits)

    def query(self, query: str, k: int = 3) -> str:
        """
        Query the document store.

        Args:
            query: Search query
            k: Number of results to return

        Returns:
            Formatted context from relevant documents, or an explanatory
            message when nothing is loaded or nothing matches.
        """
        if not self.retriever:
            return "No documents loaded. Please load documents first."

        relevant_docs = self.retriever.retrieve(query, k=k)

        if not relevant_docs:
            return "No relevant documents found for your query."

        # Build context: one labelled section per retrieved chunk.
        results = []
        for i, doc in enumerate(relevant_docs, 1):
            source = doc.metadata.get("filename", doc.metadata.get("source", "Unknown"))
            results.append(f"[Document {i} - {source}]\n{doc.page_content}\n")

        return "\n".join(results)

    def list_documents(self) -> str:
        """List all loaded documents."""
        if not self.loaded_files:
            return "No documents loaded."

        return f"Loaded {len(self.loaded_files)} document(s):\n" + "\n".join(
            f" • {Path(f).name}" for f in self.loaded_files
        )

    def has_documents(self) -> bool:
        """Check if any documents are loaded."""
        return bool(self.documents)


def create_rag_tool(doc_store: DocumentStore) -> Callable[[str], str]:
    """
    Create a RAG query tool function for the agent.

    Args:
        doc_store: DocumentStore instance

    Returns:
        Function that can be used as an agent tool
    """
    def query_documents(query: str) -> str:
        """
        Search through loaded documents for relevant information.
        Use this when you need to find information from the documents provided by the user.

        Args:
            query: The search query or question

        Returns:
            Relevant document excerpts
        """
        if not doc_store.has_documents():
            return "No documents are currently loaded. Ask the user to provide documents."

        return doc_store.query(query, k=3)

    return query_documents