Cradicle Explorer

/ archive / python-cli-final / kamaji / rag_tools.py
rag_tools.py
  1  """
  2  RAG tools for document retrieval in interactive modes.
  3  """
  4  
  5  from pathlib import Path
  6  from typing import List, Optional
  7  from langchain_core.documents import Document
  8  from langchain_text_splitters import RecursiveCharacterTextSplitter
  9  
 10  
 11  class SimpleRetriever:
 12      """A simple keyword-based retriever."""
 13  
 14      def __init__(self, documents: List[Document]):
 15          self.documents = documents
 16  
 17      def retrieve(self, query: str, k: int = 3) -> List[Document]:
 18          """Retrieve most relevant documents using keyword matching."""
 19          query_terms = set(query.lower().split())
 20  
 21          scores = []
 22          for doc in self.documents:
 23              content = doc.page_content.lower()
 24              score = sum(1 for term in query_terms if term in content)
 25              scores.append((score, doc))
 26  
 27          scores.sort(reverse=True, key=lambda x: x[0])
 28          return [doc for _, doc in scores[:k]]
 29  
 30  
 31  class DocumentStore:
 32      """Stores and retrieves documents for RAG."""
 33  
 34      def __init__(self):
 35          self.documents: List[Document] = []
 36          self.splits: List[Document] = []
 37          self.retriever: Optional[SimpleRetriever] = None
 38          self.loaded_files: List[str] = []
 39  
 40      def load_documents(self, file_paths: List[str]) -> int:
 41          """
 42          Load documents from files.
 43  
 44          Returns:
 45              Number of documents successfully loaded
 46          """
 47          loaded_count = 0
 48  
 49          for file_path in file_paths:
 50              path = Path(file_path)
 51  
 52              if not path.exists():
 53                  continue
 54  
 55              if not path.is_file():
 56                  continue
 57  
 58              try:
 59                  with open(path, 'r', encoding='utf-8') as f:
 60                      content = f.read()
 61                      self.documents.append(Document(
 62                          page_content=content,
 63                          metadata={"source": str(path), "filename": path.name}
 64                      ))
 65                      self.loaded_files.append(str(path))
 66                      loaded_count += 1
 67              except Exception:
 68                  pass
 69  
 70          if loaded_count > 0:
 71              self._build_index()
 72  
 73          return loaded_count
 74  
 75      def _build_index(self):
 76          """Build search index from loaded documents."""
 77          # Split documents
 78          text_splitter = RecursiveCharacterTextSplitter(
 79              chunk_size=1000,
 80              chunk_overlap=200
 81          )
 82          self.splits = text_splitter.split_documents(self.documents)
 83          self.retriever = SimpleRetriever(self.splits)
 84  
 85      def query(self, query: str, k: int = 3) -> str:
 86          """
 87          Query the document store.
 88  
 89          Args:
 90              query: Search query
 91              k: Number of results to return
 92  
 93          Returns:
 94              Formatted context from relevant documents
 95          """
 96          if not self.retriever:
 97              return "No documents loaded. Please load documents first."
 98  
 99          relevant_docs = self.retriever.retrieve(query, k=k)
100  
101          if not relevant_docs:
102              return "No relevant documents found for your query."
103  
104          # Build context
105          results = []
106          for i, doc in enumerate(relevant_docs, 1):
107              source = doc.metadata.get("filename", doc.metadata.get("source", "Unknown"))
108              results.append(f"[Document {i} - {source}]\n{doc.page_content}\n")
109  
110          return "\n".join(results)
111  
112      def list_documents(self) -> str:
113          """List all loaded documents."""
114          if not self.loaded_files:
115              return "No documents loaded."
116  
117          return f"Loaded {len(self.loaded_files)} document(s):\n" + "\n".join(
118              f"  • {Path(f).name}" for f in self.loaded_files
119          )
120  
121      def has_documents(self) -> bool:
122          """Check if any documents are loaded."""
123          return len(self.documents) > 0
124  
125  
def create_rag_tool(doc_store: DocumentStore, k: int = 3):
    """
    Create a RAG query tool function for the agent.

    Args:
        doc_store: DocumentStore instance the tool searches
        k: Number of results requested from the store on each call
            (defaults to 3)

    Returns:
        Function that can be used as an agent tool; it takes a query string
        and returns the store's formatted excerpts, or a short message when
        the store holds no documents
    """
    # NOTE: the inner function's name, signature and docstring are kept
    # as-is because a tool wrapper may expose them to the agent.
    def query_documents(query: str) -> str:
        """
        Search through loaded documents for relevant information.
        Use this when you need to find information from the documents provided by the user.

        Args:
            query: The search query or question

        Returns:
            Relevant document excerpts
        """
        if not doc_store.has_documents():
            return "No documents are currently loaded. Ask the user to provide documents."

        return doc_store.query(query, k=k)

    return query_documents