/ legacy / python / lib / 2_rag_example_simple.py
2_rag_example_simple.py
  1  """
  2  Simple RAG (Retrieval Augmented Generation) Example WITHOUT embeddings.
  3  This demonstrates semantic search using keyword matching when embeddings aren't available.
  4  """
  5  
  6  from langchain_community.llms import Ollama
  7  from langchain.text_splitter import RecursiveCharacterTextSplitter
  8  from langchain.schema import Document
  9  from langchain.prompts import PromptTemplate
 10  from typing import List
 11  import re
 12  
 13  # Initialize Ollama
 14  llm = Ollama(
 15      model="gpt-oss:120b",
 16      base_url="http://192.222.50.154:11434"
 17  )
 18  
 19  class SimpleRetriever:
 20      """A simple keyword-based retriever when embeddings aren't available."""
 21  
 22      def __init__(self, documents: List[Document]):
 23          self.documents = documents
 24  
 25      def retrieve(self, query: str, k: int = 3) -> List[Document]:
 26          """Retrieve most relevant documents using keyword matching."""
 27          query_terms = set(query.lower().split())
 28  
 29          # Score each document
 30          scores = []
 31          for doc in self.documents:
 32              content = doc.page_content.lower()
 33              # Simple scoring: count matching terms
 34              score = sum(1 for term in query_terms if term in content)
 35              scores.append((score, doc))
 36  
 37          # Sort by score and return top k
 38          scores.sort(reverse=True, key=lambda x: x[0])
 39          return [doc for _, doc in scores[:k]]
 40  
 41  def create_simple_rag_system(texts: List[str]):
 42      """
 43      Create a simple RAG system without embeddings.
 44      """
 45      # Create documents
 46      documents = [Document(page_content=text.strip()) for text in texts]
 47  
 48      # Split documents (optional for small docs)
 49      text_splitter = RecursiveCharacterTextSplitter(
 50          chunk_size=1000,
 51          chunk_overlap=200
 52      )
 53      splits = text_splitter.split_documents(documents)
 54  
 55      # Create simple retriever
 56      retriever = SimpleRetriever(splits)
 57  
 58      return retriever
 59  
 60  def query_documents(retriever: SimpleRetriever, query: str):
 61      """
 62      Query documents and generate answer.
 63      """
 64      # Retrieve relevant documents
 65      relevant_docs = retriever.retrieve(query, k=3)
 66  
 67      # Build context from retrieved documents
 68      context = "\n\n".join([doc.page_content for doc in relevant_docs])
 69  
 70      # Create prompt
 71      prompt_template = """Use the following context to answer the question. If you cannot answer based on the context, say so.
 72  
 73  Context:
 74  {context}
 75  
 76  Question: {question}
 77  
 78  Answer:"""
 79  
 80      prompt = prompt_template.format(context=context, question=query)
 81  
 82      # Get answer from LLM
 83      answer = llm.invoke(prompt)
 84  
 85      return {
 86          "result": answer,
 87          "source_documents": relevant_docs
 88      }
 89  
 90  if __name__ == "__main__":
 91      # Sample documents about a fictional company
 92      sample_docs = [
 93          """
 94          Acme Corporation is a technology company founded in 2020.
 95          We specialize in artificial intelligence and machine learning solutions.
 96          Our main products include AutoML Platform, AI Vision, and NLP Suite.
 97          """,
 98          """
 99          Acme Corporation has offices in San Francisco, New York, and London.
100          Our headquarters is located in San Francisco, California.
101          We have over 500 employees worldwide.
102          """,
103          """
104          Acme Corporation's AI Vision product can analyze images and videos
105          for object detection, facial recognition, and scene understanding.
106          It achieves 95% accuracy on standard benchmarks and processes
107          1000 images per second.
108          """,
109          """
110          The AutoML Platform by Acme Corporation allows users to build
111          machine learning models without writing code. It supports
112          classification, regression, and clustering tasks. The platform
113          has been used by over 10,000 companies.
114          """
115      ]
116  
117      print("Creating simple RAG system (no embeddings)...")
118      retriever = create_simple_rag_system(sample_docs)
119      print("RAG system created!\n")
120  
121      # Example queries
122      queries = [
123          "What products does Acme Corporation offer?",
124          "Where are Acme Corporation's offices located?",
125          "What can AI Vision do?"
126      ]
127  
128      for query in queries:
129          print("=" * 60)
130          print(f"Query: {query}")
131          print("=" * 60)
132          result = query_documents(retriever, query)
133          print(f"\nAnswer: {result['result']}\n")
134          print("Source documents used:")
135          for i, doc in enumerate(result['source_documents'], 1):
136              print(f"{i}. {doc.page_content[:100].strip()}...")
137          print()