# 2_rag_example_simple.py
"""
Simple RAG (Retrieval Augmented Generation) Example WITHOUT embeddings.
This demonstrates semantic search using keyword matching when embeddings aren't available.
"""

from langchain_community.llms import Ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from typing import List
import re

# Initialize Ollama
llm = Ollama(
    model="gpt-oss:120b",
    base_url="http://192.222.50.154:11434"
)


class SimpleRetriever:
    """A simple keyword-based retriever when embeddings aren't available."""

    def __init__(self, documents: List[Document]):
        # Documents to search over; scored lazily on each retrieve() call.
        self.documents = documents

    def retrieve(self, query: str, k: int = 3) -> List[Document]:
        """Retrieve the k most relevant documents using keyword matching.

        Both the query and each document are tokenized into lowercase word
        sets, so punctuation in the query (e.g. a trailing "?") does not
        prevent matches, and partial-word substrings (e.g. "a" occurring
        inside other words) do not inflate scores.

        Args:
            query: Free-text question to match against the documents.
            k: Maximum number of documents to return.

        Returns:
            The top-k documents ordered by descending keyword-overlap score.
        """
        # \w+ strips punctuation, so "offer?" still matches "offer".
        query_terms = set(re.findall(r"\w+", query.lower()))

        # Score each document: number of query terms present as whole words.
        scores = []
        for doc in self.documents:
            content_terms = set(re.findall(r"\w+", doc.page_content.lower()))
            score = len(query_terms & content_terms)
            scores.append((score, doc))

        # Stable descending sort keeps original document order among ties.
        scores.sort(reverse=True, key=lambda x: x[0])
        return [doc for _, doc in scores[:k]]


def create_simple_rag_system(texts: List[str]) -> SimpleRetriever:
    """Create a simple RAG system without embeddings.

    Args:
        texts: Raw document strings to index.

    Returns:
        A SimpleRetriever over the (chunked) documents.
    """
    # Create documents
    documents = [Document(page_content=text.strip()) for text in texts]

    # Split documents into overlapping chunks (effectively a no-op for docs
    # shorter than chunk_size).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )
    splits = text_splitter.split_documents(documents)

    # Create simple retriever
    retriever = SimpleRetriever(splits)

    return retriever


def query_documents(retriever: SimpleRetriever, query: str) -> dict:
    """Retrieve context for *query* and generate an answer with the LLM.

    Args:
        retriever: The keyword retriever to pull context from.
        query: The user's question.

    Returns:
        Dict with the LLM's answer under "result" and the retrieved
        Documents under "source_documents".
    """
    # Retrieve relevant documents
    relevant_docs = retriever.retrieve(query, k=3)

    # Build context from retrieved documents
    context = "\n\n".join([doc.page_content for doc in relevant_docs])

    # Create prompt
    prompt_template = """Use the following context to answer the question. If you cannot answer based on the context, say so.

Context:
{context}

Question: {question}

Answer:"""

    prompt = prompt_template.format(context=context, question=query)

    # Get answer from LLM
    answer = llm.invoke(prompt)

    return {
        "result": answer,
        "source_documents": relevant_docs
    }


if __name__ == "__main__":
    # Sample documents about a fictional company
    sample_docs = [
        """
        Acme Corporation is a technology company founded in 2020.
        We specialize in artificial intelligence and machine learning solutions.
        Our main products include AutoML Platform, AI Vision, and NLP Suite.
        """,
        """
        Acme Corporation has offices in San Francisco, New York, and London.
        Our headquarters is located in San Francisco, California.
        We have over 500 employees worldwide.
        """,
        """
        Acme Corporation's AI Vision product can analyze images and videos
        for object detection, facial recognition, and scene understanding.
        It achieves 95% accuracy on standard benchmarks and processes
        1000 images per second.
        """,
        """
        The AutoML Platform by Acme Corporation allows users to build
        machine learning models without writing code. It supports
        classification, regression, and clustering tasks. The platform
        has been used by over 10,000 companies.
        """
    ]

    print("Creating simple RAG system (no embeddings)...")
    retriever = create_simple_rag_system(sample_docs)
    print("RAG system created!\n")

    # Example queries
    queries = [
        "What products does Acme Corporation offer?",
        "Where are Acme Corporation's offices located?",
        "What can AI Vision do?"
    ]

    for query in queries:
        print("=" * 60)
        print(f"Query: {query}")
        print("=" * 60)
        result = query_documents(retriever, query)
        print(f"\nAnswer: {result['result']}\n")
        print("Source documents used:")
        for i, doc in enumerate(result['source_documents'], 1):
            print(f"{i}. {doc.page_content[:100].strip()}...")
        print()