# archive/python/2_rag_example.py
  1  """
  2  RAG (Retrieval Augmented Generation) Example.
  3  This demonstrates how to load documents, create embeddings, and query them.
  4  """
  5  
  6  from langchain_community.llms import Ollama
  7  from langchain.text_splitter import RecursiveCharacterTextSplitter
  8  from langchain_community.vectorstores import Chroma
  9  from langchain_community.embeddings import OllamaEmbeddings
 10  from langchain.chains import RetrievalQA
 11  from langchain.schema import Document
 12  
# Initialize Ollama
# NOTE(review): hard-coded host IP — presumably a lab/dev server; confirm before reuse.
llm = Ollama(
    model="gpt-oss:120b",
    base_url="http://192.222.50.154:11434"
)

# Initialize embeddings (using a smaller model for speed)
# Note: You can change this to use OpenAI embeddings or other providers
# NOTE(review): the comment above says "smaller model", but this uses the same
# gpt-oss:120b model as the LLM — confirm whether a dedicated embedding model
# (e.g. nomic-embed-text) was intended.
embeddings = OllamaEmbeddings(
    model="gpt-oss:120b",
    base_url="http://192.222.50.154:11434"
)
 25  
 26  def create_vector_db_from_text(texts: list[str]):
 27      """
 28      Create a vector database from a list of text strings.
 29      """
 30      # Create documents
 31      documents = [Document(page_content=text) for text in texts]
 32  
 33      # Split documents into chunks
 34      text_splitter = RecursiveCharacterTextSplitter(
 35          chunk_size=1000,
 36          chunk_overlap=200
 37      )
 38      splits = text_splitter.split_documents(documents)
 39  
 40      # Create vector store
 41      vectorstore = Chroma.from_documents(
 42          documents=splits,
 43          embedding=embeddings,
 44          collection_name="my_collection"
 45      )
 46  
 47      return vectorstore
 48  
 49  def query_documents(vectorstore, query: str):
 50      """
 51      Query the vector database and get an answer.
 52      """
 53      qa_chain = RetrievalQA.from_chain_type(
 54          llm=llm,
 55          chain_type="stuff",
 56          retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
 57          return_source_documents=True
 58      )
 59  
 60      result = qa_chain({"query": query})
 61      return result
 62  
 63  if __name__ == "__main__":
 64      # Sample documents about a fictional company
 65      sample_docs = [
 66          """
 67          Acme Corporation is a technology company founded in 2020.
 68          We specialize in artificial intelligence and machine learning solutions.
 69          Our main products include AutoML Platform, AI Vision, and NLP Suite.
 70          """,
 71          """
 72          Acme Corporation has offices in San Francisco, New York, and London.
 73          Our headquarters is located in San Francisco, California.
 74          We have over 500 employees worldwide.
 75          """,
 76          """
 77          Acme Corporation's AI Vision product can analyze images and videos
 78          for object detection, facial recognition, and scene understanding.
 79          It achieves 95% accuracy on standard benchmarks and processes
 80          1000 images per second.
 81          """,
 82          """
 83          The AutoML Platform by Acme Corporation allows users to build
 84          machine learning models without writing code. It supports
 85          classification, regression, and clustering tasks. The platform
 86          has been used by over 10,000 companies.
 87          """
 88      ]
 89  
 90      print("Creating vector database from documents...")
 91      vectorstore = create_vector_db_from_text(sample_docs)
 92      print("Vector database created!\n")
 93  
 94      # Example queries
 95      queries = [
 96          "What products does Acme Corporation offer?",
 97          "Where are Acme Corporation's offices located?",
 98          "What can AI Vision do?"
 99      ]
100  
101      for query in queries:
102          print("=" * 60)
103          print(f"Query: {query}")
104          print("=" * 60)
105          result = query_documents(vectorstore, query)
106          print(f"\nAnswer: {result['result']}\n")
107          print("Source documents used:")
108          for i, doc in enumerate(result['source_documents'], 1):
109              print(f"{i}. {doc.page_content[:100]}...")
110          print()