# archive/python/5_pdf_rag.py
  1  """
  2  PDF RAG Example.
  3  This demonstrates how to load PDF files and query them using RAG.
  4  """
  5  
  6  from langchain_community.llms import Ollama
  7  from langchain_community.document_loaders import PyPDFLoader
  8  from langchain.text_splitter import RecursiveCharacterTextSplitter
  9  from langchain_community.vectorstores import FAISS
 10  from langchain_community.embeddings import OllamaEmbeddings
 11  from langchain.chains import RetrievalQA
 12  import os
 13  
 14  # Initialize Ollama
 15  llm = Ollama(
 16      model="gpt-oss:120b",
 17      base_url="http://192.222.50.154:11434"
 18  )
 19  
 20  # Initialize embeddings
 21  embeddings = OllamaEmbeddings(
 22      model="gpt-oss:120b",
 23      base_url="http://192.222.50.154:11434"
 24  )
 25  
 26  def load_pdf_and_create_index(pdf_path: str):
 27      """
 28      Load a PDF file and create a searchable index.
 29      """
 30      # Load PDF
 31      loader = PyPDFLoader(pdf_path)
 32      documents = loader.load()
 33  
 34      print(f"Loaded {len(documents)} pages from PDF")
 35  
 36      # Split documents
 37      text_splitter = RecursiveCharacterTextSplitter(
 38          chunk_size=1000,
 39          chunk_overlap=200
 40      )
 41      splits = text_splitter.split_documents(documents)
 42      print(f"Split into {len(splits)} chunks")
 43  
 44      # Create FAISS index (faster than Chroma for large documents)
 45      vectorstore = FAISS.from_documents(splits, embeddings)
 46      print("Vector index created!")
 47  
 48      return vectorstore
 49  
 50  def query_pdf(vectorstore, query: str, k: int = 3):
 51      """
 52      Query the PDF using RAG.
 53      """
 54      qa_chain = RetrievalQA.from_chain_type(
 55          llm=llm,
 56          chain_type="stuff",
 57          retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
 58          return_source_documents=True
 59      )
 60  
 61      result = qa_chain({"query": query})
 62      return result
 63  
 64  def save_and_load_index(vectorstore, index_path: str):
 65      """
 66      Save and load FAISS index for reuse.
 67      """
 68      # Save index
 69      vectorstore.save_local(index_path)
 70      print(f"Index saved to {index_path}")
 71  
 72      # Load index
 73      loaded_vectorstore = FAISS.load_local(
 74          index_path,
 75          embeddings,
 76          allow_dangerous_deserialization=True
 77      )
 78      print(f"Index loaded from {index_path}")
 79      return loaded_vectorstore
 80  
 81  if __name__ == "__main__":
 82      # Example usage
 83      print("=" * 60)
 84      print("PDF RAG Example")
 85      print("=" * 60)
 86      print("\nThis script demonstrates how to:")
 87      print("1. Load a PDF file")
 88      print("2. Create a searchable vector index")
 89      print("3. Query the PDF using natural language")
 90      print("4. Save and load the index for reuse")
 91      print("\nTo use this script:")
 92      print("1. Place a PDF file in the Language folder")
 93      print("2. Update the pdf_path variable below")
 94      print("3. Run the script")
 95      print("\nExample usage:")
 96      print("""
 97      # Load PDF
 98      pdf_path = "your_document.pdf"
 99      vectorstore = load_pdf_and_create_index(pdf_path)
100  
101      # Query the PDF
102      result = query_pdf(vectorstore, "What is the main topic of this document?")
103      print(result['result'])
104  
105      # Save index for later use
106      save_and_load_index(vectorstore, "pdf_index")
107  
108      # Load existing index
109      loaded_vectorstore = FAISS.load_local("pdf_index", embeddings, allow_dangerous_deserialization=True)
110      """)