# 5_pdf_rag.py
"""
PDF RAG Example.

This demonstrates how to load PDF files and query them using RAG
(retrieval-augmented generation): the PDF is split into chunks, the chunks
are embedded into a FAISS vector index, and questions are answered by an
Ollama-served LLM using the most similar chunks as context.
"""

import os

from langchain_community.llms import Ollama
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA

# Connection settings for the Ollama server. The defaults are the values this
# script has always used; the environment variables make it possible to point
# the script at another server or model without editing the file.
OLLAMA_BASE_URL = os.environ.get(
    "OLLAMA_BASE_URL", "http://192.222.50.154:11434"
)
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "gpt-oss:120b")
# NOTE(review): by default the embeddings are produced by the same model that
# generates answers. A dedicated embedding model is presumably cheaper and
# better suited -- confirm and set OLLAMA_EMBEDDING_MODEL if so.
OLLAMA_EMBEDDING_MODEL = os.environ.get("OLLAMA_EMBEDDING_MODEL", OLLAMA_MODEL)

# Initialize Ollama
llm = Ollama(
    model=OLLAMA_MODEL,
    base_url=OLLAMA_BASE_URL,
)

# Initialize embeddings
embeddings = OllamaEmbeddings(
    model=OLLAMA_EMBEDDING_MODEL,
    base_url=OLLAMA_BASE_URL,
)


def load_pdf_and_create_index(pdf_path: str) -> FAISS:
    """
    Load a PDF file and create a searchable index.

    Args:
        pdf_path: Location of the PDF, passed unchanged to ``PyPDFLoader``.

    Returns:
        A FAISS vector store built from the PDF's text chunks, embedded with
        the module-level ``embeddings`` object.

    Raises:
        ValueError: If no text chunks could be produced from the PDF (for
            example when it contains no extractable text).
    """
    # Load PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    print(f"Loaded {len(documents)} pages from PDF")

    # Split documents into overlapping chunks so that text cut at a chunk
    # boundary still appears in full in one of the neighbouring chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    splits = text_splitter.split_documents(documents)
    print(f"Split into {len(splits)} chunks")

    # An index cannot be built from zero chunks; fail here with a clear
    # message instead of an opaque error from inside the vector store.
    if not splits:
        raise ValueError(
            f"No text could be extracted from {pdf_path!r}; "
            "cannot build an index from an empty document."
        )

    # Create FAISS index
    vectorstore = FAISS.from_documents(splits, embeddings)
    print("Vector index created!")

    return vectorstore


def query_pdf(vectorstore: FAISS, query: str, k: int = 3) -> dict:
    """
    Query the PDF using RAG.

    Args:
        vectorstore: Index to retrieve context from, e.g. the value returned
            by ``load_pdf_and_create_index``.
        query: Natural-language question to answer.
        k: Number of chunks to retrieve as context for the answer.

    Returns:
        The chain's result dictionary. The answer text is stored under the
        ``"result"`` key; because ``return_source_documents=True`` is set,
        the retrieved chunks are included in the dictionary as well.
    """
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        # "stuff" places all retrieved chunks into a single prompt.
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": k}),
        return_source_documents=True,
    )

    # ``invoke`` replaces the deprecated direct call ``qa_chain({...})`` and
    # returns the same dictionary.
    return qa_chain.invoke({"query": query})


def save_and_load_index(vectorstore: FAISS, index_path: str) -> FAISS:
    """
    Save and load FAISS index for reuse.

    The index is written to ``index_path`` and immediately read back, which
    demonstrates the round trip.

    Args:
        vectorstore: The index to persist.
        index_path: Directory the index is saved to and reloaded from.

    Returns:
        The vector store loaded back from ``index_path``, using the
        module-level ``embeddings`` object.
    """
    # Save index
    vectorstore.save_local(index_path)
    print(f"Index saved to {index_path}")

    # Load index
    # SECURITY: allow_dangerous_deserialization=True opts in to unpickling
    # the stored data. That is acceptable here because the files were written
    # by this very function a moment ago; never enable it for an index that
    # comes from an untrusted source.
    loaded_vectorstore = FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True,
    )
    print(f"Index loaded from {index_path}")
    return loaded_vectorstore


def main() -> None:
    """Print usage instructions; no PDF is loaded and no model is queried."""
    # Example usage
    print("=" * 60)
    print("PDF RAG Example")
    print("=" * 60)
    print("\nThis script demonstrates how to:")
    print("1. Load a PDF file")
    print("2. Create a searchable vector index")
    print("3. Query the PDF using natural language")
    print("4. Save and load the index for reuse")
    print("\nTo use this script:")
    print("1. Place a PDF file in the Language folder")
    print("2. Update the pdf_path variable below")
    print("3. Run the script")
    print("\nExample usage:")
    print("""
    # Load PDF
    pdf_path = "your_document.pdf"
    vectorstore = load_pdf_and_create_index(pdf_path)

    # Query the PDF
    result = query_pdf(vectorstore, "What is the main topic of this document?")
    print(result['result'])

    # Save index for later use
    save_and_load_index(vectorstore, "pdf_index")

    # Load existing index
    loaded_vectorstore = FAISS.load_local("pdf_index", embeddings, allow_dangerous_deserialization=True)
    """)


if __name__ == "__main__":
    main()