runner.py
from pathlib import Path
from torch.multiprocessing import Process
from ilock import ILock
from llama_index.readers.docling import DoclingReader
import torch


def worker(file_path: str, sharedmem):
    """Load documents with docling and publish the result via ``sharedmem``.

    Intended to run as the target of a separate process (see
    ``load_documents``).

    Args:
        file_path: Path of the file to load; converted to ``Path`` before
            being handed to ``DoclingReader.load_data``.
        sharedmem: Dict-like object used to hand results back to the caller.
            On success ``sharedmem['documents']`` is a list of
            ``{'text': ..., 'metadata': ...}`` dicts and
            ``sharedmem['error']`` is ``None``; on failure ``'documents'``
            is ``None`` and ``'error'`` is a non-empty message string.
    """
    # Bind the name up front so the cleanup in ``finally`` cannot raise
    # UnboundLocalError when DoclingReader() itself fails.
    reader = None
    try:
        reader = DoclingReader()
        documents = reader.load_data(file_path=Path(file_path))

        # Reduce each document to plain text/metadata so it can be stored
        # in ``sharedmem`` and rebuilt by the caller.
        docs_data = [
            {'text': doc.text, 'metadata': doc.metadata}
            for doc in documents
        ]

        sharedmem['documents'] = docs_data
        sharedmem['error'] = None
    except Exception as e:
        # str(e) is empty for exceptions raised without a message; fall back
        # to repr(e) so the caller's truthiness check still sees the failure.
        sharedmem['error'] = str(e) or repr(e)
        sharedmem['documents'] = None
    finally:
        # Drop the reader reference before asking torch to release its
        # cached CUDA memory.
        del reader
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


def load_documents(manager, file_path: str):
    """Load documents using docling in a separate process.

    Args:
        manager: Object providing ``dict()`` that returns a mapping which
            can be shared with the worker process (presumably a
            ``multiprocessing.Manager()`` -- confirm against callers).
        file_path: Path of the file to load.

    Returns:
        A list of ``llama_index.core.schema.Document`` objects rebuilt from
        the text and metadata reported by the worker.

    Raises:
        RuntimeError: If the worker reported an error, exited with a
            non-zero exit code without producing documents, or produced no
            documents.
    """
    sharedmem = manager.dict()

    p = Process(target=worker, args=(file_path, sharedmem))
    p.start()
    # join() without a timeout returns only once the worker has terminated,
    # so no explicit kill is needed afterwards.
    p.join()

    error = sharedmem.get('error')
    if error:
        raise RuntimeError(error)

    # Fetch once: every access on the shared dict goes through the manager.
    docs_data = sharedmem.get('documents')
    if not docs_data:
        if p.exitcode != 0:
            # The worker died before it could report anything back.
            raise RuntimeError(
                f"Document loading worker exited abnormally "
                f"(exit code {p.exitcode})"
            )
        raise RuntimeError("No documents were loaded")

    # Convert back to Document objects
    from llama_index.core.schema import Document

    return [
        Document(text=doc_data['text'], metadata=doc_data['metadata'])
        for doc_data in docs_data
    ]