/ restai / document / runner.py
runner.py
 1  from pathlib import Path
 2  from torch.multiprocessing import Process
 3  from ilock import ILock
 4  from llama_index.readers.docling import DoclingReader
 5  import torch
 6  
 7  def worker(file_path: str, sharedmem):
 8      """Worker process for loading documents using docling"""
 9      try:
10          reader = DoclingReader()
11          documents = reader.load_data(file_path=Path(file_path))
12          
13          # Convert documents to a format that can be shared via sharedmem
14          docs_data = []
15          for doc in documents:
16              docs_data.append({
17                  'text': doc.text,
18                  'metadata': doc.metadata
19              })
20          
21          sharedmem['documents'] = docs_data
22          sharedmem['error'] = None
23          
24      except Exception as e:
25          sharedmem['error'] = str(e)
26          sharedmem['documents'] = None
27      finally:
28          del reader
29          if torch.cuda.is_available():
30              torch.cuda.empty_cache()
31  
def load_documents(manager, file_path: str):
    """Load documents using docling in a separate process.

    Runs ``worker`` in a child process, waits for it to finish, and rebuilds
    ``Document`` objects from the plain dicts the worker stored in a
    manager-backed dict.

    Args:
        manager: Object providing ``dict()`` to create the mapping shared with
            the child process (presumably a ``multiprocessing`` manager --
            confirm against the caller).
        file_path: Path of the file to load.

    Returns:
        A list of ``Document`` objects, one per entry produced by the worker.

    Raises:
        Exception: If the worker reported an error, exited abnormally without
            reporting anything, or produced no documents.
    """
    sharedmem = manager.dict()

    p = Process(target=worker, args=(file_path, sharedmem))
    p.start()
    try:
        p.join()
    finally:
        # join() without a timeout only returns once the child has exited, so
        # the child can still be alive here only if the wait was interrupted
        # (e.g. KeyboardInterrupt). Do not leave it running in that case.
        if p.is_alive():
            p.kill()
            p.join()

    error = sharedmem.get('error')
    if error:
        raise Exception(error)

    # Read the payload once; every access on the shared mapping is a separate
    # lookup through the manager.
    docs_data = sharedmem.get('documents')
    if not docs_data:
        # A worker that died without recording an error (killed by a signal,
        # hard crash) leaves the mapping empty; report the exit code instead
        # of a misleading "nothing loaded" message.
        if p.exitcode:
            raise Exception(
                f"Document loader process exited with code {p.exitcode}"
            )
        raise Exception("No documents were loaded")

    # Convert back to Document objects
    from llama_index.core.schema import Document
    return [
        Document(text=doc_data['text'], metadata=doc_data['metadata'])
        for doc_data in docs_data
    ]