/ src / python / txtai / workflow / task / retrieve.py
retrieve.py
 1  """
 2  RetrieveTask module
 3  """
 4  
 5  import os
 6  import tempfile
 7  
 8  from urllib.request import urlretrieve
 9  from urllib.parse import urlparse
10  
11  from .url import UrlTask
12  
13  
class RetrieveTask(UrlTask):
    """
    Task that retrieves urls (local or remote) to a local directory.

    Retrieved files are always stored within the configured output directory. Urls with a path
    that resolves outside of that directory are rejected.
    """

    def register(self, directory=None, flatten=True):
        """
        Adds retrieve parameters to task.

        Args:
            directory: local directory used to store retrieved files, a temporary directory
                       is created when this is not set
            flatten: flatten input directory structure, defaults to True
        """

        # pylint: disable=W0201
        # Create default temporary directory if not specified
        if not directory:
            # Save tempdir to prevent content from being deleted until this task is out of scope
            # pylint: disable=R1732
            self.tempdir = tempfile.TemporaryDirectory()
            directory = self.tempdir.name

        # Create output directory if necessary
        os.makedirs(directory, exist_ok=True)

        self.directory = directory
        self.flatten = flatten

    def prepare(self, element):
        """
        Retrieves a url and stores the content in the output directory.

        When flatten is enabled, only the last component of the url path is used as the file name.
        Otherwise, the url path is recreated as a directory structure under the output directory.

        Args:
            element: url to retrieve

        Returns:
            local path of the retrieved file

        Raises:
            ValueError: if the url path resolves to a location outside of the output directory
        """

        # Extract file path from URL
        path = urlparse(element).path

        if self.flatten:
            # Flatten directory structure (default)
            path = os.path.join(self.directory, os.path.basename(path))
        else:
            # Derive output path
            path = os.path.join(self.directory, os.path.normpath(path.lstrip("/")))

        # Guard against path traversal. Parent directory references ("..") or a drive-qualified
        # component can make the joined path point outside of the output directory. This check
        # runs before any directory or file is created.
        root = os.path.abspath(self.directory)
        try:
            contained = os.path.commonpath([root, os.path.abspath(path)]) == root
        except ValueError:
            # commonpath raises when paths are on different drives, which is also outside of root
            contained = False

        if not contained:
            raise ValueError(f"URL path resolves outside of output directory: {element}")

        if not self.flatten:
            # Create local directory, if necessary
            os.makedirs(os.path.dirname(path), exist_ok=True)

        # Retrieve URL
        urlretrieve(element, path)

        # Return new file path
        return path