retrieve.py
"""
RetrieveTask module
"""

import os
import tempfile

from urllib.request import urlretrieve
from urllib.parse import urlparse

from .url import UrlTask


class RetrieveTask(UrlTask):
    """
    Task that retrieves urls (local or remote) to a local directory.
    """

    def register(self, directory=None, flatten=True):
        """
        Adds retrieve parameters to task.

        Args:
            directory: local directory used to store retrieved files, a temporary
                       directory is created when this is not set
            flatten: flatten input directory structure, defaults to True
        """

        # pylint: disable=W0201
        # Create default temporary directory if not specified
        if not directory:
            # Save tempdir to prevent content from being deleted until this task is out of scope
            # pylint: disable=R1732
            self.tempdir = tempfile.TemporaryDirectory()
            directory = self.tempdir.name

        # Create output directory if necessary
        os.makedirs(directory, exist_ok=True)

        self.directory = directory
        self.flatten = flatten

    def prepare(self, element):
        """
        Retrieves a url and stores its content under the configured output directory.

        With flatten enabled, only the last component of the url path is used as the
        file name. Otherwise, the url path is recreated as a directory tree below the
        output directory.

        NOTE: with flatten enabled, a url without a final path component (for example
        one ending in "/") resolves to the output directory itself, which is not a
        writable file target.

        Args:
            element: url to retrieve

        Returns:
            local path of the retrieved file
        """

        # Extract file path from URL
        path = urlparse(element).path

        if self.flatten:
            # Flatten directory structure (default)
            path = os.path.join(self.directory, os.path.basename(path))
        else:
            # Normalize the url path against a root so that ".." segments are resolved
            # without climbing above it. This keeps the output inside self.directory.
            relative = os.path.normpath("/" + path.lstrip("/")).lstrip(os.sep)

            # Derive output path
            path = os.path.join(self.directory, relative)
            directory = os.path.dirname(path)

            # Create local directory, if necessary
            os.makedirs(directory, exist_ok=True)

        # Retrieve URL
        urlretrieve(element, path)

        # Return new file path
        return path