/ examples / multistep_workflow / load_raw_data.py
load_raw_data.py
 1  """
 2  Downloads the MovieLens dataset and saves it as an artifact
 3  """
 4  
 5  import os
 6  import tempfile
 7  import zipfile
 8  
 9  import click
10  import requests
11  
12  import mlflow
13  
14  
@click.command(
    help="Downloads the MovieLens dataset and saves it as an mlflow artifact "
    "called 'ratings-csv-dir'."
)
@click.option("--url", default="http://files.grouplens.org/datasets/movielens/ml-20m.zip")
def load_raw_data(url):
    """Download the MovieLens archive and log ``ratings.csv`` as an MLflow artifact.

    The archive at ``url`` is streamed into a temporary directory, extracted
    there, and the file ``ml-20m/ratings.csv`` from the extracted tree is
    logged under the artifact path ``ratings-csv-dir`` of a new MLflow run.
    The temporary directory (archive plus extracted files) is removed once
    the artifact has been logged.

    Args:
        url: Location of the MovieLens ``ml-20m`` zip archive.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If connecting or reading from the server stalls.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip file.
    """
    # 1 MiB chunks keep memory bounded without looping once per kilobyte.
    chunk_size = 1024 * 1024

    with mlflow.start_run():
        # TemporaryDirectory guarantees cleanup of the (large) download and
        # extracted dataset, even when a later step raises.
        with tempfile.TemporaryDirectory() as local_dir:
            local_filename = os.path.join(local_dir, "ml-20m.zip")
            print(f"Downloading {url} to {local_filename}")

            # The response is used as a context manager so the streamed
            # connection is always released; the (connect, read) timeout
            # prevents the run from hanging on an unresponsive server.
            with requests.get(url, stream=True, timeout=(10, 60)) as r:
                # Fail fast on HTTP errors instead of saving an error page
                # as the zip file and failing later during extraction.
                r.raise_for_status()
                with open(local_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)

            extracted_dir = os.path.join(local_dir, "ml-20m")
            print(f"Extracting {local_filename} into {extracted_dir}")
            with zipfile.ZipFile(local_filename, "r") as zip_ref:
                zip_ref.extractall(local_dir)

            ratings_file = os.path.join(extracted_dir, "ratings.csv")

            print(f"Uploading ratings: {ratings_file}")
            # Must run before the temporary directory is torn down.
            mlflow.log_artifact(ratings_file, "ratings-csv-dir")
40  
41  
# Script entry point: invoke the click command, which reads its options
# (e.g. --url) from the command line.
if __name__ == "__main__":
    load_raw_data()