load_raw_data.py
"""
Downloads the MovieLens dataset and saves it as an artifact
"""

import os
import tempfile
import zipfile

import click
import requests

import mlflow

# Size of each streamed chunk written to disk. The archive is large, so a
# 1 MiB chunk avoids hundreds of thousands of tiny write() calls.
_DOWNLOAD_CHUNK_SIZE = 1024 * 1024

# (connect, read) timeouts in seconds. These bound each socket operation, not
# the whole download, so a slow-but-alive transfer still completes while a
# stalled server no longer hangs the run forever.
_REQUEST_TIMEOUT = (10, 60)


def _download_file(url, destination):
    """Stream the content at ``url`` into the local file ``destination``.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeouts.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=_REQUEST_TIMEOUT) as response:
        # Fail here with a clear HTTP error instead of saving an error page
        # to disk and failing later with an opaque BadZipFile.
        response.raise_for_status()
        with open(destination, "wb") as f:
            for chunk in response.iter_content(chunk_size=_DOWNLOAD_CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)


@click.command(
    help="Downloads the MovieLens dataset and saves it as an mlflow artifact "
    "called 'ratings-csv-dir'."
)
@click.option("--url", default="http://files.grouplens.org/datasets/movielens/ml-20m.zip")
def load_raw_data(url):
    """Download the MovieLens archive and log its ratings file to MLflow.

    Within a new MLflow run, the zip at ``url`` is downloaded into a temporary
    directory and extracted there, and ``ml-20m/ratings.csv`` is logged as an
    artifact under the ``ratings-csv-dir`` artifact path.

    Args:
        url: Location of the zip archive. The archive is expected to contain
            ``ml-20m/ratings.csv``.

    Raises:
        requests.HTTPError: if the download returns an error status.
        zipfile.BadZipFile: if the downloaded file is not a valid zip archive.
    """
    # TemporaryDirectory removes the downloaded archive and the extracted
    # dataset once the artifact has been logged, so repeated runs do not
    # accumulate large leftovers in the system temp directory.
    with mlflow.start_run(), tempfile.TemporaryDirectory() as local_dir:
        local_filename = os.path.join(local_dir, "ml-20m.zip")
        print(f"Downloading {url} to {local_filename}")
        _download_file(url, local_filename)

        extracted_dir = os.path.join(local_dir, "ml-20m")
        print(f"Extracting {local_filename} into {extracted_dir}")
        with zipfile.ZipFile(local_filename, "r") as zip_ref:
            # The archive carries its own top-level "ml-20m" folder, so it is
            # extracted into local_dir rather than into extracted_dir.
            zip_ref.extractall(local_dir)

        ratings_file = os.path.join(extracted_dir, "ratings.csv")

        print(f"Uploading ratings: {ratings_file}")
        # Must run before the temporary directory is cleaned up.
        mlflow.log_artifact(ratings_file, "ratings-csv-dir")


if __name__ == "__main__":
    load_raw_data()