# train.py
# The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

import logging
import sys
import warnings
from urllib.parse import urlparse

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)


def eval_metrics(actual, pred):
    """Compute regression metrics for a set of predictions.

    Args:
        actual: Ground-truth target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: root mean squared error, mean absolute
        error and the R^2 score.
    """
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


def _load_wine_data(csv_url):
    """Read the semicolon-separated wine-quality CSV at ``csv_url``.

    Args:
        csv_url: URL (or path) handed to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised, re-raised unchanged
            after being logged.
    """
    try:
        return pd.read_csv(csv_url, sep=";")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e
        )
        # Without the data there is nothing to train on. Propagate the real
        # error instead of continuing and failing later on an unbound name.
        raise


def main():
    """Train an ElasticNet model on the wine-quality data and log it to MLflow.

    Optional command-line arguments: ``alpha`` (``sys.argv[1]``) and
    ``l1_ratio`` (``sys.argv[2]``); both default to 0.5.
    """
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    csv_url = (
        "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/winequality-red.csv"
    )
    data = _load_wine_data(csv_url)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
        print(f" RMSE: {rmse}")
        print(f" MAE: {mae}")
        print(f" R2: {r2}")

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        # The signature is inferred from the training inputs and the model's
        # predictions on those same inputs.
        predictions = lr.predict(train_x)
        signature = infer_signature(train_x, predictions)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store, so only register the
        # model when the tracking URI uses some other scheme.
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        log_model_kwargs = {"name": "model", "signature": signature}
        if tracking_url_type_store != "file":
            log_model_kwargs["registered_model_name"] = "ElasticnetWineModel"
        mlflow.sklearn.log_model(lr, **log_model_kwargs)


if __name__ == "__main__":
    main()