train.ipynb
# %% [markdown]
# # MLflow Training Tutorial
#
# This `train.ipynb` Jupyter notebook predicts the quality of wine using [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).
#
# > This is the Jupyter notebook version of the `train.py` example
#
# Attribution
# * The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
# * P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
# * Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

# %%
import logging
import warnings

# Fallback hyperparameters used when the caller passes None.
_DEFAULT_ALPHA = 0.5
_DEFAULT_L1_RATIO = 0.5


def _resolve_hyperparameter(value, default):
    """Return ``value`` converted with ``float()``, or ``default`` when ``value`` is None."""
    return default if value is None else float(value)


# Wine Quality Sample
def train(in_alpha, in_l1_ratio):
    """Fit an ElasticNet model on the red-wine quality data and log the run to MLflow.

    The function downloads the wine-quality CSV, splits it into training and
    test sets, fits ``ElasticNet``, prints RMSE / MAE / R2 computed on the test
    split, and logs the parameters, metrics and fitted model inside a new
    MLflow run.

    Args:
        in_alpha: Value for ElasticNet's ``alpha``. Anything ``float()``
            accepts; ``None`` selects the default of 0.5.
        in_l1_ratio: Value for ElasticNet's ``l1_ratio``. Anything ``float()``
            accepts; ``None`` selects the default of 0.5.

    Raises:
        ValueError: If a hyperparameter is a string that ``float()`` cannot parse.
        TypeError: If a hyperparameter is of a type ``float()`` does not accept.
        Exception: Whatever ``pandas.read_csv`` raised when the CSV could not
            be loaded; it is logged and then re-raised.
    """
    # Resolve the hyperparameters first so that bad input fails fast, before
    # the heavy imports and the network download. The original tested
    # `float(x) is None`, which is never true, so None crashed instead of
    # falling back to the default.
    alpha = _resolve_hyperparameter(in_alpha, _DEFAULT_ALPHA)
    l1_ratio = _resolve_hyperparameter(in_l1_ratio, _DEFAULT_L1_RATIO)

    import numpy as np
    import pandas as pd
    from sklearn.linear_model import ElasticNet
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    import mlflow
    import mlflow.sklearn
    from mlflow.models import infer_signature

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)

    def eval_metrics(actual, pred):
        """Return (rmse, mae, r2) for the given actual and predicted values."""
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2

    warnings.filterwarnings("ignore")
    # train_test_split below is called without random_state, so the split is
    # made reproducible through this global NumPy seed.
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    csv_url = (
        "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
    )
    try:
        data = pd.read_csv(csv_url, sep=";")
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s",
            e,
        )
        # Without the data nothing below can run; re-raise the real error
        # instead of failing later with an unrelated unbound-variable error.
        raise

    # Split the data into training and test sets. (0.75, 0.25) split.
    # Named *_df so the locals do not shadow this function's own name.
    train_df, test_df = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train_df.drop(["quality"], axis=1)
    test_x = test_df.drop(["quality"], axis=1)
    train_y = train_df[["quality"]]
    test_y = test_df[["quality"]]

    # Useful for multiple runs (only doing one run in this sample notebook)
    with mlflow.start_run():
        # Execute ElasticNet
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        # Evaluate Metrics
        predicted_qualities = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
        print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
        print(f"  RMSE: {rmse}")
        print(f"  MAE: {mae}")
        print(f"  R2: {r2}")

        # Infer model signature
        predictions = lr.predict(train_x)
        signature = infer_signature(train_x, predictions)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, name="model", signature=signature)


# %%
# One MLflow run per (alpha, l1_ratio) pair, in the same order as the original
# three cells. The guard is true in a notebook kernel and when run as a script,
# but keeps the runs from firing if this code is imported as a module.
if __name__ == "__main__":
    for run_alpha, run_l1_ratio in [(0.5, 0.5), (0.2, 0.2), (0.1, 0.1)]:
        train(run_alpha, run_l1_ratio)