# search_hyperopt.py
"""
Example of hyperparameter search in MLflow using Hyperopt.

The run method will instantiate and run Hyperopt optimizer. Each parameter configuration is
evaluated in a new MLflow run invoking main entry point with selected parameters.

The runs are evaluated based on validation set loss. Test set score is calculated to verify the
results.


This example currently does not support parallel execution.
"""

import click
import numpy as np
from hyperopt import fmin, hp, rand, tpe

import mlflow.projects
from mlflow.tracking import MlflowClient

# Largest representable float64; used as the loss of the "null model" so that any
# successful training run scores at least as well, and as the initial best value.
_inf = np.finfo(np.float64).max


@click.command(
    help="Perform hyperparameter search with Hyperopt library. Optimize dl_train target."
)
@click.option("--max-runs", type=click.INT, default=10, help="Maximum number of runs to evaluate.")
@click.option("--epochs", type=click.INT, default=500, help="Number of epochs")
@click.option("--metric", type=click.STRING, default="rmse", help="Metric to optimize on.")
@click.option("--algo", type=click.STRING, default="tpe.suggest", help="Optimizer algorithm.")
@click.option("--seed", type=click.INT, default=97531, help="Seed for the random generator")
@click.argument("training_data")
def train(training_data, max_runs, epochs, metric, algo, seed):
    """
    Run hyperparameter optimization.

    Args:
        training_data: Path to the training data, forwarded to the "train" entry point.
        max_runs: Maximum number of hyperopt evaluations (child runs).
        epochs: Number of epochs for each training run.
        metric: Name of the metric to optimize (e.g. "rmse"); child runs are expected
            to log ``train_<metric>``, ``val_<metric>`` and ``test_<metric>``.
        algo: "tpe.suggest" for TPE, anything else falls back to random search.
        seed: Seed forwarded to the training entry point.
    """
    tracking_client = MlflowClient()

    def new_eval(
        nepochs, experiment_id, null_train_loss, null_valid_loss, null_test_loss, return_all=False
    ):
        """
        Create a new eval function.

        Args:
            nepochs: Number of epochs to train the model.
            experiment_id: Experiment id for the training run.
            null_train_loss: Loss of a null model on the training dataset.
            null_valid_loss: Loss of a null model on the validation dataset.
            null_test_loss: Loss of a null model on the test dataset.
            return_all: If True, return train, validation, and test loss.
                Otherwise, return only the validation loss.
                Default is False.

        Returns:
            An evaluation function that trains the model and logs metrics to MLflow.
        """

        def evaluate(params):
            """
            Train Keras model with given parameters by invoking MLflow run.

            Each evaluation happens in a nested MLflow run so that all child runs are
            linked to the parent search run via the mlflow.parentRunId tag.

            Args:
                params: Parameters to the train_keras script we optimize over:
                    learning_rate, drop_out_1

            Returns:
                The metric value evaluated on the validation data (or the
                train/valid/test triple when ``return_all`` is set).
            """
            lr, momentum = params
            with mlflow.start_run(nested=True) as child_run:
                p = mlflow.projects.run(
                    uri=".",
                    entry_point="train",
                    run_id=child_run.info.run_id,
                    parameters={
                        "training_data": training_data,
                        "epochs": str(nepochs),
                        "learning_rate": str(lr),
                        "momentum": str(momentum),
                        "seed": seed,
                    },
                    experiment_id=experiment_id,
                    synchronous=False,  # Allow the run to fail if a model is not properly created
                )
                succeeded = p.wait()
                mlflow.log_params({"lr": lr, "momentum": momentum})

                if succeeded:
                    training_run = tracking_client.get_run(p.run_id)
                    metrics = training_run.data.metrics
                    # cap the loss at the loss of the null model
                    train_loss = min(null_train_loss, metrics[f"train_{metric}"])
                    valid_loss = min(null_valid_loss, metrics[f"val_{metric}"])
                    test_loss = min(null_test_loss, metrics[f"test_{metric}"])
                else:
                    # Run failed => report the null-model loss so the optimizer
                    # treats this configuration as a bad one rather than crashing.
                    tracking_client.set_terminated(p.run_id, "FAILED")
                    train_loss = null_train_loss
                    valid_loss = null_valid_loss
                    test_loss = null_test_loss

                mlflow.log_metrics({
                    f"train_{metric}": train_loss,
                    f"val_{metric}": valid_loss,
                    f"test_{metric}": test_loss,
                })

            if return_all:
                return train_loss, valid_loss, test_loss
            else:
                return valid_loss

        return evaluate

    # Search space over the two hyperparameters of the training entry point.
    space = [
        hp.uniform("lr", 1e-5, 1e-1),
        hp.uniform("momentum", 0.0, 1.0),
    ]

    with mlflow.start_run() as run:
        experiment_id = run.info.experiment_id
        # Evaluate null model first to establish an upper bound on the losses.
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, True
        )(params=[0, 0])
        best = fmin(
            fn=new_eval(epochs, experiment_id, train_null_loss, valid_null_loss, test_null_loss),
            space=space,
            algo=tpe.suggest if algo == "tpe.suggest" else rand.suggest,
            max_evals=max_runs,
        )
        mlflow.set_tag("best params", str(best))
        # find the best run, log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id], f"tags.mlflow.parentRunId = '{run.info.run_id}' "
        )
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            # BUG FIX: select on the configured metric, not a hardcoded "rmse" —
            # the hardcoded keys raised KeyError for any non-default --metric.
            if r.data.metrics[f"val_{metric}"] < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics[f"train_{metric}"]
                best_val_valid = r.data.metrics[f"val_{metric}"]
                best_val_test = r.data.metrics[f"test_{metric}"]
        # Guard against an empty result set (e.g. tracking backend hiccup);
        # in that case there is no best run to tag.
        if best_run is not None:
            mlflow.set_tag("best_run", best_run.info.run_id)
        mlflow.log_metrics({
            f"train_{metric}": best_val_train,
            f"val_{metric}": best_val_valid,
            f"test_{metric}": best_val_test,
        })


if __name__ == "__main__":
    train()