  1  """
  2  Example of hyperparameter search in MLflow using Hyperopt.
  3  
  4  The run method will instantiate and run Hyperopt optimizer. Each parameter configuration is
  5  evaluated in a new MLflow run invoking main entry point with selected parameters.
  6  
  7  The runs are evaluated based on validation set loss. Test set score is calculated to verify the
  8  results.
  9  
 10  
 11  This example currently does not support parallel execution.
 12  """

import click
import numpy as np
from hyperopt import fmin, hp, rand, tpe

import mlflow.projects
from mlflow.tracking import MlflowClient

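# Largest representable float64, used as a "worst possible loss" sentinel for the
# null-model baselines and for the best-run search at the end of the script.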
_inf = np.finfo(np.float64).max


@click.command(
    help="Perform hyperparameter search with the Hyperopt library. Optimizes the train entry point."
)
@click.option("--max-runs", type=click.INT, default=10, help="Maximum number of runs to evaluate.")
@click.option("--epochs", type=click.INT, default=500, help="Number of epochs.")
@click.option("--metric", type=click.STRING, default="rmse", help="Metric to optimize on.")
@click.option(
    "--algo",
    type=click.STRING,
    default="tpe.suggest",
    help="Optimizer algorithm. 'tpe.suggest' selects TPE; any other value falls back to random search.",
)
@click.option("--seed", type=click.INT, default=97531, help="Seed for the random generator.")
@click.argument("training_data")
def train(training_data, max_runs, epochs, metric, algo, seed):
    """
    Run hyperparameter optimization.
    """
    # Client used to look up the metrics of the individual training runs.
    tracking_client = MlflowClient()

    def new_eval(
        nepochs, experiment_id, null_train_loss, null_valid_loss, null_test_loss, return_all=False
    ):
        """
        Create a new evaluation function.

        Args:
            nepochs: Number of epochs to train the model.
            experiment_id: Experiment id for the training run.
            null_train_loss: Loss of a null model on the training dataset.
            null_valid_loss: Loss of a null model on the validation dataset.
            null_test_loss: Loss of a null model on the test dataset.
            return_all: If True, return train, validation, and test loss.
                Otherwise, return only the validation loss.
                Default is False.

        Returns:
            An evaluation function that trains the model and logs metrics to MLflow.
        """

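        # `new_eval` returns a closure: `fmin` only ever sees the hyperparameters, while
        # the epoch count, experiment id, and null-model baselines are captured here.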
        def eval_fn(params):
            """
            Train the Keras model with the given parameters by launching an MLflow run.

            Each evaluation is started as a nested MLflow run, so every child run carries
            the `mlflow.parentRunId` tag. The parent search run later uses that tag to
            find its children, pick the best one, and log its metrics as the final result.

            Args:
                params: Parameters to the train_keras script we optimize over:
                    learning_rate, momentum.

            Returns:
                The validation loss (or the train, validation, and test losses if
                `return_all` is set).
            """
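            # Each candidate configuration is trained in a child run of the current
            # search run; `nested=True` makes MLflow tag it with `mlflow.parentRunId`.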
            lr, momentum = params
            with mlflow.start_run(nested=True) as child_run:
                p = mlflow.projects.run(
                    uri=".",
                    entry_point="train",
                    run_id=child_run.info.run_id,
                    parameters={
                        "training_data": training_data,
                        "epochs": str(nepochs),
                        "learning_rate": str(lr),
                        "momentum": str(momentum),
                        "seed": seed,
                    },
                    experiment_id=experiment_id,
                    # Run asynchronously so a failed training run does not raise here;
                    # success is checked below via p.wait().
                    synchronous=False,
                )
                succeeded = p.wait()
                mlflow.log_params({"lr": lr, "momentum": momentum})

            if succeeded:
                training_run = tracking_client.get_run(p.run_id)
                metrics = training_run.data.metrics
                # Cap each loss at the corresponding loss of the null model.
                train_loss = min(null_train_loss, metrics[f"train_{metric}"])
                valid_loss = min(null_valid_loss, metrics[f"val_{metric}"])
                test_loss = min(null_test_loss, metrics[f"test_{metric}"])
            else:
                # The run failed; report the null model's losses instead.
                tracking_client.set_terminated(p.run_id, "FAILED")
                train_loss = null_train_loss
                valid_loss = null_valid_loss
                test_loss = null_test_loss

            mlflow.log_metrics({
                f"train_{metric}": train_loss,
                f"val_{metric}": valid_loss,
                f"test_{metric}": test_loss,
            })

            if return_all:
                return train_loss, valid_loss, test_loss
            else:
                return valid_loss

        return eval_fn

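    # Hyperopt search space: both hyperparameters are drawn from uniform priors. On each
    # trial, `fmin` passes a sampled [lr, momentum] list to the evaluation function.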
    space = [
        hp.uniform("lr", 1e-5, 1e-1),
        hp.uniform("momentum", 0.0, 1.0),
    ]

    with mlflow.start_run() as run:
        experiment_id = run.info.experiment_id
        # Evaluate the null model first: zero training epochs with lr=0 and momentum=0.
        # Its losses serve as a baseline that caps the losses of failed or diverged runs.
        train_null_loss, valid_null_loss, test_null_loss = new_eval(
            0, experiment_id, _inf, _inf, _inf, True
        )(params=[0, 0])
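        # Run the optimizer; each fmin trial launches one training run as a child of
        # this parent run and returns the validation loss to minimize.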
        best = fmin(
            fn=new_eval(epochs, experiment_id, train_null_loss, valid_null_loss, test_null_loss),
            space=space,
            algo=tpe.suggest if algo == "tpe.suggest" else rand.suggest,
            max_evals=max_runs,
        )
        mlflow.set_tag("best params", str(best))
        # Find the best child run and log its metrics as the final metrics of this run.
        client = MlflowClient()
        runs = client.search_runs(
            [experiment_id], f"tags.mlflow.parentRunId = '{run.info.run_id}'"
        )
        best_val_train = _inf
        best_val_valid = _inf
        best_val_test = _inf
        best_run = None
        for r in runs:
            # Failed runs log no metrics; treat them as the worst possible loss.
            if r.data.metrics.get(f"val_{metric}", _inf) < best_val_valid:
                best_run = r
                best_val_train = r.data.metrics[f"train_{metric}"]
                best_val_valid = r.data.metrics[f"val_{metric}"]
                best_val_test = r.data.metrics[f"test_{metric}"]
        mlflow.set_tag("best_run", best_run.info.run_id)
        mlflow.log_metrics({
            f"train_{metric}": best_val_train,
            f"val_{metric}": best_val_valid,
            f"test_{metric}": best_val_test,
        })


if __name__ == "__main__":
    train()