"""Tests for serving models with the ``virtualenv`` environment manager.

All tests require pyenv (skipped module-wide otherwise) and redirect
MLFLOW_ENV_ROOT into a per-test temporary directory so that created
environments can be counted and are cleaned up automatically.
"""

import os
import sys
from io import BytesIO
from stat import S_IRGRP, S_IROTH, S_IRUSR, S_IXGRP, S_IXOTH, S_IXUSR
from typing import NamedTuple

import numpy as np
import pandas as pd
import pytest
import sklearn
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

import mlflow
from mlflow.environment_variables import MLFLOW_ENV_ROOT
from mlflow.pyfunc.scoring_server import CONTENT_TYPE_JSON
from mlflow.utils.environment import _PYTHON_ENV_FILE_NAME, _REQUIREMENTS_FILE_NAME
from mlflow.utils.virtualenv import _is_pyenv_available

from tests.helper_functions import pyfunc_serve_and_score_model

# Every test in this module restores an environment via pyenv; skip all of
# them when pyenv is not installed on the machine running the suite.
pytestmark = pytest.mark.skipif(
    not _is_pyenv_available(),
    reason="requires pyenv",
)

TEST_DIR = "tests"
TEST_MLFLOW_1X_MODEL_DIR = os.path.join(TEST_DIR, "resources", "example_mlflow_1x_sklearn_model")


class Model(NamedTuple):
    # Trained estimator plus a held-out sample and its expected predictions,
    # used to verify that a served model scores identically to the original.
    model: LogisticRegression
    X_pred: pd.DataFrame
    y_pred: np.ndarray


@pytest.fixture(scope="module")
def sklearn_model():
    """Train a LogisticRegression on iris once per module.

    Returns a ``Model`` bundling the estimator, a deterministic 10% sample of
    the training data (fixed random_state), and the in-process predictions to
    compare served scores against.
    """
    X, y = load_iris(return_X_y=True, as_frame=True)
    model = LogisticRegression().fit(X, y)
    X_pred = X.sample(frac=0.1, random_state=0)
    y_pred = model.predict(X_pred)
    return Model(model, X_pred, y_pred)


def serve_and_score(model_uri, data, extra_args=None):
    """Serve ``model_uri`` with the virtualenv env manager and score ``data``.

    Returns the predictions parsed from the JSON response as a squeezed
    numpy array.
    """
    resp = pyfunc_serve_and_score_model(
        model_uri,
        data=data,
        content_type=CONTENT_TYPE_JSON,
        extra_args=["--env-manager=virtualenv"] + (extra_args or []),
    )
    return pd.read_json(BytesIO(resp.content), orient="records").values.squeeze()


@pytest.fixture
def temp_mlflow_env_root(tmp_path, monkeypatch):
    """Point MLFLOW_ENV_ROOT at a fresh temp directory for this test.

    Lets tests assert on how many environments were created by inspecting
    the directory's children.
    """
    env_root = tmp_path / "envs"
    env_root.mkdir(exist_ok=True)
    monkeypatch.setenv(MLFLOW_ENV_ROOT.name, str(env_root))
    return env_root


# Marker for tests that need the redirected env root but not the path itself.
use_temp_mlflow_env_root = pytest.mark.usefixtures(temp_mlflow_env_root.__name__)


@use_temp_mlflow_env_root
def test_restore_environment_with_virtualenv(sklearn_model):
    """A logged model can be restored and served in a fresh virtualenv."""
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(sklearn_model.model, name="model")

    scores = serve_and_score(model_info.model_uri, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)


@use_temp_mlflow_env_root
def test_serve_and_score_read_only_model_directory(sklearn_model, tmp_path):
    """Serving works even when the model directory is read-only."""
    model_path = str(tmp_path / "model")
    mlflow.sklearn.save_model(sklearn_model.model, path=model_path)
    # Read + execute for everyone, no write bits: environment restoration
    # must not need to write into the model directory.
    os.chmod(
        model_path,
        S_IRUSR | S_IRGRP | S_IROTH | S_IXUSR | S_IXGRP | S_IXOTH,
    )

    scores = serve_and_score(model_path, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)


@use_temp_mlflow_env_root
def test_serve_and_score_1x_models():
    """Models logged with MLflow 1.x can still be restored and served."""
    X, _ = load_iris(return_X_y=True, as_frame=True)
    X_pred = X.sample(frac=0.1, random_state=0)
    loaded_model = mlflow.pyfunc.load_model(TEST_MLFLOW_1X_MODEL_DIR)
    y_pred = loaded_model.predict(X_pred)

    scores = serve_and_score(TEST_MLFLOW_1X_MODEL_DIR, X_pred)
    np.testing.assert_array_almost_equal(scores, y_pred)


@use_temp_mlflow_env_root
def test_reuse_environment(temp_mlflow_env_root, sklearn_model):
    """Serving the same model twice reuses the first environment."""
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(sklearn_model.model, name="model")

    # Serve the model
    scores = serve_and_score(model_info.model_uri, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)
    # Serve the model again. The environment created in the previous serving should be reused.
    scores = serve_and_score(model_info.model_uri, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)
    assert len(list(temp_mlflow_env_root.iterdir())) == 1


@use_temp_mlflow_env_root
def test_different_requirements_create_different_environments(temp_mlflow_env_root, sklearn_model):
    """Models with different pip requirements get separate environments."""
    sklearn_req = f"scikit-learn=={sklearn.__version__}"
    with mlflow.start_run():
        model_info1 = mlflow.sklearn.log_model(
            sklearn_model.model,
            name="model",
            pip_requirements=[sklearn_req],
        )
    scores = serve_and_score(model_info1.model_uri, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)

    # Log the same model with different requirements
    with mlflow.start_run():
        model_info2 = mlflow.sklearn.log_model(
            sklearn_model.model,
            name="model",
            pip_requirements=[sklearn_req, "numpy"],
        )
    scores = serve_and_score(model_info2.model_uri, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)
    # Two environments should exist now because the first and second models have different
    # requirements
    assert len(list(temp_mlflow_env_root.iterdir())) == 2


@use_temp_mlflow_env_root
def test_environment_directory_is_cleaned_up_when_unexpected_error_occurs(
    temp_mlflow_env_root, sklearn_model
):
    """A failed environment creation must not leave a stale directory behind."""
    # A version that cannot exist forces pip install (and thus env creation)
    # to fail while serving.
    sklearn_req = "scikit-learn==999.999.999"
    with mlflow.start_run():
        model_info1 = mlflow.sklearn.log_model(
            sklearn_model.model,
            name="model",
            pip_requirements=[sklearn_req],
        )

    try:
        serve_and_score(model_info1.model_uri, sklearn_model.X_pred)
    except Exception:
        pass
    else:
        # pytest.fail is not stripped under `python -O`, unlike a bare assert.
        pytest.fail("Should have raised an exception")
    assert len(list(temp_mlflow_env_root.iterdir())) == 0


@use_temp_mlflow_env_root
def test_python_env_file_does_not_exist(sklearn_model, tmp_path):
    """Restoration falls back to requirements.txt when python_env.yaml is missing."""
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(sklearn_model.model, name="model")

    mlflow.artifacts.download_artifacts(artifact_uri=model_info.model_uri, dst_path=tmp_path)
    python_env = next(tmp_path.rglob(_PYTHON_ENV_FILE_NAME))
    python_env.unlink()

    scores = serve_and_score(tmp_path, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)


@use_temp_mlflow_env_root
def test_python_env_file_and_requirements_file_do_not_exist(sklearn_model, tmp_path):
    """Restoration still works when both python_env.yaml and requirements.txt are missing."""
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(sklearn_model.model, name="model")

    mlflow.artifacts.download_artifacts(artifact_uri=model_info.model_uri, dst_path=tmp_path)
    python_env = next(tmp_path.rglob(_PYTHON_ENV_FILE_NAME))
    python_env.unlink()
    requirements = next(tmp_path.rglob(_REQUIREMENTS_FILE_NAME))
    requirements.unlink()

    scores = serve_and_score(tmp_path, sklearn_model.X_pred)
    np.testing.assert_array_almost_equal(scores, sklearn_model.y_pred)


def test_environment_is_removed_when_package_installation_fails(
    temp_mlflow_env_root, sklearn_model
):
    """The partially-built environment is deleted when pip install fails."""
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            sklearn_model.model,
            name="model",
            # Enforce pip install to fail using a non-existent package version
            pip_requirements=["mlflow==999.999.999"],
        )
    with pytest.raises(AssertionError, match="scoring process died"):
        serve_and_score(model_info.model_uri, sklearn_model.X_pred)
    assert len(list(temp_mlflow_env_root.iterdir())) == 0


@use_temp_mlflow_env_root
def test_restore_environment_from_conda_yaml_containing_conda_packages(sklearn_model, tmp_path):
    """With python_env.yaml removed, restoration parses conda.yaml — including
    conda (non-pip) dependencies — to rebuild the environment."""
    conda_env = {
        "name": "mlflow-env",
        "channels": ["conda-forge"],
        "dependencies": [
            "python=" + ".".join(map(str, sys.version_info[:3])),
            "conda-package=1.2.3",  # conda package
            "pip",
            {
                "pip": [
                    "mlflow",
                    f"scikit-learn=={sklearn.__version__}",
                ]
            },
        ],
    }
    with mlflow.start_run():
        model_info = mlflow.sklearn.log_model(
            sklearn_model.model,
            name="model",
            conda_env=conda_env,
        )

    mlflow.artifacts.download_artifacts(artifact_uri=model_info.model_uri, dst_path=tmp_path)
    python_env = next(tmp_path.rglob(_PYTHON_ENV_FILE_NAME))
    python_env.unlink()
    serve_and_score(tmp_path, sklearn_model.X_pred)