# docstring_utils.py
"""Utilities for templating and formatting docstrings shared across MLflow flavors.

Placeholders of the form ``{{ name }}`` inside a docstring are substituted with
reusable parameter documentation (see :class:`ParamDocs` and
``LOG_MODEL_PARAM_DOCS``), preserving the indentation of the line on which the
placeholder appears.
"""

import textwrap
import warnings
from typing import Any

from mlflow.ml_package_versions import _ML_PACKAGE_VERSIONS
from mlflow.utils.autologging_utils.versioning import (
    get_min_max_version_and_pip_release,
)


def _create_placeholder(key: str) -> str:
    """Wrap ``key`` in the ``{{ ... }}`` placeholder syntax."""
    return "{{ " + key + " }}"


def _replace_keys_with_placeholders(d: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``d`` whose keys are wrapped as ``{{ key }}`` placeholders."""
    return {_create_placeholder(k): v for k, v in d.items()}


def _get_indentation_of_key(line: str, placeholder: str) -> str:
    """Return whitespace matching the column at which ``placeholder`` starts in ``line``.

    Returns an empty string when ``placeholder`` does not occur in ``line``.
    """
    index = line.find(placeholder)
    return (index * " ") if index != -1 else ""


def _indent(text: str, indent: str) -> str:
    """Indent every line of ``text`` except the first with ``indent``."""
    lines = text.splitlines()
    if len(lines) <= 1:
        return text
    # The first line replaces the placeholder in-place and is already at the
    # right column; only continuation lines need the extra indentation.
    return lines[0] + "\n" + textwrap.indent("\n".join(lines[1:]), indent)


def _replace_all(text: str, replacements: dict[str, str]) -> str:
    """
    Replace all instances of replacements.keys() with their corresponding
    values in text. The replacements will be inserted on the same line
    with wrapping to the same level of indentation, for example:

    ```
    Args:
        param_1: {{ key }}
    ```

    will become...

    ```
    Args:
        param_1: replaced_value at same indentation as prior
            and if there are more lines they will also
            have the same indentation.
    ```
    """
    for key, value in replacements.items():
        if key in text:
            # The placeholder's column in `text` determines the hanging indent
            # applied to every continuation line of the replacement value.
            indent = _get_indentation_of_key(text, key)
            text = text.replace(key, _indent(value, indent))
    return text


class ParamDocs(dict):
    """
    Represents a set of parameter documents in the docstring.
    """

    def __repr__(self):
        return f"ParamDocs({super().__repr__()})"

    def format(self, **kwargs):
        """
        Formats values to be substituted in via the format_docstring() method.

        Args:
            kwargs: A `dict` in the form of `{"< placeholder name >": "< value >"}`.

        Returns:
            A new `ParamDocs` instance with the formatted param docs.

        .. code-block:: text
            :caption: Example

            >>> pd = ParamDocs(p1="{{ doc1 }}", p2="{{ doc2 }}")
            >>> pd.format(doc1="foo", doc2="bar")
            ParamDocs({'p1': 'foo', 'p2': 'bar'})
        """
        replacements = _replace_keys_with_placeholders(kwargs)
        return ParamDocs({k: _replace_all(v, replacements) for k, v in self.items()})

    def format_docstring(self, docstring: str) -> str:
        """
        Formats placeholders in `docstring`.

        Args:
            docstring: A docstring with placeholders to be replaced.
                If provided with None, will return None.

        .. code-block:: text
            :caption: Example

            >>> pd = ParamDocs(p1="doc1", p2="doc2
            doc2 second line")
            >>> docstring = '''
            ... Args:
            ...     p1: {{ p1 }}
            ...     p2: {{ p2 }}
            ... '''.strip()
            >>> print(pd.format_docstring(docstring))
            Args:
                p1: doc1
                p2: doc2
                    doc2 second line
        """
        if docstring is None:
            return None

        replacements = _replace_keys_with_placeholders(self)
        # Substitute line by line so each placeholder picks up the indentation
        # of the particular line it appears on.
        return "\n".join(_replace_all(line, replacements) for line in docstring.splitlines())


def format_docstring(param_docs):
    """
    Returns a decorator that replaces param doc placeholders (e.g. '{{ param_name }}') in the
    docstring of the decorated function.

    Args:
        param_docs: A `ParamDocs` instance or `dict`.

    Returns:
        A decorator to apply the formatting.

    .. code-block:: text
        :caption: Example

        >>> param_docs = {"p1": "doc1", "p2": "doc2
        doc2 second line"}
        >>> @format_docstring(param_docs)
        ... def func(p1, p2):
        ...     '''
        ...     Args:
        ...         p1: {{ p1 }}
        ...         p2: {{ p2 }}
        ...     '''
        >>> import textwrap
        >>> print(textwrap.dedent(func.__doc__).strip())
        Args:
            p1: doc1
            p2: doc2
                doc2 second line
    """
    param_docs = ParamDocs(param_docs)

    def decorator(func):
        func.__doc__ = param_docs.format_docstring(func.__doc__)
        return func

    return decorator


# `{{ ... }}` represents a placeholder.
LOG_MODEL_PARAM_DOCS = ParamDocs({
    "name": "Model name.",
    "conda_env": (
        """Either a dictionary representation of a Conda environment or the path to a conda
environment yaml file. If provided, this describes the environment this model should be run in.
At a minimum, it should specify the dependencies contained in `get_default_conda_env()`.
If ``None``, a conda environment with pip requirements inferred by
:func:`mlflow.models.infer_pip_requirements` is added
to the model. If the requirement inference fails, it falls back to using
`get_default_pip_requirements`. pip requirements from ``conda_env`` are written to a pip
``requirements.txt`` file and the full conda environment is written to ``conda.yaml``.
The following is an *example* dictionary representation of a conda environment::

    {
        "name": "mlflow-env",
        "channels": ["conda-forge"],
        "dependencies": [
            "python=3.8.15",
            {
                "pip": [
                    "{{ package_name }}==x.y.z"
                ],
            },
        ],
    }"""
    ),
    "pip_requirements": (
        """Either an iterable of pip requirement strings
(e.g. ``["{{ package_name }}", "-r requirements.txt", "-c constraints.txt"]``) or the string path to
a pip requirements file on the local filesystem (e.g. ``"requirements.txt"``). If provided, this
describes the environment this model should be run in. If ``None``, a default list of requirements
is inferred by :func:`mlflow.models.infer_pip_requirements` from the current software environment.
If the requirement inference fails, it falls back to using `get_default_pip_requirements`.
Both requirements and constraints are automatically parsed and written to ``requirements.txt`` and
``constraints.txt`` files, respectively, and stored as part of the model. Requirements are also
written to the ``pip`` section of the model's conda environment (``conda.yaml``) file."""
    ),
    "extra_pip_requirements": (
        """Either an iterable of pip
requirement strings
(e.g. ``["pandas", "-r requirements.txt", "-c constraints.txt"]``) or the string path to
a pip requirements file on the local filesystem (e.g. ``"requirements.txt"``). If provided, this
describes additional pip requirements that are appended to a default set of pip requirements
generated automatically based on the user's current software environment. Both requirements and
constraints are automatically parsed and written to ``requirements.txt`` and ``constraints.txt``
files, respectively, and stored as part of the model. Requirements are also written to the ``pip``
section of the model's conda environment (``conda.yaml``) file.

.. warning::
    The following arguments can't be specified at the same time:

    - ``conda_env``
    - ``pip_requirements``
    - ``extra_pip_requirements``

`This example <https://github.com/mlflow/mlflow/blob/master/examples/pip_requirements/pip_requirements.py>`_ demonstrates how to specify pip requirements using
``pip_requirements`` and ``extra_pip_requirements``."""  # noqa: E501
    ),
    "signature": (
        """an instance of the :py:class:`ModelSignature <mlflow.models.ModelSignature>`
class that describes the model's inputs and outputs. If not specified but an
``input_example`` is supplied, a signature will be automatically inferred
based on the supplied input example and model. To disable automatic signature
inference when providing an input example, set ``signature`` to ``False``.
To manually infer a model signature, call
:py:func:`infer_signature() <mlflow.models.infer_signature>` on datasets
with valid model inputs, such as a training dataset with the target column
omitted, and valid model outputs, like model predictions made on the training
dataset, for example:

.. code-block:: python

    from mlflow.models import infer_signature

    train = df.drop_column("target_label")
    predictions = ...  # compute model predictions
    signature = infer_signature(train, predictions)
"""
    ),
    "metadata": ("Custom metadata dictionary passed to the model and stored in the MLmodel file."),
    "input_example": (
        """one or several instances of valid model input. The input example is used
as a hint of what data to feed the model. It will be converted to a Pandas
DataFrame and then serialized to json using the Pandas split-oriented
format, or a numpy array where the example will be serialized to json
by converting it to a list. Bytes are base64-encoded. When the ``signature`` parameter is
``None``, the input example is used to infer a model signature.
"""
    ),
    "prompt_template": (
        """A string that, if provided, will be used to format the user's input prior
to inference. The string should contain a single placeholder, ``{prompt}``, which will be
replaced with the user's input. For example: ``"Answer the following question. Q: {prompt} A:"``.

Currently, only the following pipeline types are supported:

- `feature-extraction <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.FeatureExtractionPipeline>`_
- `fill-mask <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.FillMaskPipeline>`_
- `summarization <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.SummarizationPipeline>`_
- `text2text-generation <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.Text2TextGenerationPipeline>`_
- `text-generation <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.TextGenerationPipeline>`_

The following example shows how to log a text-generation pipeline with a prompt template and
use it via the ``python_function`` (pyfunc) flavor:

.. code-block:: python

    import mlflow
    from transformers import pipeline

    # Initialize a text-generation pipeline
    generator = pipeline("text-generation", model="gpt2")

    # Define a prompt template. The ``{prompt}`` placeholder will be replaced
    # with the raw user input at inference time.
    prompt_template = "Answer the following question concisely.\\n\\nQ: {prompt}\\nA:"

    example_prompt = "What is MLflow?"

    # Log the model with the prompt template and an input example
    with mlflow.start_run():
        model_info = mlflow.transformers.log_model(
            transformers_model=generator,
            name="qa_text_generator",
            prompt_template=prompt_template,
            input_example=example_prompt,
        )

    # Load the model back as a pyfunc model
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

    # The input to ``predict`` is the raw question string; the prompt template
    # is applied internally before calling the underlying transformers pipeline.
    loaded_model.predict("What is experiment tracking?")
"""
    ),
    "code_paths": (
        """A list of local filesystem paths to Python file dependencies (or directories
containing file dependencies). These files are *prepended* to the system path when the model
is loaded. Files declared as dependencies for a given model should have relative
imports declared from a common root path if multiple files are defined with import dependencies
between them to avoid import errors when loading the model.

For a detailed explanation of ``code_paths`` functionality, recommended usage patterns and
limitations, see the
`code_paths usage guide <https://mlflow.org/docs/latest/model/dependencies.html?highlight=code_paths#saving-extra-code-with-an-mlflow-model>`_.
"""
    ),
    "extra_files": (
        """A list containing the paths to corresponding extra files, if ``None``, no
extra files are added to the model. Remote URIs are resolved to absolute filesystem
paths. For example, consider the following ``extra_files`` list:

.. code-block:: python

    extra_files = ["s3://my-bucket/path/to/my_file1", "/local-path/to/my_file2"]

In this case, the ``"my_file1"`` extra file is downloaded from S3.
Model paths will be ["extra_files/my_file1", "extra_files/my_file2"] in the model directory.
"""
    ),
    # Only pyfunc flavor supports `infer_code_paths`.
    "code_paths_pyfunc": (
        """A list of local filesystem paths to Python file dependencies (or directories
containing file dependencies). These files are *prepended* to the system path when the model
is loaded. Files declared as dependencies for a given model should have relative
imports declared from a common root path if multiple files are defined with import dependencies
between them to avoid import errors when loading the model.

You can leave ``code_paths`` argument unset but set ``infer_code_paths`` to ``True`` to let MLflow
infer the model code paths. See ``infer_code_paths`` argument doc for details.

For a detailed explanation of ``code_paths`` functionality, recommended usage patterns and
limitations, see the
`code_paths usage guide <https://mlflow.org/docs/latest/model/dependencies.html?highlight=code_paths#saving-extra-code-with-an-mlflow-model>`_.
"""
    ),
    "infer_code_paths": (
        """If set to ``True``, MLflow automatically infers model code paths. The inferred
code path files only include necessary python module files. Only python code files
under current working directory are automatically inferable. Default value is
``False``.

.. warning::
    Please ensure that the custom python module code does not contain sensitive data such as
    credential token strings, otherwise they might be included in the automatic inferred code
    path files and be logged to MLflow artifact repository.

If your custom python module depends on non-python files (e.g. a JSON file) with a relative
path to the module code file path, the non-python files can't be automatically inferred as the
code path file. To address this issue, you should put all used non-python files outside
your custom code directory.

If a python code file is loaded as the python ``__main__`` module, then this code file can't be
inferred as the code path file. If your model depends on classes / functions defined in
``__main__`` module, you should use `cloudpickle` to dump your model instance in order to pickle
classes / functions in ``__main__``.

.. Note:: Experimental: This parameter may change or be removed in a future release without warning.
"""
    ),
    "save_pretrained": (
        """If set to ``False``, MLflow will not save the Transformer model weight files,
instead only saving the reference to the HuggingFace Hub model repository and its commit hash.
This is useful when you load the pretrained model from HuggingFace Hub and want to log or save
it to MLflow without modifying the model weights. In such case, specifying this flag to
``False`` will save the storage space and reduce time to save the model. Please refer to the
`Storage-Efficient Model Logging
<../../llms/transformers/large-models.html#transformers-save-pretrained-guide>`_ for more detailed
usage.


.. warning::

    If the model is saved with ``save_pretrained`` set to ``False``, the model cannot be
    registered to the MLflow Model Registry. In order to convert the model to the one that
    can be registered, you can use :py:func:`mlflow.transformers.persist_pretrained_model()`
    to download the model weights from the HuggingFace Hub and save it in the existing model
    artifacts. Please refer to `Transformers flavor documentation
    <../../llms/transformers/large-models.html#persist-pretrained-guide>`_
    for more detailed usage.

    .. code-block:: python

        import mlflow.transformers

        model_uri = "YOUR_MODEL_URI_LOGGED_WITH_SAVE_PRETRAINED_FALSE"
        model = mlflow.transformers.persist_pretrained_model(model_uri)
        mlflow.register_model(model_uri, "model_name")

.. important::

    When you save the `PEFT <https://huggingface.co/docs/peft/en/index>`_ model, MLflow will
    override the `save_pretrained` flag to `False` and only store the PEFT adapter weights. The
    base model weights are not saved but the reference to the HuggingFace repository and
    its commit hash are logged instead.
"""
    ),
    "auth_policy": (
        """Specifies the authentication policy for the model, which includes two key components.
Note that only one of `auth_policy` or `resources` should be defined.

- **System Auth Policy**: A list of resources required to serve this model.
- **User Auth Policy**: A minimal list of scopes that the user should have access to,
  in order to invoke this model.

.. Note::
    Experimental: This parameter may change or be removed in a future release without warning.
"""
    ),
    "params": "A dictionary of parameters to log with the model.",
    "tags": "A dictionary of tags to log with the model.",
    "model_type": "The type of the model.",
    "step": "The step at which to log the model outputs and metrics",
    "model_id": "The ID of the model.",
    "prompts": """\
A list of prompt URIs registered in the MLflow Prompt Registry, to be associated with the model.
Each prompt URI should be in the form ``prompt:/<name>/<version>``. The prompts should be
registered in the MLflow Prompt Registry before being associated with the model.

This will create a mutual link between the model and the prompt. The associated prompts can be
seen in the model's metadata stored in the MLmodel file. From the Prompt Registry UI, you can
navigate to the model as well.

.. code-block:: python

    import mlflow

    prompt_template = "Hi, {name}! How are you doing today?"

    # Register a prompt in the MLflow Prompt Registry
    mlflow.prompts.register_prompt("my_prompt", prompt_template, description="A simple prompt")

    # Log a model with the registered prompt
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            python_model=MyModel(),
            name="model",
            prompts=["prompt:/my_prompt/1"]
        )

    print(model_info.prompts)
    # Output: ['prompt:/my_prompt/1']

    # Load the prompt
    prompt = mlflow.genai.load_prompt(model_info.prompts[0])
""",
})


def get_module_min_and_max_supported_ranges(flavor_name):
    """
    Extracts the minimum and maximum supported package versions from the provided module name.
    The version information is provided via the yaml-to-python-script generation script in
    dev/update_ml_package_versions.py which writes a python file to the importable namespace of
    mlflow.ml_package_versions

    Args:
        flavor_name: The flavor name registered in ml_package_versions.py

    Returns:
        tuple of module name, minimum supported version, maximum supported version as strings.
    """
    if flavor_name == "pyspark.ml":
        # pyspark.ml is a special case of spark flavor
        flavor_name = "spark"

    module_name = _ML_PACKAGE_VERSIONS[flavor_name]["package_info"].get("module_name", flavor_name)
    versions = _ML_PACKAGE_VERSIONS[flavor_name]["models"]
    min_version = versions["minimum"]
    max_version = versions["maximum"]
    return module_name, min_version, max_version


def _do_version_compatibility_warning(msg: str):
    """
    Isolate the warn call to show the warning only once.
    """
    warnings.warn(msg, category=UserWarning, stacklevel=2)


def docstring_version_compatibility_warning(integration_name):
    """
    Generates a docstring that can be applied as a note stating a version compatibility range for
    a given flavor and optionally raises a warning if the installed version is outside of the
    supported range.

    Args:
        integration_name: The name of the module as stored within ml-package-versions.yml

    Returns:
        The wrapped function with the additional docstring header applied
    """

    def annotated_func(func):
        # NB: if using this decorator, ensure the package name to module name reference is
        # updated with the flavor's `save` and `load` functions being used within
        # ml-package-version.yml file.
        min_ver, max_ver, pip_release = get_min_max_version_and_pip_release(
            integration_name, "models"
        )
        notice = (
            f"The '{integration_name}' MLflow Models integration is known to be compatible with "
            f"``{min_ver}`` <= ``{pip_release}`` <= ``{max_ver}``. "
            f"MLflow Models integrations with {integration_name} may not succeed when used with "
            "package versions outside of this range."
        )

        # Prepend the compatibility notice; if the function has no docstring,
        # the notice becomes the entire docstring.
        func.__doc__ = (
            " .. Note:: " + notice + "\n" * 2 + func.__doc__ if func.__doc__ else notice
        )

        return func

    return annotated_func