/ mlflow / utils / docstring_utils.py
docstring_utils.py
  1  import textwrap
  2  import warnings
  3  from typing import Any
  4  
  5  from mlflow.ml_package_versions import _ML_PACKAGE_VERSIONS
  6  from mlflow.utils.autologging_utils.versioning import (
  7      get_min_max_version_and_pip_release,
  8  )
  9  
 10  
 11  def _create_placeholder(key: str):
 12      return "{{ " + key + " }}"
 13  
 14  
 15  def _replace_keys_with_placeholders(d: dict[str, Any]) -> dict[str, Any]:
 16      return {_create_placeholder(k): v for k, v in d.items()}
 17  
 18  
 19  def _get_indentation_of_key(line: str, placeholder: str) -> str:
 20      index = line.find(placeholder)
 21      return (index * " ") if index != -1 else ""
 22  
 23  
 24  def _indent(text: str, indent: str) -> str:
 25      """Indent everything but first line in text."""
 26      lines = text.splitlines()
 27      if len(lines) <= 1:
 28          return text
 29  
 30      else:
 31          first_line = lines[0]
 32          subsequent_lines = "\n".join(list(lines[1:]))
 33          indented_subsequent_lines = textwrap.indent(subsequent_lines, indent)
 34          return first_line + "\n" + indented_subsequent_lines
 35  
 36  
 37  def _replace_all(text: str, replacements: dict[str, str]) -> str:
 38      """
 39      Replace all instances of replacements.keys() with their corresponding
 40      values in text. The replacements will be inserted on the same line
 41      with wrapping to the same level of indentation, for example:
 42  
 43      ```
 44      Args:
 45          param_1: {{ key }}
 46      ```
 47  
 48      will become...
 49  
 50      ```
 51      Args:
 52          param_1: replaced_value_at same indentation as prior
 53                   and if there are more lines they will also
 54                   have the same indentation.
 55      ```
 56      """
 57      for key, value in replacements.items():
 58          if key in text:
 59              indent = _get_indentation_of_key(text, key)
 60              indented_value = _indent(value, indent)
 61              text = text.replace(key, indented_value)
 62      return text
 63  
 64  
 65  class ParamDocs(dict):
 66      """
 67      Represents a set of parameter documents in the docstring.
 68      """
 69  
 70      def __repr__(self):
 71          return f"ParamDocs({super().__repr__()})"
 72  
 73      def format(self, **kwargs):
 74          """
 75          Formats values to be substituted in via the format_docstring() method.
 76  
 77          Args:
 78              kwargs: A `dict` in the form of `{"< placeholder name >": "< value >"}`.
 79  
 80          Returns:
 81              A new `ParamDocs` instance with the formatted param docs.
 82  
 83          .. code-block:: text
 84              :caption: Example
 85  
 86              >>> pd = ParamDocs(p1="{{ doc1 }}", p2="{{ doc2 }}")
 87              >>> pd.format(doc1="foo", doc2="bar")
 88              ParamDocs({'p1': 'foo', 'p2': 'bar'})
 89          """
 90          replacements = _replace_keys_with_placeholders(kwargs)
 91          return ParamDocs({k: _replace_all(v, replacements) for k, v in self.items()})
 92  
 93      def format_docstring(self, docstring: str) -> str:
 94          """
 95          Formats placeholders in `docstring`.
 96  
 97          Args:
 98              docstring: A docstring with placeholders to be replaced.
 99                  If provided with None, will return None.
100  
101          .. code-block:: text
102              :caption: Example
103  
104              >>> pd = ParamDocs(p1="doc1", p2="doc2
105              doc2 second line")
106              >>> docstring = '''
107              ... Args:
108              ...     p1: {{ p1 }}
109              ...     p2: {{ p2 }}
110              ... '''.strip()
111              >>> print(pd.format_docstring(docstring))
112          """
113          if docstring is None:
114              return None
115  
116          replacements = _replace_keys_with_placeholders(self)
117          lines = docstring.splitlines()
118          for i, line in enumerate(lines):
119              lines[i] = _replace_all(line, replacements)
120  
121          return "\n".join(lines)
122  
123  
def format_docstring(param_docs):
    """
    Build a decorator that fills ``{{ param_name }}`` placeholders in the
    decorated function's docstring.

    Args:
        param_docs: A `ParamDocs` instance or `dict`.

    Returns:
        A decorator to apply the formatting.

    .. code-block:: text
        :caption: Example

        >>> @format_docstring({"p1": "doc1"})
        ... def func(p1):
        ...     '''
        ...     Args:
        ...         p1: {{ p1 }}
        ...     '''
    """
    docs = ParamDocs(param_docs)

    def decorator(func):
        func.__doc__ = docs.format_docstring(func.__doc__)
        return func

    return decorator
162  
163  
# `{{ ... }}` represents a placeholder filled in via `format_docstring`.
LOG_MODEL_PARAM_DOCS = ParamDocs({
    "name": "Model name.",
    "conda_env": (
        """Either a dictionary representation of a Conda environment or the path to a conda
environment yaml file. If provided, this describes the environment this model should be run in.
At a minimum, it should specify the dependencies contained in `get_default_conda_env()`.
If ``None``, a conda environment with pip requirements inferred by
:func:`mlflow.models.infer_pip_requirements` is added
to the model. If the requirement inference fails, it falls back to using
`get_default_pip_requirements`. pip requirements from ``conda_env`` are written to a pip
``requirements.txt`` file and the full conda environment is written to ``conda.yaml``.
The following is an *example* dictionary representation of a conda environment::

    {
        "name": "mlflow-env",
        "channels": ["conda-forge"],
        "dependencies": [
            "python=3.8.15",
            {
                "pip": [
                    "{{ package_name }}==x.y.z"
                ],
            },
        ],
    }"""
    ),
    "pip_requirements": (
        """Either an iterable of pip requirement strings
(e.g. ``["{{ package_name }}", "-r requirements.txt", "-c constraints.txt"]``) or the string path to
a pip requirements file on the local filesystem (e.g. ``"requirements.txt"``). If provided, this
describes the environment this model should be run in. If ``None``, a default list of requirements
is inferred by :func:`mlflow.models.infer_pip_requirements` from the current software environment.
If the requirement inference fails, it falls back to using `get_default_pip_requirements`.
Both requirements and constraints are automatically parsed and written to ``requirements.txt`` and
``constraints.txt`` files, respectively, and stored as part of the model. Requirements are also
written to the ``pip`` section of the model's conda environment (``conda.yaml``) file."""
    ),
    "extra_pip_requirements": (
        """Either an iterable of pip
requirement strings
(e.g. ``["pandas", "-r requirements.txt", "-c constraints.txt"]``) or the string path to
a pip requirements file on the local filesystem (e.g. ``"requirements.txt"``). If provided, this
describes additional pip requirements that are appended to a default set of pip requirements
generated automatically based on the user's current software environment. Both requirements and
constraints are automatically parsed and written to ``requirements.txt`` and ``constraints.txt``
files, respectively, and stored as part of the model. Requirements are also written to the ``pip``
section of the model's conda environment (``conda.yaml``) file.

.. warning::
    The following arguments can't be specified at the same time:

    - ``conda_env``
    - ``pip_requirements``
    - ``extra_pip_requirements``

`This example <https://github.com/mlflow/mlflow/blob/master/examples/pip_requirements/pip_requirements.py>`_ demonstrates how to specify pip requirements using
``pip_requirements`` and ``extra_pip_requirements``."""  # noqa: E501
    ),
    "signature": (
        """an instance of the :py:class:`ModelSignature <mlflow.models.ModelSignature>`
class that describes the model's inputs and outputs. If not specified but an
``input_example`` is supplied, a signature will be automatically inferred
based on the supplied input example and model. To disable automatic signature
inference when providing an input example, set ``signature`` to ``False``.
To manually infer a model signature, call
:py:func:`infer_signature() <mlflow.models.infer_signature>` on datasets
with valid model inputs, such as a training dataset with the target column
omitted, and valid model outputs, like model predictions made on the training
dataset, for example:

.. code-block:: python

    from mlflow.models import infer_signature

    train = df.drop_column("target_label")
    predictions = ...  # compute model predictions
    signature = infer_signature(train, predictions)
"""
    ),
    "metadata": ("Custom metadata dictionary passed to the model and stored in the MLmodel file."),
    "input_example": (
        """one or several instances of valid model input. The input example is used
as a hint of what data to feed the model. It will be converted to a Pandas
DataFrame and then serialized to json using the Pandas split-oriented
format, or a numpy array where the example will be serialized to json
by converting it to a list. Bytes are base64-encoded. When the ``signature`` parameter is
``None``, the input example is used to infer a model signature.
"""
    ),
    "prompt_template": (
        """A string that, if provided, will be used to format the user's input prior
to inference. The string should contain a single placeholder, ``{prompt}``, which will be
replaced with the user's input. For example: ``"Answer the following question. Q: {prompt} A:"``.

Currently, only the following pipeline types are supported:

- `feature-extraction <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.FeatureExtractionPipeline>`_
- `fill-mask <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.FillMaskPipeline>`_
- `summarization <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.SummarizationPipeline>`_
- `text2text-generation <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.Text2TextGenerationPipeline>`_
- `text-generation <https://huggingface.co/transformers/main_classes/pipelines.html#transformers.TextGenerationPipeline>`_

The following example shows how to log a text-generation pipeline with a prompt template and
use it via the ``python_function`` (pyfunc) flavor:

.. code-block:: python

    import mlflow
    from transformers import pipeline

    # Initialize a text-generation pipeline
    generator = pipeline("text-generation", model="gpt2")

    # Define a prompt template. The ``{prompt}`` placeholder will be replaced
    # with the raw user input at inference time.
    prompt_template = "Answer the following question concisely.\\n\\nQ: {prompt}\\nA:"

    example_prompt = "What is MLflow?"

    # Log the model with the prompt template and an input example
    with mlflow.start_run():
        model_info = mlflow.transformers.log_model(
            transformers_model=generator,
            name="qa_text_generator",
            prompt_template=prompt_template,
            input_example=example_prompt,
        )

    # Load the model back as a pyfunc model
    loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

    # The input to ``predict`` is the raw question string; the prompt template
    # is applied internally before calling the underlying transformers pipeline.
    loaded_model.predict("What is experiment tracking?")
"""
    ),
    "code_paths": (
        """A list of local filesystem paths to Python file dependencies (or directories
containing file dependencies). These files are *prepended* to the system path when the model
is loaded. Files declared as dependencies for a given model should have relative
imports declared from a common root path if multiple files are defined with import dependencies
between them to avoid import errors when loading the model.

For a detailed explanation of ``code_paths`` functionality, recommended usage patterns and
limitations, see the
`code_paths usage guide <https://mlflow.org/docs/latest/model/dependencies.html?highlight=code_paths#saving-extra-code-with-an-mlflow-model>`_.
"""
    ),
    "extra_files": (
        """A list containing the paths to corresponding extra files, if ``None``, no
extra files are added to the model. Remote URIs are resolved to absolute filesystem
paths. For example, consider the following ``extra_files`` list:

.. code-block:: python

    extra_files = ["s3://my-bucket/path/to/my_file1", "/local-path/to/my_file2"]

In this case, the ``"my_file1"`` extra file is downloaded from S3.
Model paths will be ["extra_files/my_file1", "extra_files/my_file2"] in the model directory.
"""
    ),
    # Only pyfunc flavor supports `infer_code_paths`.
    "code_paths_pyfunc": (
        """A list of local filesystem paths to Python file dependencies (or directories
containing file dependencies). These files are *prepended* to the system path when the model
is loaded. Files declared as dependencies for a given model should have relative
imports declared from a common root path if multiple files are defined with import dependencies
between them to avoid import errors when loading the model.

You can leave ``code_paths`` argument unset but set ``infer_code_paths`` to ``True`` to let MLflow
infer the model code paths. See ``infer_code_paths`` argument doc for details.

For a detailed explanation of ``code_paths`` functionality, recommended usage patterns and
limitations, see the
`code_paths usage guide <https://mlflow.org/docs/latest/model/dependencies.html?highlight=code_paths#saving-extra-code-with-an-mlflow-model>`_.
"""
    ),
    "infer_code_paths": (
        """If set to ``True``, MLflow automatically infers model code paths. The inferred
            code path files only include necessary python module files. Only python code files
            under current working directory are automatically inferable. Default value is
            ``False``.

.. warning::
    Please ensure that the custom python module code does not contain sensitive data such as
    credential token strings, otherwise they might be included in the automatic inferred code
    path files and be logged to MLflow artifact repository.

    If your custom python module depends on non-python files (e.g. a JSON file) with a relative
    path to the module code file path, the non-python files can't be automatically inferred as the
    code path file. To address this issue, you should put all used non-python files outside
    your custom code directory.

    If a python code file is loaded as the python ``__main__`` module, then this code file can't be
    inferred as the code path file. If your model depends on classes / functions defined in
    ``__main__`` module, you should use `cloudpickle` to dump your model instance in order to pickle
    classes / functions in ``__main__``.

.. Note:: Experimental: This parameter may change or be removed in a future release without warning.
"""
    ),
    "save_pretrained": (
        """If set to ``False``, MLflow will not save the Transformer model weight files,
instead only saving the reference to the HuggingFace Hub model repository and its commit hash.
This is useful when you load the pretrained model from HuggingFace Hub and want to log or save
it to MLflow without modifying the model weights. In such case, specifying this flag to
``False`` will save the storage space and reduce time to save the model. Please refer to the
`Storage-Efficient Model Logging
<../../llms/transformers/large-models.html#transformers-save-pretrained-guide>`_ for more detailed
usage.


.. warning::

    If the model is saved with ``save_pretrained`` set to ``False``, the model cannot be
    registered to the MLflow Model Registry. In order to convert the model to the one that
    can be registered, you can use :py:func:`mlflow.transformers.persist_pretrained_model()`
    to download the model weights from the HuggingFace Hub and save it in the existing model
    artifacts. Please refer to `Transformers flavor documentation
    <../../llms/transformers/large-models.html#persist-pretrained-guide>`_
    for more detailed usage.

    .. code-block:: python

        import mlflow.transformers

        model_uri = "YOUR_MODEL_URI_LOGGED_WITH_SAVE_PRETRAINED_FALSE"
        model = mlflow.transformers.persist_pretrained_model(model_uri)
        mlflow.register_model(model_uri, "model_name")

.. important::

    When you save the `PEFT <https://huggingface.co/docs/peft/en/index>`_ model, MLflow will
    override the `save_pretrained` flag to `False` and only store the PEFT adapter weights. The
    base model weights are not saved but the reference to the HuggingFace repository and
    its commit hash are logged instead.
"""
    ),
    "auth_policy": (
        """Specifies the authentication policy for the model, which includes two key components.
            Note that only one of `auth_policy` or `resources` should be defined.

                - **System Auth Policy**: A list of resources required to serve this model.
                - **User Auth Policy**: A minimal list of scopes that the user should have access
                    to, in order to invoke this model.

    .. Note::
        Experimental: This parameter may change or be removed in a future release without warning.
            """
    ),
    "params": "A dictionary of parameters to log with the model.",
    "tags": "A dictionary of tags to log with the model.",
    "model_type": "The type of the model.",
    "step": "The step at which to log the model outputs and metrics",
    "model_id": "The ID of the model.",
    "prompts": """\
A list of prompt URIs registered in the MLflow Prompt Registry, to be associated with the model.
Each prompt URI should be in the form ``prompt:/<name>/<version>``. The prompts should be
registered in the MLflow Prompt Registry before being associated with the model.

This will create a mutual link between the model and the prompt. The associated prompts can be
seen in the model's metadata stored in the MLmodel file. From the Prompt Registry UI, you can
navigate to the model as well.

.. code-block:: python

    import mlflow

    prompt_template = "Hi, {name}! How are you doing today?"

    # Register a prompt in the MLflow Prompt Registry
    mlflow.prompts.register_prompt("my_prompt", prompt_template, description="A simple prompt")

    # Log a model with the registered prompt
    with mlflow.start_run():
        model_info = mlflow.pyfunc.log_model(
            python_model=MyModel(),
            name="model",
            prompts=["prompt:/my_prompt/1"]
        )

    print(model_info.prompts)
    # Output: ['prompt:/my_prompt/1']

    # Load the prompt
    prompt = mlflow.genai.load_prompt(model_info.prompts[0])
""",
})
453  
454  
def get_module_min_and_max_supported_ranges(flavor_name):
    """
    Extracts the minimum and maximum supported package versions from the provided module name.
    The version information is provided via the yaml-to-python-script generation script in
    dev/update_ml_package_versions.py which writes a python file to the importable namespace of
    mlflow.ml_package_versions

    Args:
        flavor_name: The flavor name registered in ml_package_versions.py

    Returns:
        tuple of module name, minimum supported version, maximum supported version as strings.
    """
    # pyspark.ml is a special case: its versions are recorded under the "spark" flavor.
    lookup_key = "spark" if flavor_name == "pyspark.ml" else flavor_name

    entry = _ML_PACKAGE_VERSIONS[lookup_key]
    module_name = entry["package_info"].get("module_name", lookup_key)
    model_versions = entry["models"]
    return module_name, model_versions["minimum"], model_versions["maximum"]
477  
478  
479  def _do_version_compatibility_warning(msg: str):
480      """
481      Isolate the warn call to show the warning only once.
482      """
483      warnings.warn(msg, category=UserWarning, stacklevel=2)
484  
485  
def docstring_version_compatibility_warning(integration_name):
    """
    Generates a docstring that can be applied as a note stating a version compatibility range for
    a given flavor and optionally raises a warning if the installed version is outside of the
    supported range.

    Args:
        integration_name: The name of the module as stored within ml-package-versions.yml

    Returns:
        The wrapped function with the additional docstring header applied
    """

    def annotated_func(func):
        # NB: if using this decorator, ensure the package name to module name reference is
        # updated with the flavor's `save` and `load` functions being used within
        # ml-package-version.yml file.
        lower, upper, release_name = get_min_max_version_and_pip_release(
            integration_name, "models"
        )
        notice = (
            f"The '{integration_name}' MLflow Models integration is known to be compatible with "
            f"``{lower}`` <= ``{release_name}`` <= ``{upper}``. "
            f"MLflow Models integrations with {integration_name} may not succeed when used with "
            "package versions outside of this range."
        )

        # Prepend the compatibility note to an existing docstring, or use the
        # note alone when the function has none.
        if func.__doc__:
            func.__doc__ = "    .. Note:: " + notice + "\n\n" + func.__doc__
        else:
            func.__doc__ = notice

        return func

    return annotated_func