# examples/evaluation/evaluate_with_custom_code_metrics.py
 1  import os
 2  
 3  import openai
 4  import pandas as pd
 5  
 6  import mlflow
 7  from mlflow.metrics import make_metric
 8  from mlflow.metrics.base import MetricValue, standard_aggregations
 9  
# Fail fast if the OpenAI key is missing. An explicit raise is used instead of
# `assert`, because asserts are stripped when Python runs with -O and the guard
# would silently disappear.
if "OPENAI_API_KEY" not in os.environ:
    raise RuntimeError("Please set the OPENAI_API_KEY environment variable.")
11  
12  
13  # Helper function to check if a string is valid python code
14  def is_valid_python_code(code: str) -> bool:
15      try:
16          compile(code, "<string>", "exec")
17          return True
18      except SyntaxError:
19          return False
20  
21  
# Evaluation function: scores every prediction and packages the result.
def eval_fn(predictions):
    """Score each prediction as 1 if it is valid Python code, else 0.

    Returns a MetricValue carrying the per-row scores plus the standard
    aggregations (mean/variance/p90) over them.
    """
    validity_scores = []
    for candidate in predictions:
        validity_scores.append(int(is_valid_python_code(candidate)))
    return MetricValue(
        scores=validity_scores,
        aggregate_results=standard_aggregations(validity_scores),
    )
29  
30  
# Create an EvaluationMetric object for the python code metric.
# The score is 1 when a prediction compiles as valid Python and 0 otherwise,
# so a HIGHER score means a better model output — greater_is_better must be
# True, otherwise metric comparisons (e.g. model ranking) are inverted.
valid_code_metric = make_metric(
    eval_fn=eval_fn, greater_is_better=True, name="valid_python_code", version="v1"
)
35  
# Evaluation prompts: each is an incomplete or non-Python snippet, chosen to
# exercise the valid_python_code metric on the model's completions.
prompt_snippets = [
    "SELECT * FROM ",
    "import pandas",
    "def hello_world",
]
eval_df = pd.DataFrame({"input": prompt_snippets})
43  
# Log an OpenAI chat model, evaluate it on the prompts above, and print both
# the aggregate results and the per-row evaluation table — all under one run.
with mlflow.start_run() as run:
    # System prompt constraining the model to short, code-only replies.
    system_prompt = (
        "Generate code that is less than 50 characters. Return only python code and nothing else."
    )
    # Log the OpenAI chat-completions model to MLflow; "{question}" is the
    # template slot filled from the eval dataframe's "input" column.
    logged_model = mlflow.openai.log_model(
        model="gpt-4o-mini",
        task=openai.chat.completions,
        name="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
    )

    # Run evaluation against the logged model, adding the custom
    # valid_python_code metric on top of the standard "text" metrics.
    results = mlflow.evaluate(
        logged_model.model_uri,
        eval_df,
        model_type="text",
        extra_metrics=[valid_code_metric],
    )
    print(results)

    # Per-row results, including each prediction's valid_python_code score.
    eval_table = results.tables["eval_results_table"]
    print(eval_table)