evaluate_with_custom_code_metrics.py
1 import os 2 3 import openai 4 import pandas as pd 5 6 import mlflow 7 from mlflow.metrics import make_metric 8 from mlflow.metrics.base import MetricValue, standard_aggregations 9 10 assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable." 11 12 13 # Helper function to check if a string is valid python code 14 def is_valid_python_code(code: str) -> bool: 15 try: 16 compile(code, "<string>", "exec") 17 return True 18 except SyntaxError: 19 return False 20 21 22 # Create an evaluation function that iterates through the predictions 23 def eval_fn(predictions): 24 scores = [int(is_valid_python_code(prediction)) for prediction in predictions] 25 return MetricValue( 26 scores=scores, 27 aggregate_results=standard_aggregations(scores), 28 ) 29 30 31 # Create an EvaluationMetric object for the python code metric 32 valid_code_metric = make_metric( 33 eval_fn=eval_fn, greater_is_better=False, name="valid_python_code", version="v1" 34 ) 35 36 eval_df = pd.DataFrame({ 37 "input": [ 38 "SELECT * FROM ", 39 "import pandas", 40 "def hello_world", 41 ], 42 }) 43 44 with mlflow.start_run() as run: 45 system_prompt = ( 46 "Generate code that is less than 50 characters. Return only python code and nothing else." 47 ) 48 logged_model = mlflow.openai.log_model( 49 model="gpt-4o-mini", 50 task=openai.chat.completions, 51 name="model", 52 messages=[ 53 {"role": "system", "content": system_prompt}, 54 {"role": "user", "content": "{question}"}, 55 ], 56 ) 57 58 results = mlflow.evaluate( 59 logged_model.model_uri, 60 eval_df, 61 model_type="text", 62 extra_metrics=[valid_code_metric], 63 ) 64 print(results) 65 66 eval_table = results.tables["eval_results_table"] 67 print(eval_table)