import openai
import pandas as pd

import mlflow
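
# The OpenAI-backed model below calls the OpenAI API at both logging and
# evaluation time, so credentials must be set. This guard is an illustrative
# addition, assuming the standard OPENAI_API_KEY environment variable.
import os

assert "OPENAI_API_KEY" in os.environ, "Set OPENAI_API_KEY before running."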

# Evaluation data: questions paired with reference answers that the
# question-answering evaluator scores model output against.
eval_df = pd.DataFrame({
    "inputs": [
        "What is MLflow?",
        "What is Spark?",
        "What is Python?",
    ],
    "ground_truth": [
        "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
        "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks.",
        "Python is a high-level programming language that was created by Guido van Rossum and released in 1991. It emphasizes code readability and allows developers to express concepts in fewer lines of code than languages like C++ or Java. Python is used in various domains, including web development, scientific computing, data analysis, and machine learning.",
    ],
})

with mlflow.start_run():
    # Log a chat-completion model; "{question}" is a prompt-template
    # variable that mlflow.evaluate fills from the evaluation data.
    system_prompt = "Answer the following question in two sentences"
    logged_model = mlflow.openai.log_model(
        model="gpt-4o-mini",
        task=openai.chat.completions,
        name="model",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": "{question}"},
        ],
    )
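
    # Illustrative sanity check, not part of the original example: load the
    # logged model back as a generic pyfunc and answer one question. The
    # column name matches the "{question}" template variable.
    loaded = mlflow.pyfunc.load_model(logged_model.model_uri)
    print(loaded.predict(pd.DataFrame({"question": ["What is MLflow?"]})))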

    # Score the model's answers against the ground truth using the default
    # evaluator for the question-answering task type.
    results = mlflow.evaluate(
        logged_model.model_uri,
        eval_df,
        targets="ground_truth",
        model_type="question-answering",
        evaluators="default",
    )
    # Aggregate metrics computed across all rows.
    print(results.metrics)
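
    # mlflow.evaluate also accepts LLM-judged metrics through extra_metrics.
    # A sketch, not part of the original example, assuming the same OpenAI
    # credentials can reach a judge model:
    #
    #   from mlflow.metrics.genai import answer_similarity
    #
    #   results = mlflow.evaluate(
    #       logged_model.model_uri,
    #       eval_df,
    #       targets="ground_truth",
    #       model_type="question-answering",
    #       extra_metrics=[answer_similarity(model="openai:/gpt-4o-mini")],
    #   )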

    # Per-row results: each input, the model's generated answer, and the
    # row-level metric values.
    eval_table = results.tables["eval_results_table"]
    print(eval_table)
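
    # Optional, illustrative persistence of the per-row table; the CSV path
    # is arbitrary.
    eval_table.to_csv("qa_eval_results.csv", index=False)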