# src/evidently/llm/optimization/scorers.py
  1  from abc import ABC
  2  from abc import abstractmethod
  3  from typing import TYPE_CHECKING
  4  from typing import Any
  5  from typing import ClassVar
  6  from typing import Dict
  7  from typing import Optional
  8  
  9  import pandas as pd
 10  from sklearn.metrics import balanced_accuracy_score
 11  from sklearn.metrics import brier_score_loss
 12  from sklearn.metrics import cohen_kappa_score
 13  from sklearn.metrics import f1_score
 14  from sklearn.metrics import log_loss
 15  from sklearn.metrics import matthews_corrcoef
 16  from sklearn.metrics import precision_score
 17  from sklearn.metrics import r2_score
 18  from sklearn.metrics import recall_score
 19  from sklearn.metrics import roc_auc_score
 20  
 21  from evidently.legacy.options.base import Options
 22  from evidently.llm.optimization.optimizer import LLMDataset
 23  from evidently.llm.optimization.optimizer import LLMDatasetSplit
 24  from evidently.llm.optimization.optimizer import OptimizerContext
 25  from evidently.llm.optimization.optimizer import Params
 26  from evidently.pydantic_utils import AutoAliasMixin
 27  from evidently.pydantic_utils import EvidentlyBaseModel
 28  from evidently.utils.arg_type_registry import BaseArgTypeRegistry
 29  
 30  from .errors import OptimizationConfigurationError
 31  
 32  if TYPE_CHECKING:
 33      from .prompts import PromptExecutionLog
 34  
 35  
 36  class OptimizationScorer(BaseArgTypeRegistry, AutoAliasMixin, EvidentlyBaseModel, ABC):
 37      """Abstract base class for optimization scorers.
 38  
 39      Scorers evaluate the quality of LLM outputs during prompt optimization,
 40      computing metrics like accuracy, precision, F1, etc. across dataset splits.
 41      """
 42  
 43      __alias_type__: ClassVar = "optimizer_scorer"
 44  
 45      class Config:
 46          is_base_type = True
 47  
 48      def get_name(self) -> str:
 49          """Get the name of this scorer.
 50  
 51          Returns:
 52          * Class name as string.
 53          """
 54          return self.__class__.__name__
 55  
 56      @abstractmethod
 57      async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
 58          """Compute the score for predictions against targets.
 59  
 60          Args:
 61          * `predictions`: `pd.Series` with model predictions.
 62          * `target`: `pd.Series` with target values.
 63          * `options`: Processing options.
 64  
 65          Returns:
 66          * Score value, or `None` if score cannot be computed.
 67          """
 68          raise NotImplementedError()
 69  
 70      async def score(self, context: OptimizerContext, execution_log: "PromptExecutionLog") -> Optional[Dict[str, float]]:
 71          """Score predictions across dataset splits.
 72  
 73          Computes scores for each split (train, val, test) if available,
 74          or for all data if no splits are defined.
 75  
 76          Args:
 77          * `context`: `OptimizerContext` containing datasets and configuration.
 78          * `execution_log`: `PromptExecutionLog` with predictions to score.
 79  
 80          Returns:
 81          * Dictionary mapping split names to scores.
 82  
 83          Raises:
 84          * `OptimizationConfigurationError`: If target values are missing.
 85          """
 86          predictions = execution_log.result.get_predictions()
 87          if context.has_param(Params.Dataset):
 88              dataset = context.get_param(Params.Dataset, LLMDataset)
 89          else:
 90              target_value: Any = context.get_param(Params.TargetValue)
 91              return {
 92                  LLMDatasetSplit.All: await self._score(
 93                      predictions=predictions,
 94                      target=pd.Series([target_value] * len(predictions)),
 95                      options=context.options,
 96                  )
 97                  or 0,
 98              }
 99          result = {}
100          for split in (LLMDatasetSplit.Train, LLMDatasetSplit.Test, LLMDatasetSplit.Val):
101              if split not in dataset.split_masks:
102                  continue
103              target = dataset[split].target
104              if target is None:
105                  raise OptimizationConfigurationError("Target is required for scoring")
106              result[split] = await self._score(predictions[dataset.split_masks[split]], target, context.options) or 0
107          if len(result) == 0:
108              target = dataset.target
109              if target is None:
110                  raise OptimizationConfigurationError("Target is required for scoring")
111              result[LLMDatasetSplit.All] = await self._score(predictions, target, context.options) or 0
112          return result
113  
114  
class AccuracyScorer(OptimizationScorer):
    """Computes plain accuracy: the share of predictions equal to targets."""

    __registry_alias__: ClassVar = "accuracy"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # Series comparison is pandas index-aligned — assumes both carry
        # compatible indexes (TODO confirm against callers).
        matches = predictions == target
        return matches.mean()
122  
123  
class MCCScorer(OptimizationScorer):
    """Computes the Matthews correlation coefficient between targets and predictions."""

    __registry_alias__: ClassVar = "mcc"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # Convert to plain arrays so sklearn sees raw values, not indexed Series.
        y_true = target.to_numpy()
        y_pred = predictions.to_numpy()
        return matthews_corrcoef(y_true, y_pred)
131  
132  
class PrecisionScorer(OptimizationScorer):
    """Computes macro-averaged precision."""

    __registry_alias__: ClassVar = "precision"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # zero_division=0 keeps classes with no predicted samples from raising warnings.
        return precision_score(y_true=target, y_pred=predictions, average="macro", zero_division=0)
140  
141  
class RecallScorer(OptimizationScorer):
    """Computes macro-averaged recall."""

    __registry_alias__: ClassVar = "recall"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # zero_division=0 keeps classes absent from the targets from raising warnings.
        return recall_score(y_true=target, y_pred=predictions, average="macro", zero_division=0)
149  
150  
class F1Scorer(OptimizationScorer):
    """Computes the macro-averaged F1 score."""

    __registry_alias__: ClassVar = "f1"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # zero_division=0 yields 0 instead of a warning for degenerate classes.
        return f1_score(y_true=target, y_pred=predictions, average="macro", zero_division=0)
158  
159  
class R2Scorer(OptimizationScorer):
    """Computes the coefficient of determination R² (regression only)."""

    __registry_alias__: ClassVar = "r2"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return r2_score(y_true=target, y_pred=predictions)
167  
168  
class BalancedAccuracyScorer(OptimizationScorer):
    """Computes balanced accuracy (mean of per-class recalls)."""

    __registry_alias__: ClassVar = "balanced_accuracy"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return balanced_accuracy_score(y_true=target, y_pred=predictions)
176  
177  
class CohenKappaScorer(OptimizationScorer):
    """Computes Cohen's kappa, chance-corrected agreement between targets and predictions."""

    __registry_alias__: ClassVar = "cohen_kappa"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # cohen_kappa_score is symmetric in its two label sequences.
        annotations_a = target
        annotations_b = predictions
        return cohen_kappa_score(annotations_a, annotations_b)
185  
186  
class RocAucScorer(OptimizationScorer):
    """Computes ROC AUC; predictions must be probabilities or decision scores."""

    __registry_alias__: ClassVar = "roc_auc"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # multi_class="ovr": one-vs-rest averaging if more than two classes are present.
        return roc_auc_score(y_true=target, y_score=predictions, multi_class="ovr")
195  
196  
class LogLossScorer(OptimizationScorer):
    """Computes logarithmic loss; predictions must be probabilities."""

    __registry_alias__: ClassVar = "log_loss"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return log_loss(y_true=target, y_pred=predictions)
204  
205  
class BrierScoreScorer(OptimizationScorer):
    """Computes the Brier score; predictions must be probabilities."""

    __registry_alias__: ClassVar = "brier_score"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # Positional args: sklearn renamed the probability parameter across
        # versions, so keywords would be version-sensitive here.
        return brier_score_loss(target, predictions)