# scorers.py
"""Scorers that evaluate LLM outputs during prompt optimization.

Each scorer wraps a single quality metric (accuracy, precision, F1, ...)
and knows how to apply it per dataset split via ``OptimizationScorer.score``.
"""
from abc import ABC
from abc import abstractmethod
from typing import TYPE_CHECKING
from typing import Any
from typing import ClassVar
from typing import Dict
from typing import Optional

import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score
from sklearn.metrics import r2_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

from evidently.legacy.options.base import Options
from evidently.llm.optimization.optimizer import LLMDataset
from evidently.llm.optimization.optimizer import LLMDatasetSplit
from evidently.llm.optimization.optimizer import OptimizerContext
from evidently.llm.optimization.optimizer import Params
from evidently.pydantic_utils import AutoAliasMixin
from evidently.pydantic_utils import EvidentlyBaseModel
from evidently.utils.arg_type_registry import BaseArgTypeRegistry

from .errors import OptimizationConfigurationError

if TYPE_CHECKING:
    from .prompts import PromptExecutionLog


class OptimizationScorer(BaseArgTypeRegistry, AutoAliasMixin, EvidentlyBaseModel, ABC):
    """Abstract base class for optimization scorers.

    Scorers evaluate the quality of LLM outputs during prompt optimization,
    computing metrics like accuracy, precision, F1, etc. across dataset splits.
    """

    __alias_type__: ClassVar = "optimizer_scorer"

    class Config:
        is_base_type = True

    def get_name(self) -> str:
        """Get the name of this scorer.

        Returns:
            * Class name as string.
        """
        return self.__class__.__name__

    @abstractmethod
    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        """Compute the score for predictions against targets.

        Args:
            * `predictions`: `pd.Series` with model predictions.
            * `target`: `pd.Series` with target values.
            * `options`: Processing options.

        Returns:
            * Score value, or `None` if score cannot be computed.
        """
        raise NotImplementedError()

    @staticmethod
    def _required_target(target: Optional[pd.Series]) -> pd.Series:
        """Return `target`, raising if it is absent.

        Raises:
            * `OptimizationConfigurationError`: If `target` is `None`.
        """
        if target is None:
            raise OptimizationConfigurationError("Target is required for scoring")
        return target

    async def _score_or_zero(self, predictions: pd.Series, target: pd.Series, options: Options) -> float:
        """Score and coalesce a missing result to `0`.

        NOTE: `or 0` also maps an exact `0.0` score to the int `0`; the
        numeric value is unchanged, and this matches the original behavior.
        """
        return await self._score(predictions=predictions, target=target, options=options) or 0

    async def score(self, context: OptimizerContext, execution_log: "PromptExecutionLog") -> Optional[Dict[str, float]]:
        """Score predictions across dataset splits.

        Computes scores for each split (train, val, test) if available,
        or for all data if no splits are defined.

        Args:
            * `context`: `OptimizerContext` containing datasets and configuration.
            * `execution_log`: `PromptExecutionLog` with predictions to score.

        Returns:
            * Dictionary mapping split names to scores.

        Raises:
            * `OptimizationConfigurationError`: If target values are missing.
        """
        predictions = execution_log.result.get_predictions()
        if not context.has_param(Params.Dataset):
            # No dataset configured: compare every prediction against a
            # single expected target value.
            target_value: Any = context.get_param(Params.TargetValue)
            target = pd.Series([target_value] * len(predictions))
            return {
                LLMDatasetSplit.All: await self._score_or_zero(predictions, target, context.options),
            }
        dataset = context.get_param(Params.Dataset, LLMDataset)
        result: Dict[str, float] = {}
        for split in (LLMDatasetSplit.Train, LLMDatasetSplit.Test, LLMDatasetSplit.Val):
            if split not in dataset.split_masks:
                continue
            target = self._required_target(dataset[split].target)
            mask = dataset.split_masks[split]
            result[split] = await self._score_or_zero(predictions[mask], target, context.options)
        if not result:
            # No split masks defined: score the whole dataset at once.
            target = self._required_target(dataset.target)
            result[LLMDatasetSplit.All] = await self._score_or_zero(predictions, target, context.options)
        return result


class AccuracyScorer(OptimizationScorer):
    """Scorer that computes accuracy of predictions."""

    __registry_alias__: ClassVar = "accuracy"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # float() so the declared return type holds (mean() yields np.float64).
        return float((predictions == target).mean())


class MCCScorer(OptimizationScorer):
    """Scorer that computes Matthew's correlation coefficient."""

    __registry_alias__: ClassVar = "mcc"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # sklearn accepts Series directly; explicit to_numpy() is unnecessary.
        return float(matthews_corrcoef(target, predictions))


class PrecisionScorer(OptimizationScorer):
    """Scorer that computes Precision."""

    __registry_alias__: ClassVar = "precision"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(precision_score(target, predictions, average="macro", zero_division=0))


class RecallScorer(OptimizationScorer):
    """Scorer that computes Recall."""

    __registry_alias__: ClassVar = "recall"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(recall_score(target, predictions, average="macro", zero_division=0))


class F1Scorer(OptimizationScorer):
    """Scorer that computes F1 score."""

    __registry_alias__: ClassVar = "f1"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(f1_score(target, predictions, average="macro", zero_division=0))


class R2Scorer(OptimizationScorer):
    """Scorer that computes R² (regression only)."""

    __registry_alias__: ClassVar = "r2"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(r2_score(target, predictions))


class BalancedAccuracyScorer(OptimizationScorer):
    """Scorer that computes Balanced Accuracy."""

    __registry_alias__: ClassVar = "balanced_accuracy"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(balanced_accuracy_score(target, predictions))


class CohenKappaScorer(OptimizationScorer):
    """Scorer that computes Cohen's Kappa."""

    __registry_alias__: ClassVar = "cohen_kappa"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(cohen_kappa_score(target, predictions))


class RocAucScorer(OptimizationScorer):
    """Scorer that computes ROC AUC (probabilistic predictions required)."""

    __registry_alias__: ClassVar = "roc_auc"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        # predictions should be probabilities or scores
        return float(roc_auc_score(target, predictions, multi_class="ovr"))


class LogLossScorer(OptimizationScorer):
    """Scorer that computes Log Loss (probabilistic predictions required)."""

    __registry_alias__: ClassVar = "log_loss"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(log_loss(target, predictions))


class BrierScoreScorer(OptimizationScorer):
    """Scorer that computes Brier Score (probabilistic predictions required)."""

    __registry_alias__: ClassVar = "brier_score"

    async def _score(self, predictions: pd.Series, target: pd.Series, options: Options) -> Optional[float]:
        return float(brier_score_loss(target, predictions))