recsys.py
from abc import ABC
from typing import ClassVar
from typing import Generic
from typing import List
from typing import Literal
from typing import Optional
from typing import Type
from typing import TypeVar
from typing import Union

import pandas as pd

from evidently.core.metric_types import BoundTest
from evidently.core.metric_types import DataframeMetric
from evidently.core.metric_types import DataframeValue
from evidently.core.metric_types import Metric
from evidently.core.metric_types import SingleValue
from evidently.core.metric_types import SingleValueCalculation
from evidently.core.metric_types import SingleValueMetric
from evidently.core.metric_types import TMetricResult
from evidently.core.report import Context
from evidently.core.report import Report
from evidently.core.report import _default_input_data_generator
from evidently.legacy.base_metric import InputData
from evidently.legacy.metrics import DiversityMetric as LegacyDiversityMetric
from evidently.legacy.metrics import FBetaTopKMetric as LegacyFBetaTopKMetric
from evidently.legacy.metrics import HitRateKMetric as LegacyHitRateKMetric
from evidently.legacy.metrics import ItemBiasMetric as LegacyItemBiasMetric
from evidently.legacy.metrics import MAPKMetric as LegacyMAPKMetric
from evidently.legacy.metrics import MRRKMetric as LegacyMRRKMetric
from evidently.legacy.metrics import NDCGKMetric as LegacyNDCGKMetric
from evidently.legacy.metrics import NoveltyMetric as LegacyNoveltyMetric
from evidently.legacy.metrics import PersonalizationMetric as LegacyPersonalizationMetric
from evidently.legacy.metrics import PopularityBias as LegacyPopularityBias
from evidently.legacy.metrics import PrecisionTopKMetric as LegacyPrecisionTopKMetric
from evidently.legacy.metrics import RecallTopKMetric as LegacyRecallTopKMetric
from evidently.legacy.metrics import RecCasesTable as LegacyRecCasesTable
from evidently.legacy.metrics import SerendipityMetric as LegacySerendipityMetric
from evidently.legacy.metrics import UserBiasMetric as LegacyUserBiasMetric
from evidently.legacy.metrics.recsys.base_top_k import TopKMetric
from evidently.legacy.metrics.recsys.base_top_k import TopKMetricResult
from evidently.legacy.metrics.recsys.diversity import DiversityMetricResult
from evidently.legacy.metrics.recsys.item_bias import ItemBiasMetricResult
from evidently.legacy.metrics.recsys.novelty import NoveltyMetricResult
from evidently.legacy.metrics.recsys.personalisation import PersonalizationMetricResult
from evidently.legacy.metrics.recsys.popularity_bias import PopularityBiasResult
from evidently.legacy.metrics.recsys.rec_examples import RecCasesTableResults
from evidently.legacy.metrics.recsys.scores_distribution import ScoreDistribution as ScoreDistributionLegacy
from evidently.legacy.metrics.recsys.scores_distribution import ScoreDistributionResult
from evidently.legacy.metrics.recsys.serendipity import SerendipityMetricResult
from evidently.legacy.metrics.recsys.user_bias import UserBiasMetricResult
from evidently.legacy.model.widget import BaseWidgetInfo
from evidently.legacy.utils.data_preprocessing import create_data_definition
from evidently.metrics._legacy import LegacyMetricCalculation
from evidently.tests import Reference
from evidently.tests import eq


def _gen_ranking_input_data(context: "Context", task_name: Optional[str]) -> InputData:
    default_data = _default_input_data_generator(context, None)
    if task_name is None:
        return default_data
    ranking = context.data_definition.get_ranking(task_name)
    if ranking is not None:
        default_data.column_mapping.user_id = ranking.user_id
        default_data.column_mapping.item_id = ranking.item_id
        default_data.column_mapping.recommendations_type = ranking.recommendations_type
        default_data.column_mapping.target = ranking.target
        default_data.column_mapping.prediction = ranking.prediction
        default_data.data_definition = create_data_definition(
            default_data.reference_data,
            default_data.current_data,
            default_data.column_mapping,
        )
    return default_data
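

# Illustrative note (a sketch, not a statement of the Evidently API): the metrics in
# this module resolve their columns through the helper above. A dataset only needs a
# ranking entry in its DataDefinition, e.g. the one built in main() at the bottom of
# this file:
#
#     DataDefinition(numerical_columns=["target", "prediction"], ranking=[Recsys()])
#
# Each metric then selects that ranking through its `ranking_name` field; the
# "default" name used below is assumed to match what Recsys() registers.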


class TopKBase(DataframeMetric):
    k: int
    min_rel_score: Optional[int] = None
    no_feedback_users: bool = False
    ranking_name: str = "default"


TTopKBase = TypeVar("TTopKBase", bound=TopKBase)


class LegacyTopKCalculation(
    LegacyMetricCalculation[
        DataframeValue,
        TTopKBase,
        TopKMetricResult,
        TopKMetric,
    ],
    Generic[TTopKBase],
    ABC,
):
    __legacy_metric_type__: ClassVar[Type[TopKMetric]]

    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self):
        return self.__legacy_metric_type__(
            k=self.metric.k, min_rel_score=self.metric.min_rel_score, no_feedback_users=self.metric.no_feedback_users
        )

    def calculate_value(
        self, context: "Context", legacy_result: TopKMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current_series = legacy_result.current
        current_df = pd.DataFrame(
            {
                "rank": current_series.index + 1,  # convert 0-based index to 1-based ranking
                "value": current_series.values,
            }
        )
        current_value = DataframeValue(display_name=self.display_name(), value=current_df)
        current_value.widget = render

        if legacy_result.reference is None:
            return current_value

        reference_series = legacy_result.reference
        reference_df = pd.DataFrame(
            {
                "rank": reference_series.index + 1,  # convert 0-based index to 1-based ranking
                "value": reference_series.values,
            }
        )
        reference_value = DataframeValue(display_name=self.display_name(), value=reference_df)
        reference_value.widget = []

        return current_value, reference_value

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class NDCG(TopKBase):
    """Calculate Normalized Discounted Cumulative Gain (NDCG@k) for ranking/recsys.

    NDCG measures ranking quality by considering position and relevance. Higher values
    indicate better ranking quality. Values range from 0 to 1.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class NDCGCalculation(LegacyTopKCalculation[NDCG]):
    __legacy_metric_type__: ClassVar = LegacyNDCGKMetric

    def display_name(self) -> str:
        return "NDCG@k"
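

# Illustrative sketch (hypothetical helper, not used by the metric above): a manual
# NDCG@k computation on a toy ranking, showing what the metric measures.
def _example_ndcg_at_k() -> float:
    import math

    relevance = [1, 0, 1]  # relevance of the items at ranks 1..3 for one user
    dcg = sum(rel / math.log2(rank + 1) for rank, rel in enumerate(relevance, start=1))
    ideal = sorted(relevance, reverse=True)  # best possible ordering of the same items
    idcg = sum(rel / math.log2(rank + 1) for rank, rel in enumerate(ideal, start=1))
    return dcg / idcg  # 1.5 / ~1.63, i.e. ~0.92 for this toy ranking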


class MRR(TopKBase):
    """Calculate Mean Reciprocal Rank (MRR@k) for ranking/recsys.

    MRR measures the average reciprocal rank of the first relevant item.
    Higher values indicate better ranking quality.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class MRRCalculation(LegacyTopKCalculation[MRR]):
    __legacy_metric_type__: ClassVar = LegacyMRRKMetric

    def display_name(self) -> str:
        return "MRR@k"


class HitRate(TopKBase):
    """Calculate Hit Rate@k for ranking/recsys.

    Hit Rate measures the proportion of users who have at least one relevant item
    in their top-k recommendations. Higher values indicate better coverage.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class HitRateCalculation(LegacyTopKCalculation[HitRate]):
    __legacy_metric_type__: ClassVar = LegacyHitRateKMetric

    def display_name(self) -> str:
        return "HitRate@k"


class MAP(TopKBase):
    """Calculate Mean Average Precision (MAP@k) for ranking/recsys.

    MAP measures the average precision across all users. Higher values indicate
    better ranking quality.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class MAPCalculation(LegacyTopKCalculation[MAP]):
    __legacy_metric_type__: ClassVar = LegacyMAPKMetric

    def display_name(self) -> str:
        return "MAP@k"


class RecallTopK(TopKBase):
    """Calculate Recall@k for ranking/recsys.

    Recall measures the proportion of relevant items found in the top-k recommendations.
    Higher values indicate better coverage of relevant items.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class RecallTopKCalculation(LegacyTopKCalculation[RecallTopK]):
    __legacy_metric_type__: ClassVar = LegacyRecallTopKMetric

    def display_name(self) -> str:
        return "Recall@k"


class PrecisionTopK(TopKBase):
    """Calculate Precision@k for ranking/recsys.

    Precision measures the proportion of relevant items in the top-k recommendations.
    Higher values indicate better precision.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    pass


class PrecisionTopKCalculation(LegacyTopKCalculation[PrecisionTopK]):
    __legacy_metric_type__: ClassVar = LegacyPrecisionTopKMetric

    def display_name(self) -> str:
        return "Precision@k"
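

# Illustrative sketch (hypothetical helper): manual Precision@k and Recall@k for a
# single user, to make the top-k docstrings above concrete.
def _example_precision_recall_at_k() -> tuple:
    recommended = ["a", "b", "c"]  # top-3 recommendations for one user
    relevant = {"a", "c", "d", "e"}  # ground-truth relevant items for that user
    hits = sum(1 for item in recommended if item in relevant)
    precision = hits / len(recommended)  # 2/3: share of recommendations that are relevant
    recall = hits / len(relevant)  # 2/4: share of relevant items that were recommended
    return precision, recall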


class FBetaTopK(TopKBase):
    """Calculate F-beta score@k for ranking/recsys.

    F-beta is a weighted harmonic mean of precision and recall. Beta controls
    the weighting: beta > 1 favors recall, beta < 1 favors precision.

    Args:
        * `k`: Number of top items to consider in the ranking.
        * `beta`: Weight of recall relative to precision (default: 1.0, the balanced F1).
        * `ranking_name`: Name of the ranking task (default: "default").
        * `min_rel_score`: Minimum relevance score threshold (optional).
        * `no_feedback_users`: Whether to exclude users with no feedback (default: False).
        * `tests`: Optional list of test conditions.
    """

    beta: Optional[float] = 1.0
    """Weight factor for recall vs precision (1.0 = balanced F1)."""


class FBetaTopKCalculation(LegacyTopKCalculation[FBetaTopK]):
    def display_name(self) -> str:
        return f"F{self.metric.beta}@k"

    def legacy_metric(self):
        return LegacyFBetaTopKMetric(
            k=self.metric.k,
            min_rel_score=self.metric.min_rel_score,
            no_feedback_users=self.metric.no_feedback_users,
            beta=self.metric.beta,
        )


class ScoreDistribution(SingleValueMetric):
    """Calculate score distribution entropy for ranking/recsys.

    Measures the diversity of recommendation scores using entropy. Higher entropy
    indicates a more diverse score distribution.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class ScoreDistributionCalculation(
    LegacyMetricCalculation[SingleValue, ScoreDistribution, ScoreDistributionResult, ScoreDistributionLegacy],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> ScoreDistributionLegacy:
        return ScoreDistributionLegacy(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: ScoreDistributionResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_entropy)
        current.widget = render
        if legacy_result.reference_entropy is None:
            return current
        return current, self.result(legacy_result.reference_entropy)

    def display_name(self) -> str:
        return "Score distribution"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)
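

# Illustrative sketch (hypothetical helper; the legacy estimator may differ in detail):
# a histogram-based entropy of predicted scores, the quantity ScoreDistribution reports.
def _example_score_entropy() -> float:
    import numpy as np

    scores = np.array([0.9, 0.8, 0.7, 0.4, 0.2, 0.1])  # top-k scores pooled over users
    counts, _ = np.histogram(scores, bins=3)  # bucket the scores
    probs = counts / counts.sum()
    probs = probs[probs > 0]  # drop empty bins before taking logs
    return float(-(probs * np.log(probs)).sum())  # higher = more spread-out scores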


class PopularityBiasMetric(SingleValueMetric):
    """Measure popularity bias in recommendations.

    Evaluates how much recommendations favor popular items. Can measure using
    Average Recommendation Popularity (ARP), coverage, or the Gini coefficient.
    """

    k: int
    """Number of top items to consider."""
    normalize_arp: bool = False
    """Whether to normalize ARP."""
    ranking_name: str = "default"
    """Name of the ranking task."""
    metric: Literal["arp", "coverage", "gini"] = "arp"
    """Metric type: ARP, coverage, or Gini coefficient."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class PopularityBiasCalculation(
    LegacyMetricCalculation[SingleValue, PopularityBiasMetric, PopularityBiasResult, LegacyPopularityBias],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyPopularityBias:
        return LegacyPopularityBias(k=self.metric.k, normalize_arp=self.metric.normalize_arp)

    def calculate_value(
        self, context: "Context", legacy_result: PopularityBiasResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        # PopularityBiasResult exposes current_apr, current_coverage and current_gini
        # (plus the matching reference_* fields); pick the one the metric asks for.
        if self.metric.metric == "coverage":
            current = self.result(legacy_result.current_coverage)
            current.widget = render
            if legacy_result.reference_coverage is None:
                return current
            return current, self.result(legacy_result.reference_coverage)
        if self.metric.metric == "gini":
            current = self.result(legacy_result.current_gini)
            current.widget = render
            if legacy_result.reference_gini is None:
                return current
            return current, self.result(legacy_result.reference_gini)
        # default: ARP (the legacy result field is spelled "apr")
        current = self.result(legacy_result.current_apr)
        current.widget = render
        if legacy_result.reference_apr is None:
            return current
        return current, self.result(legacy_result.reference_apr)

    def display_name(self) -> str:
        return f"Popularity Bias ({self.metric.metric})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)
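

# Illustrative sketch (hypothetical helper; the legacy formulas may differ in detail):
# the three popularity-bias views on toy recommendations.
def _example_popularity_bias() -> tuple:
    import numpy as np

    item_popularity = {"a": 100, "b": 50, "c": 10}  # training interaction counts
    catalog_size = 6  # total number of items available
    recommended = ["a", "b", "a", "c"]  # top-k items pooled over users
    arp = float(np.mean([item_popularity[i] for i in recommended]))  # avg popularity of recs
    coverage = len(set(recommended)) / catalog_size  # share of the catalog ever recommended
    counts = np.sort(np.array([recommended.count(i) for i in set(recommended)], dtype=float))
    n = len(counts)
    # Gini coefficient over recommendation counts: 0 = uniform, 1 = fully concentrated
    gini = float((2 * np.arange(1, n + 1) - n - 1) @ counts / (n * counts.sum()))
    return arp, coverage, gini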


class Personalization(SingleValueMetric):
    """Measure personalization (diversity between users' recommendations).

    Calculates how different recommendations are across users. Higher values
    indicate more personalized (diverse) recommendations per user.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class PersonalizationCalculation(
    LegacyMetricCalculation[SingleValue, Personalization, PersonalizationMetricResult, LegacyPersonalizationMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyPersonalizationMetric:
        return LegacyPersonalizationMetric(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: PersonalizationMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Personalization"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class Diversity(SingleValueMetric):
    """Measure diversity of recommended items within each user's list.

    Calculates how diverse items are within a user's top-k recommendations
    based on item features. Higher values indicate more diverse recommendations.
    """

    k: int
    """Number of top items to consider."""
    item_features: List[str]
    """List of feature column names for diversity calculation."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class DiversityCalculation(
    LegacyMetricCalculation[SingleValue, Diversity, DiversityMetricResult, LegacyDiversityMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyDiversityMetric:
        return LegacyDiversityMetric(k=self.metric.k, item_features=self.metric.item_features)

    def calculate_value(
        self, context: "Context", legacy_result: DiversityMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Diversity"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)
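

# Illustrative sketch (hypothetical helper; the legacy definition may differ):
# personalization as one minus the mean pairwise Jaccard overlap of users' top-k
# lists; identical lists give 0, fully disjoint lists give 1.
def _example_personalization() -> float:
    from itertools import combinations

    top_k = {"u1": {"a", "b", "c"}, "u2": {"a", "d", "e"}, "u3": {"f", "g", "h"}}
    overlaps = [
        len(top_k[u] & top_k[v]) / len(top_k[u] | top_k[v])  # Jaccard overlap per pair
        for u, v in combinations(top_k, 2)
    ]
    return 1 - sum(overlaps) / len(overlaps)  # ~0.93: mostly distinct recommendations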


class Serendipity(SingleValueMetric):
    """Measure serendipity (unexpected but relevant recommendations).

    Evaluates how surprising yet relevant recommendations are. Higher values
    indicate more serendipitous recommendations.
    """

    k: int
    """Number of top items to consider."""
    item_features: List[str]
    """List of feature column names for serendipity calculation."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class SerendipityCalculation(
    LegacyMetricCalculation[SingleValue, Serendipity, SerendipityMetricResult, LegacySerendipityMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacySerendipityMetric:
        return LegacySerendipityMetric(k=self.metric.k, item_features=self.metric.item_features)

    def calculate_value(
        self, context: "Context", legacy_result: SerendipityMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Serendipity"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class Novelty(SingleValueMetric):
    """Measure novelty (how new/unpopular recommended items are).

    Evaluates how novel (less popular) the recommended items are. Higher values
    indicate recommendations of less popular items.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class NoveltyCalculation(
    LegacyMetricCalculation[SingleValue, Novelty, NoveltyMetricResult, LegacyNoveltyMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyNoveltyMetric:
        return LegacyNoveltyMetric(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: NoveltyMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Novelty"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)
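

# Illustrative sketch (hypothetical helper; the legacy estimator may differ):
# novelty as the mean self-information -log2(p) of recommended items, where p is an
# item's share of training interactions; rarer items score higher.
def _example_novelty() -> float:
    import math

    interactions = {"a": 90, "b": 9, "c": 1}  # training interaction counts per item
    total = sum(interactions.values())
    recommended = ["b", "c"]  # recommending the two less popular items
    return sum(-math.log2(interactions[i] / total) for i in recommended) / len(recommended)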


class ItemBias(Metric):
    """Measure bias in recommendations towards specific item groups.

    Evaluates whether recommendations are biased towards certain item categories
    or groups. Returns a dataframe showing bias distribution across item groups.
    """

    k: int
    """Number of top items to consider."""
    column_name: str
    """Name of the column containing item group/category information."""
    distribution: Literal["default", "train"] = "default"
    """Distribution to use: "default" or "train"."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def get_bound_tests(self, context: "Context") -> List[BoundTest]:
        return []


class ItemBiasCalculation(
    LegacyMetricCalculation[DataframeValue, ItemBias, ItemBiasMetricResult, LegacyItemBiasMetric],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyItemBiasMetric:
        return LegacyItemBiasMetric(k=self.metric.k, column_name=self.metric.column_name)

    def calculate_value(
        self, context: "Context", legacy_result: ItemBiasMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        return _bias_result(self.metric, legacy_result, render, self.display_name())

    def display_name(self) -> str:
        return f"Item Bias ({self.metric.column_name}, {self.metric.distribution})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class UserBias(Metric):
    """Measure bias in recommendations towards specific user groups.

    Evaluates whether recommendations are biased towards certain user categories
    or groups. Returns a dataframe showing bias distribution across user groups.
    """

    column_name: str
    """Name of the column containing user group/category information."""
    distribution: Literal["default", "train"] = "default"
    """Distribution to use: "default" or "train"."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def get_bound_tests(self, context: "Context") -> List[BoundTest]:
        return []


class UserBiasCalculation(
    LegacyMetricCalculation[DataframeValue, UserBias, UserBiasMetricResult, LegacyUserBiasMetric],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyUserBiasMetric:
        return LegacyUserBiasMetric(column_name=self.metric.column_name)

    def calculate_value(
        self, context: "Context", legacy_result: UserBiasMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        return _bias_result(self.metric, legacy_result, render, self.display_name())

    def display_name(self) -> str:
        return f"User Bias ({self.metric.column_name}, {self.metric.distribution})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


def _bias_result(
    metric: Union[ItemBias, UserBias],
    legacy_result: Union[ItemBiasMetricResult, UserBiasMetricResult],
    render: List[BaseWidgetInfo],
    display_name: str,
) -> TMetricResult:
    if metric.distribution == "train":
        current_distr = legacy_result.current_train_distr
        reference_distr = legacy_result.reference_train_distr
    else:  # "default"
        current_distr = legacy_result.current_distr
        reference_distr = legacy_result.reference_distr

    # Workaround for a legacy bug: x (histogram bin edges) is one element longer
    # than y (bin counts), so we use bin centers instead of bin edges.
    if current_distr is not None:
        if len(current_distr.x) == len(current_distr.y) + 1:
            bin_centers = (current_distr.x[:-1] + current_distr.x[1:]) / 2  # type: ignore[operator]
            current_df = pd.DataFrame({"x": bin_centers, "y": current_distr.y})
        else:
            # lengths already match
            current_df = pd.DataFrame({"x": current_distr.x, "y": current_distr.y})
    else:
        current_df = pd.DataFrame({"x": [], "y": []})

    current_value = DataframeValue(display_name=display_name, value=current_df)
    current_value.widget = render

    if reference_distr is None:
        return current_value

    # apply the same fix to the reference distribution
    if len(reference_distr.x) == len(reference_distr.y) + 1:
        ref_bin_centers = (reference_distr.x[:-1] + reference_distr.x[1:]) / 2  # type: ignore[operator]
        reference_df = pd.DataFrame({"x": ref_bin_centers, "y": reference_distr.y})
    else:
        reference_df = pd.DataFrame({"x": reference_distr.x, "y": reference_distr.y})

    reference_value = DataframeValue(display_name=display_name, value=reference_df)
    reference_value.widget = []
    return current_value, reference_value
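

# Worked example of the bin-edges fix above (hypothetical helper): 4 histogram edges
# describe 3 bins, so pairing them with 3 counts requires midpoints.
def _example_bin_centers() -> pd.DataFrame:
    import numpy as np

    edges = np.array([0.0, 1.0, 2.0, 3.0])  # length 4: one more than the counts
    counts = np.array([5, 2, 1])  # length 3
    centers = (edges[:-1] + edges[1:]) / 2  # -> [0.5, 1.5, 2.5], same length as counts
    return pd.DataFrame({"x": centers, "y": counts})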


class RecCasesTable(DataframeMetric):
    """Display recommendation cases as a table for specific users.

    Shows detailed recommendation examples for specified users, including
    recommended items, scores, and optional features. Useful for debugging
    and understanding recommendation behavior.
    """

    user_ids: Optional[List[Union[int, str]]] = None
    """Optional list of user IDs to display (None = all users)."""
    display_features: Optional[List[str]] = None
    """Optional list of feature columns to display."""
    ranking_name: str = "default"
    """Name of the ranking task."""


class RecCasesTableCalculation(
    LegacyMetricCalculation[DataframeValue, RecCasesTable, RecCasesTableResults, LegacyRecCasesTable],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyRecCasesTable:
        return LegacyRecCasesTable(user_ids=self.metric.user_ids, display_features=self.metric.display_features)

    def calculate_value(
        self, context: "Context", legacy_result: RecCasesTableResults, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        # RecCasesTableResults holds current and reference as Dict[str, pd.DataFrame];
        # each per-user frame contains [prediction_name, item_id] plus the
        # display_features columns. Merge them into one frame with a user_id column.
        current_dfs = []
        for user_id, df in legacy_result.current.items():
            df_with_user = df.copy()
            df_with_user["user_id"] = user_id
            current_dfs.append(df_with_user)

        if current_dfs:
            current_merged = pd.concat(current_dfs, ignore_index=True)
        else:
            current_merged = pd.DataFrame()

        current_value = DataframeValue(display_name=self.display_name(), value=current_merged)
        current_value.widget = render

        if not legacy_result.reference:
            return current_value

        reference_dfs = []
        for user_id, df in legacy_result.reference.items():
            df_with_user = df.copy()
            df_with_user["user_id"] = user_id
            reference_dfs.append(df_with_user)

        if reference_dfs:
            reference_merged = pd.concat(reference_dfs, ignore_index=True)
        else:
            reference_merged = pd.DataFrame()

        reference_value = DataframeValue(display_name=self.display_name(), value=reference_merged)
        reference_value.widget = []

        return current_value, reference_value

    def display_name(self) -> str:
        return "Recommendation Cases Table"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)
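

# Illustrative sketch (hypothetical helper with made-up columns): the merge performed
# in RecCasesTableCalculation.calculate_value above, stacking per-user frames into a
# single frame with an explicit user_id column.
def _example_rec_cases_merge() -> pd.DataFrame:
    frames = {
        "u1": pd.DataFrame({"item_id": ["a", "b"], "score": [0.9, 0.7]}),
        "u2": pd.DataFrame({"item_id": ["c"], "score": [0.8]}),
    }
    parts = []
    for user_id, df in frames.items():
        part = df.copy()
        part["user_id"] = user_id  # remember which user each row came from
        parts.append(part)
    return pd.concat(parts, ignore_index=True)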


def main():
    def create_snapshot(i):
        # toy dataset: three users, constant targets and predictions
        df = pd.DataFrame(
            {
                "user_id": [j % 3 for j in range(i + 5)],
                "target": [0.5 for _ in range(i + 5)],
                "prediction": [0.5 for _ in range(i + 5)],
            }
        )
        from evidently.core.datasets import DataDefinition
        from evidently.core.datasets import Dataset
        from evidently.core.datasets import Recsys

        dataset = Dataset.from_pandas(
            df, data_definition=DataDefinition(numerical_columns=["target", "prediction"], ranking=[Recsys()])
        )
        report = Report(
            [
                NDCG(k=3, no_feedback_users=True),
                MRR(k=3),
                HitRate(k=3),
                ScoreDistribution(k=3),
                MAP(k=3),
                RecallTopK(k=3),
                PrecisionTopK(k=3),
                FBetaTopK(k=3),
            ]
        )
        snapshot_v2 = report.run(dataset, None)

        from evidently.ui.backport import snapshot_v2_to_v1

        snapshot_v1 = snapshot_v2_to_v1(snapshot_v2)
        return snapshot_v1

    sn = create_snapshot(10)
    sn.save("ndcg.json")


if __name__ == "__main__":
    main()