# src/evidently/metrics/recsys.py
from abc import ABC
from typing import ClassVar
from typing import Generic
from typing import List
from typing import Literal
from typing import Optional
from typing import Type
from typing import TypeVar
from typing import Union

import pandas as pd

from evidently.core.metric_types import BoundTest
from evidently.core.metric_types import DataframeMetric
from evidently.core.metric_types import DataframeValue
from evidently.core.metric_types import Metric
from evidently.core.metric_types import SingleValue
from evidently.core.metric_types import SingleValueCalculation
from evidently.core.metric_types import SingleValueMetric
from evidently.core.metric_types import TMetricResult
from evidently.core.report import Context
from evidently.core.report import Report
from evidently.core.report import _default_input_data_generator
from evidently.legacy.base_metric import InputData
from evidently.legacy.metrics import DiversityMetric as LegacyDiversityMetric
from evidently.legacy.metrics import FBetaTopKMetric as LegacyFBetaTopKMetric
from evidently.legacy.metrics import HitRateKMetric as LegacyHitRateKMetric
from evidently.legacy.metrics import ItemBiasMetric as LegacyItemBiasMetric
from evidently.legacy.metrics import MAPKMetric as LegacyMAPKMetric
from evidently.legacy.metrics import MRRKMetric as LegacyMRRKMetric
from evidently.legacy.metrics import NDCGKMetric as LegacyNDCGKMetric
from evidently.legacy.metrics import NoveltyMetric as LegacyNoveltyMetric
from evidently.legacy.metrics import PersonalizationMetric as LegacyPersonalizationMetric
from evidently.legacy.metrics import PopularityBias as LegacyPopularityBias
from evidently.legacy.metrics import PrecisionTopKMetric as LegacyPrecisionTopKMetric
from evidently.legacy.metrics import RecallTopKMetric as LegacyRecallTopKMetric
from evidently.legacy.metrics import RecCasesTable as LegacyRecCasesTable
from evidently.legacy.metrics import SerendipityMetric as LegacySerendipityMetric
from evidently.legacy.metrics import UserBiasMetric as LegacyUserBiasMetric
from evidently.legacy.metrics.recsys.base_top_k import TopKMetric
from evidently.legacy.metrics.recsys.base_top_k import TopKMetricResult
from evidently.legacy.metrics.recsys.diversity import DiversityMetricResult
from evidently.legacy.metrics.recsys.item_bias import ItemBiasMetricResult
from evidently.legacy.metrics.recsys.novelty import NoveltyMetricResult
from evidently.legacy.metrics.recsys.personalisation import PersonalizationMetricResult
from evidently.legacy.metrics.recsys.popularity_bias import PopularityBiasResult
from evidently.legacy.metrics.recsys.rec_examples import RecCasesTableResults
from evidently.legacy.metrics.recsys.scores_distribution import ScoreDistribution as ScoreDistributionLegacy
from evidently.legacy.metrics.recsys.scores_distribution import ScoreDistributionResult
from evidently.legacy.metrics.recsys.serendipity import SerendipityMetricResult
from evidently.legacy.metrics.recsys.user_bias import UserBiasMetricResult
from evidently.legacy.model.widget import BaseWidgetInfo
from evidently.legacy.utils.data_preprocessing import create_data_definition
from evidently.metrics._legacy import LegacyMetricCalculation
from evidently.tests import Reference
from evidently.tests import eq


def _gen_ranking_input_data(context: "Context", task_name: Optional[str]) -> InputData:
    """Build legacy InputData, mapping the named ranking definition onto the legacy column mapping."""
    default_data = _default_input_data_generator(context, None)
    if task_name is None:
        return default_data
    ranking = context.data_definition.get_ranking(task_name)
    if ranking is not None:
        default_data.column_mapping.user_id = ranking.user_id
        default_data.column_mapping.item_id = ranking.item_id
        default_data.column_mapping.recommendations_type = ranking.recommendations_type
        default_data.column_mapping.target = ranking.target
        default_data.column_mapping.prediction = ranking.prediction
        default_data.data_definition = create_data_definition(
            default_data.reference_data,
            default_data.current_data,
            default_data.column_mapping,
        )
    return default_data


class TopKBase(DataframeMetric):
    k: int
    """Number of top items to consider in the ranking."""
    min_rel_score: Optional[int] = None
    """Minimum relevance score threshold (optional)."""
    no_feedback_users: bool = False
    """Whether to include users with no feedback in the calculation."""
    ranking_name: str = "default"
    """Name of the ranking task."""


TTopKBase = TypeVar("TTopKBase", bound=TopKBase)


class LegacyTopKCalculation(
    LegacyMetricCalculation[
        DataframeValue,
        TTopKBase,
        TopKMetricResult,
        TopKMetric,
    ],
    Generic[TTopKBase],
    ABC,
):
    __legacy_metric_type__: ClassVar[Type[TopKMetric]]

    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self):
        return self.__legacy_metric_type__(
            k=self.metric.k, min_rel_score=self.metric.min_rel_score, no_feedback_users=self.metric.no_feedback_users
        )

    def calculate_value(
        self, context: "Context", legacy_result: TopKMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current_series = legacy_result.current
        current_df = pd.DataFrame(
            {
                "rank": current_series.index + 1,  # convert 0-based index to 1-based rank
                "value": current_series.values,
            }
        )
        current_value = DataframeValue(display_name=self.display_name(), value=current_df)
        current_value.widget = render

        if legacy_result.reference is None:
            return current_value

        reference_series = legacy_result.reference
        reference_df = pd.DataFrame(
            {
                "rank": reference_series.index + 1,  # convert 0-based index to 1-based rank
                "value": reference_series.values,
            }
        )
        reference_value = DataframeValue(display_name=self.display_name(), value=reference_df)
        reference_value.widget = []

        return current_value, reference_value

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


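# Each concrete calculation below wraps one legacy top-k metric. The resulting
# DataframeValue holds one row per cutoff from 1 to k, e.g. for k=3
# (illustrative values only, not real output):
#
#     rank  value
#        1   0.61
#        2   0.58
#        3   0.55

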
class NDCG(TopKBase):
    """Calculate Normalized Discounted Cumulative Gain (NDCG@k) for ranking/recsys.

    NDCG measures ranking quality by considering both position and relevance:
    relevant items ranked higher contribute more to the score. Values range
    from 0 to 1; higher values indicate better ranking quality.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class NDCGCalculation(LegacyTopKCalculation[NDCG]):
    __legacy_metric_type__: ClassVar = LegacyNDCGKMetric

    def display_name(self) -> str:
        return "NDCG@k"


class MRR(TopKBase):
    """Calculate Mean Reciprocal Rank (MRR@k) for ranking/recsys.

    MRR averages the reciprocal rank of the first relevant item across users.
    Higher values indicate better ranking quality.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class MRRCalculation(LegacyTopKCalculation[MRR]):
    __legacy_metric_type__: ClassVar = LegacyMRRKMetric

    def display_name(self) -> str:
        return "MRR@k"


class HitRate(TopKBase):
    """Calculate Hit Rate@k for ranking/recsys.

    Hit Rate measures the proportion of users who have at least one relevant item
    in their top-k recommendations. Higher values indicate better coverage.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class HitRateCalculation(LegacyTopKCalculation[HitRate]):
    __legacy_metric_type__: ClassVar = LegacyHitRateKMetric

    def display_name(self) -> str:
        return "HitRate@k"


class MAP(TopKBase):
    """Calculate Mean Average Precision (MAP@k) for ranking/recsys.

    MAP averages the precision at each relevant position within the top-k list,
    then averages those scores across users. Higher values indicate better
    ranking quality.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class MAPCalculation(LegacyTopKCalculation[MAP]):
    __legacy_metric_type__: ClassVar = LegacyMAPKMetric

    def display_name(self) -> str:
        return "MAP@k"


class RecallTopK(TopKBase):
    """Calculate Recall@k for ranking/recsys.

    Recall measures the proportion of relevant items found in the top-k recommendations.
    Higher values indicate better coverage of relevant items.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class RecallTopKCalculation(LegacyTopKCalculation[RecallTopK]):
    __legacy_metric_type__: ClassVar = LegacyRecallTopKMetric

    def display_name(self) -> str:
        return "Recall@k"


class PrecisionTopK(TopKBase):
    """Calculate Precision@k for ranking/recsys.

    Precision measures the proportion of relevant items among the top-k
    recommendations. Higher values indicate fewer irrelevant recommendations.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """


class PrecisionTopKCalculation(LegacyTopKCalculation[PrecisionTopK]):
    __legacy_metric_type__: ClassVar = LegacyPrecisionTopKMetric

    def display_name(self) -> str:
        return "Precision@k"


class FBetaTopK(TopKBase):
    """Calculate F-beta score@k for ranking/recsys.

    F-beta is a weighted harmonic mean of precision and recall at k. Beta controls
    the weight: beta > 1 favors recall, beta < 1 favors precision.

    Args:
    * `k`: Number of top items to consider in the ranking.
    * `beta`: Weight of recall relative to precision (default: 1.0, the balanced F1).
    * `ranking_name`: Name of the ranking task (default: "default").
    * `min_rel_score`: Minimum relevance score threshold (optional).
    * `no_feedback_users`: Whether to include users with no feedback in the calculation (default: False).
    * `tests`: Optional list of test conditions.
    """

    beta: Optional[float] = 1.0
    """Weight factor for recall vs precision (1.0 = balanced F1)."""


class FBetaTopKCalculation(LegacyTopKCalculation[FBetaTopK]):
    def display_name(self) -> str:
        return f"F{self.metric.beta}@k"

    def legacy_metric(self):
        return LegacyFBetaTopKMetric(
            k=self.metric.k,
            min_rel_score=self.metric.min_rel_score,
            no_feedback_users=self.metric.no_feedback_users,
            beta=self.metric.beta,
        )


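# Usage sketch for the top-k metrics (mirrors main() below; `current_dataset`
# stands in for an evidently Dataset whose DataDefinition carries a ranking
# entry, here the default one):
#
#     report = Report([NDCG(k=10), RecallTopK(k=10), PrecisionTopK(k=10)])
#     snapshot = report.run(current_dataset, None)

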
class ScoreDistribution(SingleValueMetric):
    """Calculate score distribution entropy for ranking/recsys.

    Measures the diversity of recommendation scores using entropy. Higher entropy
    indicates a more diverse score distribution.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        # default: the current value must match the reference within 10% relative tolerance
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class ScoreDistributionCalculation(
    LegacyMetricCalculation[SingleValue, ScoreDistribution, ScoreDistributionResult, ScoreDistributionLegacy],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> ScoreDistributionLegacy:
        return ScoreDistributionLegacy(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: ScoreDistributionResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_entropy)
        current.widget = render
        if legacy_result.reference_entropy is None:
            return current
        return current, self.result(legacy_result.reference_entropy)

    def display_name(self) -> str:
        return "Score distribution"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


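# The default reference test above can be overridden per metric; a sketch with
# a wider 20% relative band (parameters illustrative):
#
#     ScoreDistribution(k=10, tests=[eq(Reference(relative=0.2))])

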
class PopularityBiasMetric(SingleValueMetric):
    """Measure popularity bias in recommendations.

    Evaluates how much recommendations favor popular items. The reported value
    is one of Average Recommendation Popularity (ARP), coverage, or the Gini
    coefficient, selected via the `metric` field.
    """

    k: int
    """Number of top items to consider."""
    normalize_arp: bool = False
    """Whether to normalize ARP."""
    ranking_name: str = "default"
    """Name of the ranking task."""
    metric: Literal["arp", "coverage", "gini"] = "arp"
    """Metric type: ARP, coverage, or Gini coefficient."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class PopularityBiasCalculation(
    LegacyMetricCalculation[SingleValue, PopularityBiasMetric, PopularityBiasResult, LegacyPopularityBias],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyPopularityBias:
        return LegacyPopularityBias(k=self.metric.k, normalize_arp=self.metric.normalize_arp)

    def calculate_value(
        self, context: "Context", legacy_result: PopularityBiasResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        # PopularityBiasResult exposes current_apr, current_coverage and current_gini
        # (note: the legacy result spells ARP as "apr")
        if self.metric.metric == "coverage":
            current = self.result(legacy_result.current_coverage)
            current.widget = render
            if legacy_result.reference_coverage is None:
                return current
            return current, self.result(legacy_result.reference_coverage)
        if self.metric.metric == "gini":
            current = self.result(legacy_result.current_gini)
            current.widget = render
            if legacy_result.reference_gini is None:
                return current
            return current, self.result(legacy_result.reference_gini)
        # default: ARP
        current = self.result(legacy_result.current_apr)
        current.widget = render
        if legacy_result.reference_apr is None:
            return current
        return current, self.result(legacy_result.reference_apr)

    def display_name(self) -> str:
        return f"Popularity Bias ({self.metric.metric})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


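# Variant selection sketch: the `metric` field picks which legacy value is
# reported (parameters illustrative):
#
#     PopularityBiasMetric(k=10, metric="gini")

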
class Personalization(SingleValueMetric):
    """Measure personalization (diversity between users' recommendations).

    Calculates how different recommendations are across users. Higher values
    indicate more personalized (diverse) recommendations per user.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class PersonalizationCalculation(
    LegacyMetricCalculation[SingleValue, Personalization, PersonalizationMetricResult, LegacyPersonalizationMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyPersonalizationMetric:
        return LegacyPersonalizationMetric(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: PersonalizationMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Personalization"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class Diversity(SingleValueMetric):
    """Measure diversity of recommended items within each user's list.

    Calculates how diverse items are within a user's top-k recommendations
    based on item features. Higher values indicate more diverse recommendations.
    """

    k: int
    """Number of top items to consider."""
    item_features: List[str]
    """List of feature column names for diversity calculation."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class DiversityCalculation(
    LegacyMetricCalculation[SingleValue, Diversity, DiversityMetricResult, LegacyDiversityMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyDiversityMetric:
        return LegacyDiversityMetric(k=self.metric.k, item_features=self.metric.item_features)

    def calculate_value(
        self, context: "Context", legacy_result: DiversityMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Diversity"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


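# Sketch with hypothetical feature columns; item_features must name columns
# that exist in the dataset (names illustrative):
#
#     Diversity(k=10, item_features=["genre", "price"])

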
class Serendipity(SingleValueMetric):
    """Measure serendipity (unexpected but relevant recommendations).

    Evaluates how surprising yet relevant recommendations are. Higher values
    indicate more serendipitous recommendations.
    """

    k: int
    """Number of top items to consider."""
    item_features: List[str]
    """List of feature column names for serendipity calculation."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class SerendipityCalculation(
    LegacyMetricCalculation[SingleValue, Serendipity, SerendipityMetricResult, LegacySerendipityMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacySerendipityMetric:
        return LegacySerendipityMetric(k=self.metric.k, item_features=self.metric.item_features)

    def calculate_value(
        self, context: "Context", legacy_result: SerendipityMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Serendipity"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class Novelty(SingleValueMetric):
    """Measure novelty (how new/unpopular recommended items are).

    Evaluates how novel (less popular) the recommended items are. Higher values
    indicate recommendations of less popular items.
    """

    k: int
    """Number of top items to consider."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def _default_tests_with_reference(self, context: Context) -> List[BoundTest]:
        return [
            eq(Reference(relative=0.1)).bind_single(self.get_fingerprint()),
        ]


class NoveltyCalculation(
    LegacyMetricCalculation[SingleValue, Novelty, NoveltyMetricResult, LegacyNoveltyMetric],
    SingleValueCalculation,
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyNoveltyMetric:
        return LegacyNoveltyMetric(k=self.metric.k)

    def calculate_value(
        self, context: "Context", legacy_result: NoveltyMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        current = self.result(legacy_result.current_value)
        current.widget = render
        if legacy_result.reference_value is None:
            return current
        return current, self.result(legacy_result.reference_value)

    def display_name(self) -> str:
        return "Novelty"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class ItemBias(Metric):
    """Measure bias in recommendations towards specific item groups.

    Evaluates whether recommendations are biased towards certain item categories
    or groups. Returns a dataframe showing the bias distribution across item groups.
    """

    k: int
    """Number of top items to consider."""
    column_name: str
    """Name of the column containing item group/category information."""
    distribution: Literal["default", "train"] = "default"
    """Which legacy distribution to report: "default" (current_distr) or "train" (current_train_distr)."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def get_bound_tests(self, context: "Context") -> List[BoundTest]:
        return []


class ItemBiasCalculation(
    LegacyMetricCalculation[DataframeValue, ItemBias, ItemBiasMetricResult, LegacyItemBiasMetric],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyItemBiasMetric:
        return LegacyItemBiasMetric(k=self.metric.k, column_name=self.metric.column_name)

    def calculate_value(
        self, context: "Context", legacy_result: ItemBiasMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        return _bias_result(self.metric, legacy_result, render, self.display_name())

    def display_name(self) -> str:
        return f"Item Bias ({self.metric.column_name}, {self.metric.distribution})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


class UserBias(Metric):
    """Measure bias in recommendations towards specific user groups.

    Evaluates whether recommendations are biased towards certain user categories
    or groups. Returns a dataframe showing the bias distribution across user groups.
    """

    column_name: str
    """Name of the column containing user group/category information."""
    distribution: Literal["default", "train"] = "default"
    """Which legacy distribution to report: "default" (current_distr) or "train" (current_train_distr)."""
    ranking_name: str = "default"
    """Name of the ranking task."""

    def get_bound_tests(self, context: "Context") -> List[BoundTest]:
        return []


class UserBiasCalculation(
    LegacyMetricCalculation[DataframeValue, UserBias, UserBiasMetricResult, LegacyUserBiasMetric],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyUserBiasMetric:
        return LegacyUserBiasMetric(column_name=self.metric.column_name)

    def calculate_value(
        self, context: "Context", legacy_result: UserBiasMetricResult, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        return _bias_result(self.metric, legacy_result, render, self.display_name())

    def display_name(self) -> str:
        return f"User Bias ({self.metric.column_name}, {self.metric.distribution})"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


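# Unlike the single-value metrics above, the bias metrics return a distribution
# dataframe; a sketch with hypothetical category columns (names illustrative):
#
#     ItemBias(k=10, column_name="genre")
#     UserBias(column_name="age_group")

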
def _distr_to_df(distr) -> pd.DataFrame:
    """Convert a legacy distribution to a dataframe.

    Works around a legacy bug where x holds bin edges (len(y) + 1 entries)
    while y holds counts: in that case x is replaced by the bin centers.
    """
    if distr is None:
        return pd.DataFrame({"x": [], "y": []})
    if len(distr.x) == len(distr.y) + 1:
        # x is bin edges, y is counts - convert edges to bin centers
        bin_centers = (distr.x[:-1] + distr.x[1:]) / 2  # type: ignore[operator]
        return pd.DataFrame({"x": bin_centers, "y": distr.y})
    # lengths already match
    return pd.DataFrame({"x": distr.x, "y": distr.y})


def _bias_result(
    metric: Union[ItemBias, UserBias],
    legacy_result: Union[ItemBiasMetricResult, UserBiasMetricResult],
    render: List[BaseWidgetInfo],
    display_name: str,
) -> TMetricResult:
    if metric.distribution == "train":
        current_distr = legacy_result.current_train_distr
        reference_distr = legacy_result.reference_train_distr
    else:  # "default"
        current_distr = legacy_result.current_distr
        reference_distr = legacy_result.reference_distr

    current_value = DataframeValue(display_name=display_name, value=_distr_to_df(current_distr))
    current_value.widget = render

    if reference_distr is None:
        return current_value

    reference_value = DataframeValue(display_name=display_name, value=_distr_to_df(reference_distr))
    reference_value.widget = []
    return current_value, reference_value


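# Worked example of the bin-edge fix in _distr_to_df: given edges
# x = [0.0, 1.0, 2.0, 3.0] and counts y = [5, 3, 2], len(x) == len(y) + 1,
# so the reported x becomes the bin centers [0.5, 1.5, 2.5].

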
class RecCasesTable(DataframeMetric):
    """Display recommendation cases as a table for specific users.

    Shows detailed recommendation examples for specified users, including
    recommended items, scores, and optional features. Useful for debugging
    and understanding recommendation behavior.
    """

    user_ids: Optional[List[Union[int, str]]] = None
    """Optional list of user IDs to display (None = all users)."""
    display_features: Optional[List[str]] = None
    """Optional list of feature columns to display."""
    ranking_name: str = "default"
    """Name of the ranking task."""


class RecCasesTableCalculation(
    LegacyMetricCalculation[DataframeValue, RecCasesTable, RecCasesTableResults, LegacyRecCasesTable],
):
    def task_name(self) -> Optional[str]:
        return self.metric.ranking_name

    def legacy_metric(self) -> LegacyRecCasesTable:
        return LegacyRecCasesTable(user_ids=self.metric.user_ids, display_features=self.metric.display_features)

    def calculate_value(
        self, context: "Context", legacy_result: RecCasesTableResults, render: List[BaseWidgetInfo]
    ) -> TMetricResult:
        # RecCasesTableResults holds current: Dict[str, pd.DataFrame] and reference: Dict[str, pd.DataFrame].
        # Each dataframe contains [prediction_name, item_id] + display_features columns.
        # Merge all per-user dataframes into one, adding a user_id column.
        current_dfs = []
        for user_id, df in legacy_result.current.items():
            df_with_user = df.copy()
            df_with_user["user_id"] = user_id
            current_dfs.append(df_with_user)

        if current_dfs:
            current_merged = pd.concat(current_dfs, ignore_index=True)
        else:
            current_merged = pd.DataFrame()

        current_value = DataframeValue(display_name=self.display_name(), value=current_merged)
        current_value.widget = render

        if not legacy_result.reference:
            return current_value

        reference_dfs = []
        for user_id, df in legacy_result.reference.items():
            df_with_user = df.copy()
            df_with_user["user_id"] = user_id
            reference_dfs.append(df_with_user)

        if reference_dfs:
            reference_merged = pd.concat(reference_dfs, ignore_index=True)
        else:
            reference_merged = pd.DataFrame()

        reference_value = DataframeValue(display_name=self.display_name(), value=reference_merged)
        reference_value.widget = []

        return current_value, reference_value

    def display_name(self) -> str:
        return "Recommendation Cases Table"

    def _gen_input_data(self, context: "Context", task_name: Optional[str]) -> InputData:
        return _gen_ranking_input_data(context, task_name)


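# Sketch with hypothetical user ids and feature columns (values illustrative):
#
#     RecCasesTable(user_ids=[1, 2], display_features=["genre"])

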
def main():
    def create_snapshot(n):
        """Build a tiny ranking dataset, run a report with the recsys metrics, and backport the snapshot."""
        df = pd.DataFrame(
            {
                "user_id": [i % 3 for i in range(n + 5)],
                "target": [0.5 for _ in range(n + 5)],
                "prediction": [0.5 for _ in range(n + 5)],
            }
        )
        from evidently.core.datasets import DataDefinition
        from evidently.core.datasets import Dataset
        from evidently.core.datasets import Recsys

        dataset = Dataset.from_pandas(
            df, data_definition=DataDefinition(numerical_columns=["target", "prediction"], ranking=[Recsys()])
        )
        report = Report(
            [
                NDCG(k=3, no_feedback_users=True),
                MRR(k=3),
                HitRate(k=3),
                ScoreDistribution(k=3),
                MAP(k=3),
                RecallTopK(k=3),
                PrecisionTopK(k=3),
                FBetaTopK(k=3),
            ]
        )
        snapshot_v2 = report.run(dataset, None)

        from evidently.ui.backport import snapshot_v2_to_v1

        return snapshot_v2_to_v1(snapshot_v2)

    sn = create_snapshot(10)
    sn.save("ndcg.json")


if __name__ == "__main__":
    main()