# src/evidently/core/datasets.py
   1  import abc
   2  import copy
   3  import dataclasses
   4  import io
   5  import json
   6  import os
   7  import tarfile
   8  from abc import abstractmethod
   9  from enum import Enum
  10  from typing import TYPE_CHECKING
  11  from typing import Any
  12  from typing import ClassVar
  13  from typing import Dict
  14  from typing import Generator
  15  from typing import List
  16  from typing import Optional
  17  from typing import Tuple
  18  from typing import Union
  19  
  20  import numpy as np
  21  import pandas as pd
  22  
  23  from evidently._pydantic_compat import BaseModel
  24  from evidently._pydantic_compat import parse_obj_as
  25  from evidently.core.base_types import Label
  26  from evidently.core.tests import GenericTest
  27  from evidently.legacy.base_metric import DisplayName
  28  from evidently.legacy.core import ColumnType
  29  from evidently.legacy.features.generated_features import GeneratedFeatures
  30  from evidently.legacy.options.base import AnyOptions
  31  from evidently.legacy.options.base import Options
  32  from evidently.legacy.pipeline.column_mapping import ColumnMapping
  33  from evidently.legacy.suite.base_suite import MetadataValueType
  34  from evidently.legacy.utils.data_preprocessing import create_data_definition
  35  from evidently.legacy.utils.types import Numeric
  36  from evidently.pydantic_utils import AutoAliasMixin
  37  from evidently.pydantic_utils import EvidentlyBaseModel
  38  
  39  EVIDENTLY_DATASET_EXT = "evidently_dataset"
  40  
  41  if TYPE_CHECKING:
  42      from evidently.core.container import MetricOrContainer
  43  
  44  
  45  class ColumnRole(Enum):
  46      """Role of a column in the dataset.
  47  
  48      Defines the semantic role of a column (e.g., target, prediction, feature).
  49      Used in `DataDefinition` to specify column purposes.
  50      """
  51  
  52      Unset = "Unset"
  53      """Column role is not set."""
  54      Target = "target"
  55      """Column contains target/ground truth values."""
  56      Output = "output"
  57      """Column contains model output/predictions."""
  58      Feature = "feature"
  59      """Column is a feature used for prediction."""
  60      Descriptor = "descriptor"
  61      """Column is a computed descriptor (e.g., from text)."""
  62      UserId = "user_id"
  63      """Column contains user IDs (for ranking/recsys)."""
  64      ItemId = "item_id"
  65      """Column contains item IDs (for ranking/recsys)."""
  66      Input = "input"
  67      """Column is an input to the model."""
  68      Context = "context"
  69      """Column contains context information."""
  70      Example = "example"
  71      """Column contains example data."""
  72  
  73  
  74  @dataclasses.dataclass
  75  class ColumnInfo:
  76      """Information about a column's type and role."""
  77  
  78      type: ColumnType
  79      """Column type (numerical, categorical, text, etc.)."""
  80      role: ColumnRole = ColumnRole.Unset
  81      """Column role (target, feature, etc.)."""
  82  
  83  
  84  @dataclasses.dataclass
  85  class BinaryClassification:
  86      """Configuration for binary classification evaluation tasks.
  87  
  88      Maps columns containing target labels and predictions for binary classification.
  89      Used in `DataDefinition` to specify which columns contain classification data.
  90  
  91      Example:
  92      ```python
  93      definition = DataDefinition(
  94          classification=[BinaryClassification(
  95              target="target",
  96              prediction_labels="prediction"
  97          )]
  98      )
  99      ```
 100      """
 101  
 102      name: str
 103      """Identifier for this classification task."""
 104      target: str
 105      """Column name with true binary labels."""
 106      prediction_labels: Optional[str]
 107      """Column name with predicted binary labels."""
 108      prediction_probas: Optional[str]
 109      """Column name with predicted probabilities."""
 110      pos_label: Label
 111      """Value representing the positive class."""
 112      labels: Optional[Dict[Label, str]]
 113      """Optional mapping of label values to display names."""
 114  
 115      def __init__(
 116          self,
 117          *,
 118          name: str = "default",
 119          target: Optional[str] = None,
 120          prediction_labels: Optional[str] = None,
 121          prediction_probas: Optional[str] = None,
 122          pos_label: Optional[str] = None,
 123          labels: Optional[Dict[Label, str]] = None,
 124      ):
 125          """Initialize binary classification configuration.
 126  
 127          If no arguments are provided, defaults to `target="target"` and `prediction_probas="prediction"`.
 128          Otherwise, requires `target` and at least one of `prediction_labels` or `prediction_probas`.
 129          """
 130          self.name = name
 131          if (
 132              target is None
 133              and prediction_labels is None
 134              and prediction_probas is None
 135              and pos_label is None
 136              and labels is None
 137          ):
 138              self.target = "target"
 139              self.prediction_labels = None
 140              self.prediction_probas = "prediction"
 141              self.pos_label = 1
 142              self.labels = None
 143              return
 144          if target is None or (prediction_labels is None and prediction_probas is None):
 145              raise ValueError(
 146                  "Invalid BinaryClassification configuration:" " target and one of (labels or probas) should be set"
 147              )
 148          self.target = target
 149          self.prediction_labels = prediction_labels
 150          self.prediction_probas = prediction_probas
 151          self.pos_label = pos_label if pos_label is not None else 1
 152          self.labels = labels
 153  
 154  
 155  @dataclasses.dataclass
 156  class MulticlassClassification:
 157      """Configuration for multiclass classification evaluation tasks.
 158  
 159      Maps columns containing target labels and predictions for multiclass classification.
 160      Used in `DataDefinition` to specify which columns contain classification data.
 161  
 162      Example:
 163      ```python
 164      definition = DataDefinition(
 165          classification=[MulticlassClassification(
 166              target="target",
 167              prediction_labels="prediction",
 168              prediction_probas=["0", "1", "2"]
 169          )]
 170      )
 171      ```
 172      """
 173  
 174      name: str = "default"
 175      """Identifier for this classification task."""
 176      target: str = "target"
 177      """Column name with true class labels."""
 178      prediction_labels: Optional[str] = "prediction"
 179      """Column name with predicted class labels."""
 180      prediction_probas: Optional[List[str]] = None
 181      """List of column names with predicted probabilities per class."""
 182      labels: Optional[Dict[Label, str]] = None
 183      """Optional mapping of label values to display names."""
 184  
 185      def __init__(
 186          self,
 187          *,
 188          name: str = "default",
 189          target: Optional[str] = None,
 190          prediction_labels: Optional[str] = None,
 191          prediction_probas: Optional[List[str]] = None,
 192          labels: Optional[Dict[Label, str]] = None,
 193      ):
 194          """Initialize multiclass classification configuration.
 195  
 196          If no arguments are provided, defaults to `target="target"` and `prediction_labels="prediction"`.
 197          Otherwise, requires `target` and at least one of `prediction_labels` or `prediction_probas`.
 198          """
 199          self.name = name
 200          if target is None and prediction_labels is None and prediction_probas is None and labels is None:
 201              self.target = "target"
 202              self.prediction_labels = "prediction"
 203              self.prediction_probas = None
 204              self.labels = None
 205              return
 206          if target is None or (prediction_labels is None and prediction_probas is None):
 207              raise ValueError(
 208                  "Invalid MulticlassClassification configuration:" " target and one of (labels or probas) should be set"
 209              )
 210          self.target = target
 211          self.prediction_labels = prediction_labels
 212          self.prediction_probas = prediction_probas
 213          self.labels = labels
 214  
 215  
 216  Classification = Union[BinaryClassification, MulticlassClassification]
 217  
 218  
 219  @dataclasses.dataclass
 220  class Regression:
 221      """Configuration for regression evaluation tasks.
 222  
 223      Maps columns containing target values and predictions for regression.
 224      Used in `DataDefinition` to specify which columns contain regression data.
 225  
 226      Example:
 227      ```python
 228      definition = DataDefinition(
 229          regression=[Regression(target="y_true", prediction="y_pred")]
 230      )
 231      ```
 232      """
 233  
 234      name: str = "default"
 235      """Identifier for this regression task."""
 236      target: str = "target"
 237      """Column name with actual/true values."""
 238      prediction: str = "prediction"
 239      """Column name with predicted values."""
 240  
 241  
 242  @dataclasses.dataclass
 243  class Recsys:
 244      """Configuration for recommender systems and ranking evaluation tasks.
 245  
 246      Maps columns for evaluating recommendation systems, including user-item interactions
 247      and relevance scores. Used in `DataDefinition` to specify ranking/recsys data structure.
 248  
 249      Example:
 250      ```python
 251      definition = DataDefinition(
 252          ranking=[Recsys()]
 253      )
 254      ```
 255      """
 256  
 257      name: str = "default"
 258      """Identifier for this ranking task."""
 259      user_id: str = "user_id"
 260      """Column name with user identifiers."""
 261      item_id: str = "item_id"
 262      """Column name with item identifiers."""
 263      target: str = "target"
 264      """Column name with relevance labels/scores."""
 265      prediction: str = "prediction"
 266      """Column name with predicted scores or ranks."""
 267      recommendations_type: str = "score"
 268      """Type of prediction - "score" or "rank"."""
 269  
 270  
@dataclasses.dataclass
class Completion:
    """Placeholder for an LLM completion task definition (no configuration fields yet)."""

    pass
 274  
 275  
@dataclasses.dataclass
class RAG:
    """Placeholder for a retrieval-augmented-generation task definition (no configuration fields yet)."""

    pass
 279  
 280  
 281  @dataclasses.dataclass
 282  class LLMClassification:
 283      """Configuration for LLM classification evaluation tasks.
 284  
 285      Maps columns containing LLM inputs, outputs, and optional reasoning for LLM evaluation.
 286      Used in `DataDefinition` to specify which columns contain LLM interaction data.
 287  
 288      Example:
 289      ```python
 290      definition = DataDefinition(
 291          llm=LLMClassification(
 292              input="question",
 293              target="expected_answer",
 294              predictions="model_answer"
 295          )
 296      )
 297      ```
 298      """
 299  
 300      input: str
 301      """Column name with LLM input/prompt text."""
 302      target: str
 303      """Column name with expected/ground truth output."""
 304      predictions: Optional[str] = None
 305      """Column name with LLM-generated output."""
 306      reasoning: Optional[str] = None
 307      """Column name with reasoning text."""
 308      prediction_reasoning: Optional[str] = None
 309      """Column name with reasoning for predictions."""
 310      name: str = "llm_default"
 311      """Identifier for this LLM task."""
 312  
 313  
 314  class SpecialColumnInfo(AutoAliasMixin, EvidentlyBaseModel):
 315      """Base class for special column information.
 316  
 317      Used to define special columns that require custom handling or metrics.
 318      Subclasses can provide custom metrics and column type information.
 319      """
 320  
 321      __alias_type__: ClassVar = "special_column_info"
 322      """Alias type for serialization."""
 323  
 324      class Config:
 325          is_base_type = True
 326  
 327      def get_metrics(self) -> List["MetricOrContainer"]:
 328          """Get metrics associated with this special column.
 329  
 330          Returns:
 331          * List of metrics or metric containers.
 332          """
 333          return []
 334  
 335      def get_column_type(self, column_name: str) -> Optional[ColumnType]:
 336          """Get the column type for a column name.
 337  
 338          Args:
 339          * `column_name`: Name of the column.
 340  
 341          Returns:
 342          * `ColumnType` if known, `None` otherwise.
 343          """
 344          return None
 345  
 346  
# Any supported LLM task configuration.
LLMDefinition = Union[Completion, RAG, LLMClassification]


# Default column name for trace links (see `ServiceColumns.trace_link`).
DEFAULT_TRACE_LINK_COLUMN = "_evidently_trace_link"
 351  
 352  
class ServiceColumns(BaseModel):
    """Service columns for special functionality.

    Defines columns used for special features like trace linking and human feedback.
    All fields are optional; `None` means the corresponding feature is not used.
    """

    trace_link: Optional[str] = None
    """Optional column name for trace links."""
    human_feedback_label: Optional[str] = None
    """Optional column name for human feedback labels."""
    human_feedback_comment: Optional[str] = None
    """Optional column name for human feedback comments."""
 365  
 366  
 367  class DataDefinition(BaseModel):
 368      """Maps column types and roles in your dataset for correct evaluation processing.
 369  
 370      `DataDefinition` maps:
 371      - Column types (e.g., categorical, numerical, text)
 372      - Column roles (e.g., id, prediction, target, timestamp)
 373      - Task-specific configurations (classification, regression, ranking, LLM)
 374  
 375      This allows Evidently to process the data correctly. Some evaluations need specific
 376      columns and will fail if they're missing.
 377  
 378      **Documentation**: See [Data Definition Guide](https://docs.evidentlyai.com/docs/library/data_definition) for detailed mapping options.
 379  
 380      Auto-mapping (empty DataDefinition):
 381      ```python
 382      dataset = Dataset.from_pandas(df, data_definition=DataDefinition())
 383      ```
 384  
 385      Manual mapping:
 386      ```python
 387      definition = DataDefinition(
 388          numerical_columns=["Age", "Salary"],
 389          categorical_columns=["Department"],
 390          classification=[BinaryClassification(target="target", prediction_labels="prediction")]
 391      )
 392      ```
 393      """
 394  
 395      id_column: Optional[str] = None
 396      """Column name with unique identifiers."""
 397      timestamp: Optional[str] = None
 398      """Column name with timestamp values."""
 399      service_columns: Optional[ServiceColumns] = None
 400      """Service columns like trace links."""
 401      numerical_columns: Optional[List[str]] = None
 402      """List of numerical column names."""
 403      categorical_columns: Optional[List[str]] = None
 404      """List of categorical column names."""
 405      text_columns: Optional[List[str]] = None
 406      """List of text column names."""
 407      datetime_columns: Optional[List[str]] = None
 408      """List of datetime column names."""
 409      unknown_columns: Optional[List[str]] = None
 410      """List of unknown/unclassified column names."""
 411      list_columns: Optional[List[str]] = None
 412      """List of list/array column names."""
 413      classification: Optional[List[Classification]] = None
 414      """List of classification task configurations (`BinaryClassification` or `MulticlassClassification`)."""
 415      regression: Optional[List[Regression]] = None
 416      """List of regression task configurations (`Regression`)."""
 417      llm: Optional[LLMDefinition] = None
 418      """LLM task configuration (`LLMClassification`)."""
 419      numerical_descriptors: List[str] = []
 420      """List of numerical descriptor column names."""
 421      categorical_descriptors: List[str] = []
 422      """List of categorical descriptor column names."""
 423      test_descriptors: Optional[List[str]] = None
 424      """List of test descriptor column names."""
 425      ranking: Optional[List[Recsys]] = None
 426      """List of ranking/recsys task configurations (`Recsys`)."""
 427      special_columns: List[SpecialColumnInfo] = []
 428      """Additional special column configurations."""
 429      embeddings: Optional[Dict[str, List[str]]] = None
 430      """Embeddings columns definitions: mapping of embedding name to list of columns"""
 431  
 432      def __init__(
 433          self,
 434          id_column: Optional[str] = None,
 435          timestamp: Optional[str] = None,
 436          numerical_columns: Optional[List[str]] = None,
 437          categorical_columns: Optional[List[str]] = None,
 438          text_columns: Optional[List[str]] = None,
 439          datetime_columns: Optional[List[str]] = None,
 440          classification: Optional[List[Classification]] = None,
 441          regression: Optional[List[Regression]] = None,
 442          llm: Optional[LLMDefinition] = None,
 443          numerical_descriptors: Optional[List[str]] = None,
 444          categorical_descriptors: Optional[List[str]] = None,
 445          unknown_columns: Optional[List[str]] = None,
 446          list_columns: Optional[List[str]] = None,
 447          test_descriptors: Optional[List[str]] = None,
 448          ranking: Optional[List[Recsys]] = None,
 449          service_columns: Optional[ServiceColumns] = None,
 450          special_columns: Optional[List[SpecialColumnInfo]] = None,
 451          embeddings: Optional[Dict[str, List[str]]] = None,
 452      ):
 453          """Initialize DataDefinition with column mappings.
 454  
 455          The constructor maps all parameters directly to the corresponding model fields.
 456          If `numerical_descriptors` or `categorical_descriptors` are not provided, they default to empty lists.
 457          """
 458          super().__init__(
 459              id_column=id_column,
 460              timestamp=timestamp,
 461              numerical_columns=numerical_columns,
 462              categorical_columns=categorical_columns,
 463              text_columns=text_columns,
 464              datetime_columns=datetime_columns,
 465              unknown_columns=unknown_columns,
 466              list_columns=list_columns,
 467              # classification=classification,
 468              # regression=regression,
 469              # llm=llm,
 470              numerical_descriptors=numerical_descriptors if numerical_descriptors is not None else [],
 471              categorical_descriptors=categorical_descriptors if categorical_descriptors is not None else [],
 472              test_descriptors=test_descriptors,
 473              # ranking=ranking,
 474              service_columns=service_columns,
 475              special_columns=special_columns if special_columns is not None else [],
 476          )
 477          self.classification = classification
 478          self.regression = regression
 479          self.llm = llm
 480          self.ranking = ranking
 481          self.embeddings = embeddings
 482  
 483      def get_numerical_columns(self):
 484          """Get all numerical columns including descriptors.
 485  
 486          Returns:
 487          * List of numerical column names (includes both explicitly mapped and descriptor columns)
 488          """
 489          return (self.numerical_columns or []) + (self.numerical_descriptors or [])
 490  
 491      def get_categorical_columns(self):
 492          """Get all categorical columns including descriptors.
 493  
 494          Returns:
 495          * List of categorical column names (includes both explicitly mapped and descriptor columns)
 496          """
 497          return (self.categorical_columns or []) + (self.categorical_descriptors or [])
 498  
 499      def get_text_columns(self):
 500          """Get all text columns.
 501  
 502          Returns:
 503          * List of text column names
 504          """
 505          return self.text_columns or []
 506  
 507      def get_datetime_columns(self):
 508          """Get all datetime columns.
 509  
 510          Returns:
 511          * List of datetime column names
 512          """
 513          return self.datetime_columns or []
 514  
 515      def get_unknown_columns(self):
 516          """Get all unknown/unclassified columns.
 517  
 518          Returns:
 519          * List of unknown column names
 520          """
 521          return self.unknown_columns or []
 522  
 523      def get_list_columns(self):
 524          """Get all list/array columns.
 525  
 526          Returns:
 527          * List of list column names
 528          """
 529          return self.list_columns or []
 530  
 531      def get_column_type(self, column_name: str) -> ColumnType:
 532          """Get the column type for a specific column.
 533  
 534          Args:
 535          * `column_name`: Name of the column to check
 536  
 537          Returns:
 538          * `evidently.legacy.core.ColumnType` enum value for the column
 539          """
 540          if column_name in self.get_numerical_columns():
 541              return ColumnType.Numerical
 542          if column_name in self.get_categorical_columns():
 543              return ColumnType.Categorical
 544          if column_name in self.get_text_columns():
 545              return ColumnType.Text
 546          if column_name in self.get_datetime_columns():
 547              return ColumnType.Datetime
 548          if column_name in self.get_unknown_columns():
 549              return ColumnType.Unknown
 550          if column_name in self.get_list_columns():
 551              return ColumnType.List
 552          if column_name == self.timestamp:
 553              return ColumnType.Date
 554          if column_name == self.id_column:
 555              return ColumnType.Id
 556          for special_column in self.special_columns:
 557              ct = special_column.get_column_type(column_name)
 558              if ct is not None:
 559                  return ct
 560          return ColumnType.Unknown
 561  
 562      def get_classification(self, classification_id: str) -> Optional[Classification]:
 563          """Get classification configuration by ID.
 564  
 565          Args:
 566          * `classification_id`: Name/ID of the classification task
 567  
 568          Returns:
 569          * `BinaryClassification` or `MulticlassClassification` configuration or None if not found
 570          """
 571          item_list = list(filter(lambda x: x.name == classification_id, self.classification or []))
 572          if len(item_list) == 0:
 573              return None
 574          if len(item_list) > 1:
 575              raise ValueError("More than one classification with id {}".format(classification_id))
 576          return item_list[0]
 577  
 578      def get_ranking(self, ranking_id: str) -> Optional[Recsys]:
 579          """Get ranking/recsys configuration by ID.
 580  
 581          Args:
 582          * `ranking_id`: Name/ID of the ranking task
 583  
 584          Returns:
 585          * `Recsys` configuration or None if not found
 586          """
 587          item_list = list(filter(lambda x: x.name == ranking_id, self.ranking or []))
 588          if len(item_list) == 0:
 589              return None
 590          if len(item_list) > 1:
 591              raise ValueError("More than one ranking with id {}".format(ranking_id))
 592          return item_list[0]
 593  
 594      def get_columns(self, types: List[ColumnType]) -> Generator[str, None, None]:
 595          """Get column names of specified types.
 596  
 597          Args:
 598          * `types`: List of `evidently.legacy.core.ColumnType` values to filter by
 599  
 600          Returns:
 601          * Generator yielding column names matching the specified types
 602          """
 603          if ColumnType.Numerical in types:
 604              yield from self.get_numerical_columns()
 605          if ColumnType.Categorical in types:
 606              yield from self.get_categorical_columns()
 607          if ColumnType.Text in types:
 608              yield from self.get_text_columns()
 609          if ColumnType.Datetime in types:
 610              yield from self.get_datetime_columns()
 611          if ColumnType.Unknown in types:
 612              yield from self.get_unknown_columns()
 613          if ColumnType.List in types:
 614              yield from self.get_list_columns()
 615  
 616      def get_regression(self, regression_id: str) -> Optional[Regression]:
 617          """Get regression configuration by ID.
 618  
 619          Args:
 620          * `regression_id`: Name/ID of the regression task
 621  
 622          Returns:
 623          * `Regression` configuration or None if not found
 624          """
 625          item_list = list(filter(lambda x: x.name == regression_id, self.regression or []))
 626          if len(item_list) == 0:
 627              return None
 628          if len(item_list) > 1:
 629              raise ValueError("More than one regression with id {}".format(regression_id))
 630          return item_list[0]
 631  
 632  
 633  class DatasetColumn:
 634      """Wrapper for a single column in a dataset.
 635  
 636      Contains the column type and the actual data as a pandas Series.
 637      Used internally to access column data with type information.
 638      """
 639  
 640      type: ColumnType
 641      """Column type (numerical, categorical, text, etc.)."""
 642      data: pd.Series
 643      """Pandas Series containing the column data."""
 644  
 645      def __init__(self, type: Union[str, ColumnType], data: pd.Series) -> None:
 646          """Initialize a dataset column.
 647  
 648          Args:
 649          * `type`: `ColumnType` or string name of the column type.
 650          * `data`: `pandas.Series` containing the column data.
 651          """
 652          self.type = ColumnType(type)
 653          self.data = data
 654  
 655  
 656  class ColumnCondition(AutoAliasMixin, EvidentlyBaseModel, abc.ABC):
 657      """Base class for column value conditions.
 658  
 659      Used to define conditions that check values in a column (e.g., greater than,
 660      in range, matches pattern). Used in descriptor tests and column filters.
 661      """
 662  
 663      __alias_type__: ClassVar[str] = "column_condition"
 664      """Alias type for serialization."""
 665  
 666      class Config:
 667          is_base_type = True
 668  
 669      @abstractmethod
 670      def check(self, value: Any) -> bool:
 671          """Check if a value satisfies the condition.
 672  
 673          Args:
 674          * `value`: Value to check.
 675  
 676          Returns:
 677          * `True` if condition is satisfied, `False` otherwise.
 678          """
 679          raise NotImplementedError
 680  
 681      @abstractmethod
 682      def get_default_alias(self, column: str) -> str:
 683          """Get default alias name for this condition.
 684  
 685          Args:
 686          * `column`: Column name this condition applies to.
 687  
 688          Returns:
 689          * Default alias string.
 690          """
 691          raise NotImplementedError
 692  
 693  
 694  class DescriptorTest(BaseModel):
 695      """Test condition for a descriptor column.
 696  
 697      Defines a condition to test values in a descriptor column. Can be used
 698      to create derived descriptors based on test results.
 699      """
 700  
 701      condition: ColumnCondition
 702      """Column condition to apply."""
 703      column: Optional[str] = None
 704      """Optional column name (uses parent descriptor column if None)."""
 705      alias: Optional[str] = None
 706      """Optional alias name for the test result."""
 707  
 708      def __init__(
 709          self,
 710          condition: Union[ColumnCondition, GenericTest],
 711          column: Optional[str] = None,
 712          alias: Optional[str] = None,
 713          **data: Any,
 714      ) -> None:
 715          c: ColumnCondition = condition.for_descriptor().condition if isinstance(condition, GenericTest) else condition
 716          super().__init__(alias=alias, column=column, condition=c, **data)
 717  
 718      def to_descriptor(self, descriptor: Optional["Descriptor"] = None) -> "Descriptor":
 719          if self.column is None:
 720              if descriptor is None:
 721                  raise ValueError("Parent descriptor is required for test without column")
 722              descriptor_columns = descriptor.list_output_columns()
 723              if len(descriptor_columns) == 1:
 724                  column = descriptor_columns[0]
 725              else:
 726                  raise ValueError(
 727                      f"Column is required for test with multiple columns in parent descriptor: [{', '.join(descriptor_columns)}]"
 728                  )
 729          else:
 730              column = self.column
 731          return ColumnTest(column, self.condition, self.alias or self.condition.get_default_alias(column))
 732  
 733  
 734  AnyDescriptorTest = Union["DescriptorTest", "GenericTest"]
 735  
 736  
 737  class Descriptor(AutoAliasMixin, EvidentlyBaseModel, abc.ABC):
 738      """Base class for descriptors that compute row-level features.
 739  
 740      Descriptors compute additional columns from existing data (e.g., text length,
 741      sentiment score, custom transformations). Used to enrich datasets with
 742      computed features for evaluation.
 743      """
 744  
 745      class Config:
 746          is_base_type = True
 747  
 748      __alias_type__: ClassVar = "descriptor_v2"
 749      """Alias type for serialization."""
 750  
 751      alias: str
 752      """Name for the descriptor output column."""
 753      tests: List[DescriptorTest] = []
 754      """List of test conditions to apply to descriptor values."""
 755  
 756      def __init__(self, alias: str, tests: Optional[List[AnyDescriptorTest]] = None, **data: Any) -> None:
 757          self.alias = alias
 758          self.tests = [t.for_descriptor() if isinstance(t, GenericTest) else t for t in (tests or [])]
 759          super().__init__(**data)
 760  
 761      @abc.abstractmethod
 762      def generate_data(
 763          self, dataset: "Dataset", options: Options
 764      ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
 765          raise NotImplementedError()
 766  
 767      def validate_input(self, data_definition: DataDefinition) -> None:
 768          input_columns = self.list_input_columns()
 769          if input_columns is not None:
 770              all_columns = set(data_definition.get_columns(list(ColumnType)))
 771              for column in input_columns:
 772                  if column not in all_columns:
 773                      raise ValueError(
 774                          f"Column '{column}' is not found in dataset. Available columns: [{', '.join(all_columns)}]"
 775                      )
 776  
 777      def list_output_columns(self) -> List[str]:  # todo: also types?
 778          return [self.alias]
 779  
 780      def list_input_columns(self) -> Optional[List[str]]:  # todo: make not optional
 781          return None
 782  
    def get_sub_descriptors(self) -> List["Descriptor"]:
        """Build one descriptor per attached test, each bound to this descriptor."""
        return [t.to_descriptor(self) for t in self.tests]
 785  
    def get_special_columns_info(self, rename: Dict[str, str]) -> List[SpecialColumnInfo]:
        """Special (summary) column metadata produced by this descriptor; none by default.

        Args:
        * `rename`: Dictionary mapping internal column names to final names.
        """
        return []
 788  
    def add_to_descriptors_list(self) -> bool:
        """Whether this descriptor should be added to the descriptors list (True by default)."""
        return True
 791  
 792  
 793  class SingleInputDescriptor(Descriptor, abc.ABC):
 794      """Base class for descriptors that operate on a single input column.
 795  
 796      Simplifies descriptor implementation for descriptors that only need one
 797      input column. Subclasses only need to implement `generate_data()`.
 798      """
 799  
 800      column: str
 801      """Name of the input column to process."""
 802  
 803      def list_input_columns(self) -> List[str]:
 804          """Get the list of input columns.
 805  
 806          Returns:
 807          * List containing the single input column name.
 808          """
 809          return [self.column]
 810  
 811  
class ColumnTest(SingleInputDescriptor):
    """Descriptor that tests values in a column against a condition.

    Creates a boolean descriptor column indicating whether each value in the
    input column satisfies the condition. Useful for filtering or flagging rows.
    """

    column: str
    """Name of the input column to test."""
    condition: ColumnCondition
    """Column condition to apply."""

    def __init__(
        self, column: str, condition: Union[ColumnCondition, GenericTest], alias: Optional[str] = None, **data: Any
    ) -> None:
        """Create a column test descriptor.

        Args:
        * `column`: name of the input column to test.
        * `condition`: a `ColumnCondition`, a `GenericTest` (converted via
          `for_descriptor()`), or a dict parsed into a `ColumnCondition`.
        * `alias`: optional output column name; defaults to the condition's
          default alias for the column.
        """
        # attributes are assigned before super().__init__() — same pattern as
        # the Descriptor base class.
        self.column = column
        if isinstance(condition, dict):
            # supports deserialized payloads where the condition arrives as a dict
            condition = parse_obj_as(ColumnCondition, condition)  # type: ignore[type-abstract]
        descriptor_condition: ColumnCondition = (
            condition if isinstance(condition, ColumnCondition) else condition.for_descriptor().condition
        )
        self.condition = descriptor_condition
        super().__init__(alias=alias or descriptor_condition.get_default_alias(column), **data)

    def generate_data(
        self, dataset: "Dataset", options: Options
    ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
        """Generate a boolean column indicating which rows satisfy the condition.

        Args:
        * `dataset`: `Dataset` to process.
        * `options`: Processing options.

        Returns:
        * `DatasetColumn` with boolean values (True if condition passes, False otherwise).
        """
        data = dataset.column(self.column)
        # element-wise check; result is treated as a categorical (boolean) column
        res = data.data.apply(self.condition.check)
        return DatasetColumn(ColumnType.Categorical, res)
 851  
 852  
 853  class TestSummaryInfo(SpecialColumnInfo):
 854      """Special column information for test summary aggregation.
 855  
 856      Defines columns that aggregate test results across multiple descriptors,
 857      providing summary statistics like "all tests pass", "any test fails", etc.
 858      """
 859  
 860      all_column: Optional[str] = None
 861      """Optional column name for 'all tests pass' indicator."""
 862      any_column: Optional[str] = None
 863      """Optional column name for 'any test fails' indicator."""
 864      count_column: Optional[str] = None
 865      """Optional column name for test failure count."""
 866      rate_column: Optional[str] = None
 867      """Optional column name for test failure rate."""
 868      score_column: Optional[str] = None
 869      """Optional column name for weighted test score."""
 870      score_weights: Optional[Dict[str, float]] = None
 871      """Optional dictionary mapping test names to weights."""
 872  
 873      @property
 874      def has_all(self):
 875          """Check if 'all' column is configured.
 876  
 877          Returns:
 878          * `True` if `any_column` is set, `False` otherwise.
 879          """
 880          return self.any_column is not None
 881  
 882      @property
 883      def has_any(self):
 884          """Check if 'any' column is configured.
 885  
 886          Returns:
 887          * `True` if `any_column` is set, `False` otherwise.
 888          """
 889          return self.any_column is not None
 890  
 891      @property
 892      def has_count(self):
 893          """Check if 'count' column is configured.
 894  
 895          Returns:
 896          * `True` if `count_column` is set, `False` otherwise.
 897          """
 898          return self.count_column is not None
 899  
 900      @property
 901      def has_rate(self):
 902          """Check if 'rate' column is configured.
 903  
 904          Returns:
 905          * `True` if `rate_column` is set, `False` otherwise.
 906          """
 907          return self.rate_column is not None
 908  
 909      @property
 910      def has_score(self):
 911          """Check if 'score' column is configured.
 912  
 913          Returns:
 914          * `True` if `score_column` is set, `False` otherwise.
 915          """
 916          return self.score_column is not None
 917  
 918      def get_metrics(self) -> List["MetricOrContainer"]:
 919          """Get metrics for aggregating test summary columns.
 920  
 921          Returns:
 922          * List containing a `TestSummaryInfoPreset` metric.
 923          """
 924          from evidently.presets.special import TestSummaryInfoPreset
 925  
 926          return [TestSummaryInfoPreset(column_info=self)]
 927  
 928      def get_column_type(self, column_name: str) -> Optional[ColumnType]:
 929          """Get the column type for a summary column name.
 930  
 931          Args:
 932          * `column_name`: Name of the column to check.
 933  
 934          Returns:
 935          * `ColumnType.Categorical` for all/any columns, `ColumnType.Numerical` for count/rate/score columns, or `None` if not found.
 936          """
 937          if column_name in (self.all_column, self.any_column):
 938              return ColumnType.Categorical
 939          if column_name in (self.count_column, self.rate_column, self.score_column):
 940              return ColumnType.Numerical
 941          return None
 942  
 943  
 944  class TestSummary(Descriptor):
 945      """Descriptor that aggregates test results across multiple test descriptors.
 946  
 947      Computes summary statistics from boolean test result columns, such as:
 948      - Whether all tests pass for each row
 949      - Whether any test fails for each row
 950      - Count and rate of passing tests
 951      - Weighted score across tests
 952      """
 953  
 954      success_all: bool = True
 955      """Whether to compute 'all tests pass' indicator."""
 956      success_any: bool = False
 957      """Whether to compute 'any test fails' indicator."""
 958      success_count: bool = False
 959      """Whether to compute count of passing tests."""
 960      success_rate: bool = False
 961      """Whether to compute proportion of passing tests."""
 962      score: bool = False
 963      """Whether to compute weighted score across tests."""
 964      score_weights: Optional[Dict[str, float]] = None
 965      """Optional dictionary mapping test names to weights for scoring."""
 966      normalize_scores: bool = True
 967      """Whether to normalize scores by total weight."""
 968  
 969      def __init__(
 970          self,
 971          success_all: bool = True,
 972          success_any: bool = False,
 973          success_count: bool = False,
 974          success_rate: bool = False,
 975          score: bool = False,
 976          score_weights: Optional[Dict[str, float]] = None,
 977          alias: Optional[str] = None,
 978          normalize_scores: bool = True,
 979          **data: Any,
 980      ):
 981          self.success_all = success_all
 982          self.success_any = success_any
 983          self.success_count = success_count
 984          self.success_rate = success_rate
 985          self.score = score
 986          self.score_weights = score_weights
 987          self.normalize_scores = normalize_scores
 988          super().__init__(alias=alias or "summary", **data)
 989  
 990      def generate_data(
 991          self, dataset: "Dataset", options: Options
 992      ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
 993          """Generate summary columns from test result columns.
 994  
 995          Aggregates boolean test results into summary statistics based on
 996          configured flags (success_all, success_any, success_count, etc.).
 997  
 998          Args:
 999          * `dataset`: `Dataset` containing test result columns.
1000          * `options`: Processing options.
1001  
1002          Returns:
1003          * Dictionary of summary columns, or single column if only one is generated.
1004  
1005          Raises:
1006          * `ValueError`: If no tests are specified or no summary columns are configured.
1007          """
1008          tests = dataset.data_definition.test_descriptors or []
1009          if len(tests) == 0:
1010              raise ValueError("No tests specified")
1011          summary_columns = {}
1012          test_results = dataset.as_dataframe()[tests]
1013          if self.success_count:
1014              summary_columns["success_count"] = (ColumnType.Numerical, test_results.sum(axis=1))
1015          if self.success_rate:
1016              summary_columns["success_rate"] = (ColumnType.Numerical, test_results.sum(axis=1) / len(tests))
1017          if self.success_all:
1018              summary_columns["success_all"] = (ColumnType.Categorical, test_results.all(axis=1))
1019          if self.success_any:
1020              summary_columns["success_any"] = (ColumnType.Categorical, test_results.any(axis=1))
1021          if self.score:
1022              weights = self.score_weights or {t: 1 for t in tests}
1023              total_weight = sum(weights.values()) if self.normalize_scores else 1
1024              summary_columns["score"] = (  # type: ignore[assignment]
1025                  ColumnType.Numerical,
1026                  sum(test_results[col] * weight / total_weight for col, weight in weights.items()),
1027              )
1028          alias = self.alias or "summary"
1029          result = {f"{alias}_{key}": DatasetColumn(ct, value) for key, (ct, value) in summary_columns.items()}
1030          if len(tests) == 0:
1031              raise ValueError("No summary columns specified")
1032          if len(result) == 1:
1033              return {alias: list(result.values())[0]}
1034          return result
1035  
1036      def list_input_columns(self) -> Optional[List[str]]:
1037          """Get list of input columns needed for this descriptor.
1038  
1039          Returns:
1040          * List of test column names if score weights are specified, `None` otherwise.
1041          """
1042          if self.score and self.score_weights is not None:
1043              return list(self.score_weights.keys())
1044          return None
1045  
1046      def get_special_columns_info(self, rename: Dict[str, str]) -> List[SpecialColumnInfo]:
1047          """Get special column information for test summary aggregation.
1048  
1049          Args:
1050          * `rename`: Dictionary mapping internal column names to final names.
1051  
1052          Returns:
1053          * List of `TestSummaryInfo` objects describing the summary columns.
1054          """
1055          alias = self.alias or "summary"
1056          if len(rename) == 1:
1057              return [
1058                  TestSummaryInfo(
1059                      all_column=rename[alias] if self.success_all else None,
1060                      any_column=rename[alias] if self.success_any else None,
1061                      count_column=rename[alias] if self.success_count else None,
1062                      rate_column=rename[alias] if self.success_rate else None,
1063                      score_column=rename[alias] if self.score else None,
1064                  )
1065              ]
1066  
1067          return [
1068              TestSummaryInfo(
1069                  all_column=rename[f"{alias}_success_all"] if self.success_all else None,
1070                  any_column=rename[f"{alias}_success_any"] if self.success_any else None,
1071                  count_column=rename[f"{alias}_success_count"] if self.success_count else None,
1072                  rate_column=rename[f"{alias}_success_rate"] if self.success_rate else None,
1073                  score_column=rename[f"{alias}_score"] if self.score else None,
1074                  score_weights=self.score_weights,
1075              )
1076          ]
1077  
1078      def add_to_descriptors_list(self) -> bool:
1079          return False
1080  
1081  
class FeatureDescriptor(Descriptor):
    """Descriptor backed by a legacy `GeneratedFeatures` implementation.

    Wraps a legacy feature generator so it can be used wherever a descriptor
    is expected; output columns mirror the feature's declared columns.
    """

    feature: GeneratedFeatures
    """Legacy feature generator that produces the output columns."""

    def __init__(
        self, feature: GeneratedFeatures, alias: Optional[str] = None, tests: Optional[List[AnyDescriptorTest]] = None
    ):
        """Create a descriptor from a legacy feature.

        Args:
        * `feature`: `GeneratedFeatures` instance (or a dict parsed into one).
        * `alias`: optional output name; defaults to the display name of the
          feature's first column.
        * `tests`: optional test conditions applied to descriptor values.
        """
        # this is needed because we try to access it before super call
        feature = feature if isinstance(feature, GeneratedFeatures) else parse_obj_as(GeneratedFeatures, feature)  # type: ignore[type-abstract]
        feature_columns = feature.list_columns()
        super().__init__(feature=feature, alias=alias or f"{feature_columns[0].display_name}", tests=tests)

    def get_dataset_column(self, column_name: str, values: pd.Series) -> DatasetColumn:
        """Wrap generated values into a `DatasetColumn` with the feature's declared type.

        Numerical columns are coerced with `pd.to_numeric` (invalid values become NaN).
        """
        column_type = self.feature.get_type(column_name)
        if column_type == ColumnType.Numerical:
            values = pd.to_numeric(values, errors="coerce")
        dataset_column = DatasetColumn(type=column_type, data=values)
        return dataset_column

    def generate_data(
        self, dataset: "Dataset", options: Options
    ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]:
        """Run the wrapped feature generator and return its columns keyed by display name."""
        feature = self.feature.generate_features_renamed(
            dataset.as_dataframe(),
            create_data_definition(None, dataset.as_dataframe(), ColumnMapping()),
            options,
        )
        return {
            col.display_name: self.get_dataset_column(col.name, feature[col.name])
            for col in self.feature.list_columns()
        }

    def list_output_columns(self) -> List[str]:
        """Display names of all columns produced by the wrapped feature."""
        return [c.display_name for c in self.feature.list_columns()]
1115  
1116  
1117  def _determine_descriptor_column_name(alias: str, columns: List[str]):
1118      index = 1
1119      key = alias
1120      while key in columns:
1121          key = f"{alias}_{index}"
1122          index += 1
1123      return key
1124  
1125  
@dataclasses.dataclass
class StatCountValue:
    """Absolute count paired with its share of the total."""

    count: int  # absolute number of matching values
    share: float  # fraction of the total; presumably in [0, 1] — not enforced here
1130  
1131  
@dataclasses.dataclass
class GeneralColumnStats:
    """Column statistics that apply to every column type."""

    missing_values: StatCountValue  # count and share of missing (NA) values
1136  
@dataclasses.dataclass
class NumericalColumnStats:
    """Statistics specific to numerical columns."""

    max: Numeric
    min: Numeric
    mean: Numeric
    std: Numeric
    quantiles: Dict[str, Numeric]  # quantile label -> value; key naming set by the producer — TODO confirm
    infinite: StatCountValue  # count and share of +/-inf values
1145  
1146  
@dataclasses.dataclass
class LabelStats:
    """Statistics for a single label of a categorical column."""

    count: StatCountValue  # occurrences of the label and its share of rows
1150  
1151  
@dataclasses.dataclass
class CategoricalColumnStats:
    """Statistics specific to categorical columns."""

    unique_count: int  # number of distinct labels
    label_stats: Dict[Label, LabelStats]  # per-label statistics

    @property
    def most_common(self) -> Optional[Tuple[Label, LabelStats]]:
        """Return the (label, stats) pair with the highest count, or None if empty.

        Ties keep the first label encountered in iteration order.
        """
        most_common = None
        for key, value in self.label_stats.items():
            if most_common is None:
                most_common = key
                continue
            # fix: compare the underlying integer counts — `StatCountValue` is a
            # plain dataclass without order=True, so comparing the dataclass
            # instances with `<` raised TypeError
            if self.label_stats[most_common].count.count < value.count.count:
                most_common = key
        if most_common is None:
            return None
        return most_common, self.label_stats[most_common]
1169  
1170  
@dataclasses.dataclass
class ColumnStats:
    """Aggregated statistics for one column.

    Type-specific parts are None when they do not apply to the column's type.
    """

    general_stats: GeneralColumnStats
    numerical_stats: Optional[NumericalColumnStats]
    categorical_stats: Optional[CategoricalColumnStats]
1176  
1177  
@dataclasses.dataclass
class DatasetStats:
    """Statistics summary for a dataset.

    Contains overall dataset statistics including row count, column count,
    and per-column statistics. Produced by `Dataset.stats()`.
    """

    row_count: int
    """Total number of rows in the dataset."""
    column_count: int
    """Total number of columns in the dataset."""
    column_stats: Dict[str, ColumnStats]
    """Dictionary mapping column names to their ColumnStats."""
1192  
1193  
# Types accepted wherever a dataset is expected; plain DataFrames are wrapped
# via Dataset.from_any.
PossibleDatasetTypes = Union["Dataset", pd.DataFrame]
1195  
1196  
class Dataset:
    """Dataset object that wraps your data with metadata and data definition.

    `Dataset` is the main data structure in Evidently. It wraps a `pandas.DataFrame`
    with additional metadata including:
    - `DataDefinition`: column types and roles mapping
    - Descriptors: computed row-level scores (for text/LLM data)
    - Metadata and tags: additional information about the dataset

    You typically create a `Dataset` from a `pandas.DataFrame` using `Dataset.from_pandas()`.
    Use `Dataset` objects with `Report.run()` to perform evaluations.

    **Documentation**: See [Data Definition Guide](https://docs.evidentlyai.com/docs/library/data_definition) for column mapping.

    Create from pandas DataFrame:
    ```python
    from evidently import Dataset, DataDefinition

    dataset = Dataset.from_pandas(
        source_df,
        data_definition=DataDefinition()
    )
    ```

    Add descriptors for text evaluation:
    ```python
    from evidently.descriptors import TextLength

    dataset.add_descriptors([TextLength(column="text")])
    ```

    Use in a Report:
    ```python
    from evidently import Report
    from evidently.presets import DataSummaryPreset

    report = Report([DataSummaryPreset()])
    snapshot = report.run(dataset, None)
    ```
    """

    _data_definition: DataDefinition
    _metadata: Dict[str, MetadataValueType]
    _tags: List[str]

    @classmethod
    def from_pandas(
        cls,
        data: pd.DataFrame,
        data_definition: Optional[DataDefinition] = None,
        descriptors: Optional[List[Descriptor]] = None,
        options: AnyOptions = None,
        metadata: Optional[Dict[str, MetadataValueType]] = None,
        tags: Optional[List[str]] = None,
    ) -> "Dataset":
        """Create a `Dataset` from a `pandas.DataFrame`.

        Args:
        * `data`: `pandas.DataFrame` with your data
        * `data_definition`: Optional `DataDefinition` for column mapping (auto-inferred if None).
          Use `DataDefinition()` for automatic mapping or provide explicit column mappings.
        * `descriptors`: Optional list of descriptors to compute and add to dataset
        * `options`: Optional options for descriptor computation
        * `metadata`: Optional metadata dictionary
        * `tags`: Optional list of tags

        Returns:
        * `Dataset` object ready for use with `Report.run()`

        Example:
        ```python
        from evidently import Dataset, DataDefinition

        dataset = Dataset.from_pandas(df, data_definition=DataDefinition())
        ```
        """
        dataset = PandasDataset(data, data_definition, metadata=metadata, tags=tags)
        if descriptors is not None:
            dataset.add_descriptors(descriptors, options)
        return dataset

    @staticmethod
    def from_any(dataset: PossibleDatasetTypes) -> "Dataset":
        """Convert various dataset types to a `Dataset` object.

        Args:
        * `dataset`: `pandas.DataFrame` or `Dataset` object

        Returns:
        * `Dataset` object (converts DataFrame if needed)

        Raises:
        * ValueError if dataset type is not supported
        """
        if isinstance(dataset, Dataset):
            return dataset
        if isinstance(dataset, pd.DataFrame):
            return Dataset.from_pandas(dataset)
        raise ValueError(f"Unsupported dataset type: {type(dataset)}")

    @abstractmethod
    def as_dataframe(self) -> pd.DataFrame:
        """Get the underlying `pandas.DataFrame`.

        Returns:
        * `pandas.DataFrame` with all data including computed descriptors
        """
        raise NotImplementedError()

    @abstractmethod
    def column(self, column_name: str) -> DatasetColumn:
        """Get a specific column from the dataset.

        Args:
        * `column_name`: Name of the column to retrieve

        Returns:
        * `DatasetColumn` object with column data and type information
        """
        raise NotImplementedError()

    @abstractmethod
    def subdataset(self, column_name: str, label: object) -> "Dataset":
        """Create a filtered subdataset matching a column value.

        Args:
        * `column_name`: Column to filter by
        * `label`: Value to filter for

        Returns:
        * New `Dataset` containing only rows where column equals label
        """
        raise NotImplementedError()

    @abstractmethod
    def stats(self) -> DatasetStats:
        """Get statistical summary of the dataset.

        Returns:
        * `DatasetStats` object with row count, column count, and per-column statistics
        """
        raise NotImplementedError()

    @property
    def data_definition(self) -> DataDefinition:
        """Get the `DataDefinition` mapping for this dataset.

        Returns:
        * `DataDefinition` object with column type and role mappings
        """
        return self._data_definition

    @property
    def metadata(self) -> Dict[str, MetadataValueType]:
        """Get metadata associated with this dataset.

        Returns:
        * Dictionary of metadata key-value pairs
        """
        return self._metadata

    @property
    def tags(self) -> List[str]:
        """Get tags associated with this dataset.

        Returns:
        * List of tag strings
        """
        return self._tags

    @abstractmethod
    def add_descriptor(self, descriptor: Descriptor, options: AnyOptions = None):
        """Add a descriptor to compute row-level scores.

        Args:
        * `descriptor`: `Descriptor` object to compute
        * `options`: Optional options for descriptor computation
        """
        raise NotImplementedError

    def add_descriptors(self, descriptors: List[Descriptor], options: AnyOptions = None):
        """Add multiple descriptors to the dataset.

        Args:
        * `descriptors`: List of `Descriptor` objects to compute
        * `options`: Optional options for descriptor computation
        """
        for descriptor in descriptors:
            self.add_descriptor(descriptor, options)

    @abstractmethod
    def save(self, uri: str):
        """Save the dataset to a file.

        Args:
        * `uri`: File path to save the dataset (supports .evidently_dataset format)
        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def _can_load(cls, uri: str) -> bool:
        """Return True if this subclass can load the given uri."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def _load(cls, uri: str) -> "Dataset":
        """Load a dataset from the given uri (subclass-specific)."""
        raise NotImplementedError

    @classmethod
    def load(cls, uri: str) -> "Dataset":
        """Load a dataset from a file.

        Args:
        * `uri`: File path to load from (supports CSV, Parquet, and .evidently_dataset formats)

        Returns:
        * `Dataset` object loaded from file

        Raises:
        * Exception if dataset cannot be loaded

        Note:
        * Only direct subclasses of `Dataset` are considered when picking a loader.
        """
        # __subclasses__ only lists direct subclasses of cls
        for subclass in cls.__subclasses__():
            if subclass._can_load(uri):
                return subclass._load(uri)
        raise Exception(f"Dataset {uri} could not be loaded")
1424  
1425  
# Integer columns with at most this many distinct values are treated as categorical.
INTEGER_CARDINALITY_LIMIT = 10
1427  
1428  
def _text_or_categorical(column_data: pd.Series) -> ColumnType:
    # Mostly-unique string values look like free text; otherwise categories.
    if column_data.nunique() > (column_data.count() * 0.5):
        return ColumnType.Text
    return ColumnType.Categorical


def infer_column_type(column_data: pd.Series) -> ColumnType:
    """Infer the `ColumnType` of a pandas Series from its dtype and values.

    Heuristics: floats are numerical; low-cardinality integers are categorical;
    strings are text when mostly unique, otherwise categorical; object columns
    are inspected via their first/last non-null values.
    """
    dtype_name = column_data.dtype.name
    if dtype_name.startswith("float"):
        return ColumnType.Numerical
    if dtype_name.startswith("int"):
        if column_data.nunique() <= INTEGER_CARDINALITY_LIMIT:
            return ColumnType.Categorical
        return ColumnType.Numerical
    if dtype_name in ("string", "str"):
        return _text_or_categorical(column_data)
    if dtype_name == "object":
        non_null = column_data.dropna()
        if non_null.count() == 0:
            # no values to inspect
            return ColumnType.Unknown
        first, last = non_null.iloc[0], non_null.iloc[-1]
        if isinstance(first, str) and isinstance(last, str):
            return _text_or_categorical(column_data)
        if isinstance(first, (list, tuple)) and isinstance(last, (list, tuple)):
            return ColumnType.List
        return ColumnType.Unknown
    if dtype_name in ("bool", "category"):
        return ColumnType.Categorical
    if dtype_name.startswith("datetime"):
        return ColumnType.Datetime
    return ColumnType.Unknown
1459  
1460  
# Layout of the .evidently_dataset tar archive (see _write_evidently_dataset):
MARKER_CONTENT = """{"version": "1.0"}"""  # content of the marker member identifying the format
MARKER_FILENAME = ".evidently_dataset"  # marker member name
DATA_FILENAME = "data.parquet"  # dataframe payload
META_FILENAME = "dataset.json"  # tags, metadata and data definition
1465  
1466  
def _write_evidently_dataset(dataset: Dataset, uri: str):
    """Serialize `dataset` to a tar archive at `uri`.

    The archive contains a format marker, the dataframe as parquet, and a JSON
    file with tags, metadata and the data definition.
    """

    def _add_member(tar: tarfile.TarFile, name: str, payload: bytes) -> None:
        # add an in-memory payload to the archive under the given member name
        info = tarfile.TarInfo(name)
        info.size = len(payload)
        tar.addfile(info, io.BytesIO(payload))

    with tarfile.open(uri, "w") as tar:  # todo: use fsspec location
        # Add marker file
        _add_member(tar, MARKER_FILENAME, MARKER_CONTENT.encode("utf-8"))

        # Add dataframe as parquet
        parquet_buffer = io.BytesIO()
        dataset.as_dataframe().to_parquet(parquet_buffer, index=False)
        _add_member(tar, DATA_FILENAME, parquet_buffer.getvalue())

        # Add metadata as JSON
        meta = {
            "tags": dataset.tags,
            "metadata": dataset.metadata,
            "data_definition": dataset.data_definition.dict(),
        }
        _add_member(tar, META_FILENAME, json.dumps(meta, indent=2).encode("utf-8"))
1493  
1494  
def _read_evidently_dataset(uri: str) -> Dataset:
    """Load a `Dataset` from a tar archive written by `_write_evidently_dataset`.

    Raises:
    * `ValueError`: If the marker, data or metadata members are missing or invalid.
    """
    with tarfile.open(uri, "r") as tar:
        names = tar.getnames()

        def _member(name: str, error_message: str):
            # fetch an archive member or fail with the member-specific message
            if name not in names:
                raise ValueError(error_message)
            extracted = tar.extractfile(name)
            if extracted is None:
                raise ValueError(error_message)
            return extracted

        # Check marker (missing vs. invalid content use distinct messages)
        if MARKER_FILENAME not in names:
            raise ValueError("Not a valid Evidently dataset: missing marker")
        marker_file = tar.extractfile(MARKER_FILENAME)
        if marker_file is None or marker_file.read().decode("utf-8") != MARKER_CONTENT:
            raise ValueError("Invalid Evidently dataset marker content")

        # Load dataframe
        data_file = _member(DATA_FILENAME, "Missing data file in Evidently dataset")
        frame = pd.read_parquet(data_file)

        # Load metadata
        meta_file = _member(META_FILENAME, "Missing metadata file in Evidently dataset")
        metadata = json.load(meta_file)

    return Dataset.from_pandas(
        frame,
        data_definition=DataDefinition.parse_obj(metadata["data_definition"]),
        metadata=metadata["metadata"],
        tags=metadata["tags"],
    )
1528  
1529  
class PandasDataset(Dataset):
    """`Dataset` implementation backed by an in-memory :class:`pandas.DataFrame`.

    Stores a defensive copy of the frame together with a (possibly inferred)
    `DataDefinition` describing column roles/types, per-column statistics
    collected at construction time, and user-supplied metadata/tags.
    """

    # Maps a file extension to its loader; the EVIDENTLY_DATASET_EXT entry
    # returns a ready `Dataset` while the others return a plain DataFrame
    # (see `_load`, which handles both cases).
    SUPPORTED_FORMATS = {"csv": pd.read_csv, "parquet": pd.read_parquet, EVIDENTLY_DATASET_EXT: _read_evidently_dataset}
    # The wrapped frame: copied in __init__, mutated in place by add_column.
    _data: pd.DataFrame
    # Resolved column roles/types (user-supplied fields win over inferred ones).
    _data_definition: DataDefinition
    # Per-column stats snapshot taken at construction; updated by add_column.
    _dataset_stats: DatasetStats
    # Free-form dataset-level metadata.
    _metadata: Dict[str, MetadataValueType]
    # Free-form dataset-level tags.
    _tags: List[str]

    def __init__(
        self,
        data: pd.DataFrame,
        data_definition: Optional[DataDefinition] = None,
        metadata: Optional[Dict[str, MetadataValueType]] = None,
        tags: Optional[List[str]] = None,
    ):
        """Build a dataset from *data*, inferring column types where needed.

        Args:
            data: source frame; a copy is stored, the argument is not mutated.
            data_definition: optional and possibly partial column description.
                Any of its column-type lists left as ``None`` are filled in
                from type inference over the frame; fields the caller did set
                are kept as-is.
            metadata: optional dataset-level metadata values.
            tags: optional dataset-level tags.
        """
        self._data = data.copy()
        # Inference is required when there is no definition at all, or when
        # the given one leaves at least one column-type list unset (None).
        if (
            data_definition is None
            or data_definition.datetime_columns is None
            or data_definition.categorical_columns is None
            or data_definition.text_columns is None
            or data_definition.numerical_columns is None
            or data_definition.unknown_columns is None
            or data_definition.list_columns is None
        ):
            # Columns already claimed by the user-supplied definition; type
            # inference must skip them so it only classifies the remainder.
            reserved_fields = []
            if data_definition is not None:
                if data_definition.service_columns is not None:
                    if data_definition.service_columns.trace_link is not None:
                        reserved_fields.append(data_definition.service_columns.trace_link)
                if data_definition.timestamp is not None:
                    reserved_fields.append(data_definition.timestamp)
                if data_definition.id_column is not None:
                    reserved_fields.append(data_definition.id_column)
                if data_definition.numerical_columns is not None:
                    reserved_fields.extend(data_definition.numerical_columns)
                if data_definition.categorical_columns is not None:
                    reserved_fields.extend(data_definition.categorical_columns)
                if data_definition.datetime_columns is not None:
                    reserved_fields.extend(data_definition.datetime_columns)
                if data_definition.text_columns is not None:
                    reserved_fields.extend(data_definition.text_columns)
                if data_definition.unknown_columns is not None:
                    reserved_fields.extend(data_definition.unknown_columns)
                if data_definition.list_columns is not None:
                    reserved_fields.extend(data_definition.list_columns)
                if data_definition.numerical_descriptors is not None:
                    reserved_fields.extend(data_definition.numerical_descriptors)
                if data_definition.categorical_descriptors is not None:
                    reserved_fields.extend(data_definition.categorical_descriptors)
            generated_data_definition = self._generate_data_definition(
                data,
                reserved_fields,
                data_definition.service_columns if data_definition is not None else None,
            )
            if data_definition is None:
                # Nothing user-supplied: the inferred definition is used as-is.
                self._data_definition = generated_data_definition
            else:
                # Merge: deep-copy the user definition, then fill ONLY its
                # None fields from the inferred one (user values take priority).
                self._data_definition = copy.deepcopy(data_definition)
                if self._data_definition.datetime_columns is None:
                    # When both the user definition and inference carry a
                    # timestamp, record the inferred timestamp column as the
                    # sole datetime column; otherwise take all inferred ones.
                    if self._data_definition.timestamp is not None and generated_data_definition.timestamp is not None:
                        self._data_definition.datetime_columns = [generated_data_definition.timestamp]
                    else:
                        self._data_definition.datetime_columns = generated_data_definition.datetime_columns
                if self._data_definition.numerical_columns is None:
                    self._data_definition.numerical_columns = generated_data_definition.numerical_columns
                if self._data_definition.categorical_columns is None:
                    self._data_definition.categorical_columns = generated_data_definition.categorical_columns
                if self._data_definition.text_columns is None:
                    self._data_definition.text_columns = generated_data_definition.text_columns
                if self._data_definition.unknown_columns is None:
                    self._data_definition.unknown_columns = generated_data_definition.unknown_columns
                if self._data_definition.list_columns is None:
                    self._data_definition.list_columns = generated_data_definition.list_columns
                if self._data_definition.timestamp is None and generated_data_definition.timestamp is not None:
                    self._data_definition.timestamp = generated_data_definition.timestamp
                if (
                    self._data_definition.service_columns is None
                    and generated_data_definition.service_columns is not None
                ):
                    self._data_definition.service_columns = generated_data_definition.service_columns
        else:
            # Fully-specified definition: no inference, just an isolated copy.
            self._data_definition = copy.deepcopy(data_definition)
        (rows, columns) = data.shape

        # Snapshot type-dependent stats for every column of the input frame.
        column_stats = {}
        for column in data.columns:
            column_stats[column] = self._collect_stats(self._data_definition.get_column_type(column), data[column])
        self._dataset_stats = DatasetStats(rows, columns, column_stats)
        self._metadata = metadata or {}
        self._tags = tags or []

    def as_dataframe(self) -> pd.DataFrame:
        """Return the underlying frame (the internal object, not a copy)."""
        return self._data

    def column(self, column_name: str) -> DatasetColumn:
        """Return a single column paired with its resolved `ColumnType`."""
        return DatasetColumn(self._data_definition.get_column_type(column_name), self._data[column_name])

    def subdataset(self, column_name: str, label: object):
        """Return a new dataset restricted to rows where *column_name* == *label*.

        The data definition is reused; metadata and tags are not propagated.
        """
        return PandasDataset(self._data[self._data[column_name] == label], self._data_definition)

    def _generate_data_definition(
        self,
        data: pd.DataFrame,
        reserved_fields: List[str],
        service_columns: Optional[ServiceColumns] = None,
    ) -> DataDefinition:
        """Infer a `DataDefinition` for all columns not in *reserved_fields*.

        Classifies each remaining column via `infer_column_type`. A column
        named `DEFAULT_TRACE_LINK_COLUMN` is treated as a service (trace-link)
        column when no explicit *service_columns* were given. A lone datetime
        column becomes the dataset timestamp instead of a datetime column.
        """
        numerical = []
        categorical = []
        text = []
        datetime = []
        unknown = []
        list_columns = []
        service = None
        for column in data.columns:
            if column in reserved_fields:
                continue
            if service_columns is None and column == DEFAULT_TRACE_LINK_COLUMN:
                if service is None:
                    service = ServiceColumns(trace_link=column)
                else:
                    service.trace_link = column
                continue
            column_type = infer_column_type(data[column])
            if column_type == ColumnType.Numerical:
                numerical.append(column)
            if column_type == ColumnType.Categorical:
                categorical.append(column)
            if column_type == ColumnType.Datetime:
                datetime.append(column)
            if column_type == ColumnType.Text:
                text.append(column)
            if column_type == ColumnType.Unknown:
                unknown.append(column)
            if column_type == ColumnType.List:
                list_columns.append(column)

        return DataDefinition(
            # Exactly one datetime column -> promote it to the timestamp role.
            timestamp=datetime[0] if len(datetime) == 1 else None,
            service_columns=service,
            numerical_columns=numerical,
            categorical_columns=categorical,
            unknown_columns=unknown,
            list_columns=list_columns,
            datetime_columns=datetime if len(datetime) != 1 else [],
            text_columns=text,
        )

    def stats(self) -> DatasetStats:
        """Return the collected dataset statistics."""
        return self._dataset_stats

    def add_column(self, key: str, data: DatasetColumn, add_to_descriptor_list: bool = True):
        """Append a column in place and update stats and descriptor lists.

        Args:
            key: column name to add (assumed not present yet).
            data: column values with their type.
            add_to_descriptor_list: when True, register numerical/categorical
                columns in the matching descriptor list of the definition.
        """
        self._dataset_stats.column_count += 1
        self._dataset_stats.column_stats[key] = self._collect_stats(data.type, data.data)
        self._data[key] = data.data
        if add_to_descriptor_list and data.type == ColumnType.Numerical:
            self._data_definition.numerical_descriptors.append(key)
        if add_to_descriptor_list and data.type == ColumnType.Categorical:
            self._data_definition.categorical_descriptors.append(key)

    def add_descriptor(self, descriptor: Descriptor, options: AnyOptions = None):
        """Compute a descriptor's columns and attach them to this dataset.

        Validates the descriptor against the data definition, generates its
        column(s), renames them to avoid clashes with existing columns, records
        `ColumnTest` outputs as test descriptors, registers special columns,
        and recurses into any sub-descriptors.
        """
        descriptor.validate_input(self._data_definition)
        new_columns = descriptor.generate_data(self, Options.from_any_options(options))
        if isinstance(new_columns, DatasetColumn):
            # Single-column descriptors are normalized to a one-entry mapping
            # keyed by the descriptor's alias.
            new_columns = {descriptor.alias: new_columns}
        rename = {}
        for col, value in new_columns.items():
            # Deduplicate against columns already present in the frame.
            name = _determine_descriptor_column_name(col, self._data.columns.tolist())
            rename[col] = name
            self.add_column(name, value, descriptor.add_to_descriptors_list())
            if isinstance(descriptor, ColumnTest):
                if self._data_definition.test_descriptors is None:
                    self._data_definition.test_descriptors = []
                self._data_definition.test_descriptors.append(name)
        self.data_definition.special_columns.extend(descriptor.get_special_columns_info(rename))
        for sub in descriptor.get_sub_descriptors():
            self.add_descriptor(sub, options)

    def _collect_stats(self, column_type: ColumnType, data: pd.Series):
        """Build `ColumnStats` for one column, with type-specific sub-stats.

        Only numerical and categorical columns get detailed stats; all other
        types carry just the general section.
        """
        numerical_stats = None
        if column_type == ColumnType.Numerical:
            numerical_stats = _collect_numerical_stats(data)

        categorical_stats = None
        if column_type == ColumnType.Categorical:
            categorical_stats = _collect_categorical_stats(data)

        # NOTE(review): missing-value counts are hardcoded to zero here rather
        # than computed from the data — confirm whether this is intentional.
        return ColumnStats(
            general_stats=GeneralColumnStats(missing_values=StatCountValue(0, 0)),
            numerical_stats=numerical_stats,
            categorical_stats=categorical_stats,
        )

    def save(self, uri: str):
        """Write the dataset to *uri*, appending the evidently extension if absent."""
        if not uri.endswith(f".{EVIDENTLY_DATASET_EXT}"):
            uri += f".{EVIDENTLY_DATASET_EXT}"
        _write_evidently_dataset(self, uri)

    @classmethod
    def _can_load(cls, uri: str) -> bool:
        """Return True when *uri* has a supported extension or an evidently
        dataset file exists at ``{uri}.{EVIDENTLY_DATASET_EXT}``."""
        split = uri.split(".")[-1]
        return split in cls.SUPPORTED_FORMATS or os.path.exists(f"{uri}.{EVIDENTLY_DATASET_EXT}")

    @classmethod
    def _load(cls, uri: str) -> "Dataset":
        """Load a dataset from *uri* using the extension-matched reader.

        Falls back to ``{uri}.{EVIDENTLY_DATASET_EXT}`` when the extension is
        unrecognized but such a file exists; otherwise raises ``ValueError``.
        Readers that return a plain DataFrame are wrapped via
        `Dataset.from_pandas`; the evidently reader already yields a `Dataset`.
        """
        ext = uri.split(".")[-1]
        if ext not in cls.SUPPORTED_FORMATS:
            if os.path.exists(f"{uri}.{EVIDENTLY_DATASET_EXT}"):
                ext = EVIDENTLY_DATASET_EXT
                uri = f"{uri}.{ext}"
            else:
                raise ValueError(f"Unsupported format: {ext}")
        # todo: load from fsspec stream instead
        data = cls.SUPPORTED_FORMATS[ext](uri)  # type: ignore[operator]
        if isinstance(data, Dataset):
            return data
        return Dataset.from_pandas(data)
1747  
1748  
def _collect_numerical_stats(data: pd.Series):
    """Summarize a numerical series: extremes, mean/std, p25/p75, and inf share."""
    # Count +/-inf values; ``.get(True, 0)`` yields 0 when no value is infinite
    # (i.e. the True group is absent from the grouped counts).
    n_infinite = data.groupby(np.isinf(data)).count().get(True, 0)
    quantile_values = {
        "p25": data.quantile(0.25),
        "p75": data.quantile(0.75),
    }
    return NumericalColumnStats(
        min=data.min(),
        max=data.max(),
        mean=data.mean(),
        std=data.std(),
        quantiles=quantile_values,
        infinite=StatCountValue(n_infinite, n_infinite / data.count()),
    )
1762  
1763  
def _collect_categorical_stats(data: pd.Series):
    """Summarize a categorical series: distinct-label count and per-label frequency."""
    non_null = data.count()
    per_label = {}
    for label, occurrences in data.value_counts().items():
        per_label[label] = LabelStats(count=StatCountValue(occurrences, occurrences / non_null))
    return CategoricalColumnStats(
        unique_count=data.nunique(),
        label_stats=per_label,
    )