# datasets.py
1 import abc 2 import copy 3 import dataclasses 4 import io 5 import json 6 import os 7 import tarfile 8 from abc import abstractmethod 9 from enum import Enum 10 from typing import TYPE_CHECKING 11 from typing import Any 12 from typing import ClassVar 13 from typing import Dict 14 from typing import Generator 15 from typing import List 16 from typing import Optional 17 from typing import Tuple 18 from typing import Union 19 20 import numpy as np 21 import pandas as pd 22 23 from evidently._pydantic_compat import BaseModel 24 from evidently._pydantic_compat import parse_obj_as 25 from evidently.core.base_types import Label 26 from evidently.core.tests import GenericTest 27 from evidently.legacy.base_metric import DisplayName 28 from evidently.legacy.core import ColumnType 29 from evidently.legacy.features.generated_features import GeneratedFeatures 30 from evidently.legacy.options.base import AnyOptions 31 from evidently.legacy.options.base import Options 32 from evidently.legacy.pipeline.column_mapping import ColumnMapping 33 from evidently.legacy.suite.base_suite import MetadataValueType 34 from evidently.legacy.utils.data_preprocessing import create_data_definition 35 from evidently.legacy.utils.types import Numeric 36 from evidently.pydantic_utils import AutoAliasMixin 37 from evidently.pydantic_utils import EvidentlyBaseModel 38 39 EVIDENTLY_DATASET_EXT = "evidently_dataset" 40 41 if TYPE_CHECKING: 42 from evidently.core.container import MetricOrContainer 43 44 45 class ColumnRole(Enum): 46 """Role of a column in the dataset. 47 48 Defines the semantic role of a column (e.g., target, prediction, feature). 49 Used in `DataDefinition` to specify column purposes. 
50 """ 51 52 Unset = "Unset" 53 """Column role is not set.""" 54 Target = "target" 55 """Column contains target/ground truth values.""" 56 Output = "output" 57 """Column contains model output/predictions.""" 58 Feature = "feature" 59 """Column is a feature used for prediction.""" 60 Descriptor = "descriptor" 61 """Column is a computed descriptor (e.g., from text).""" 62 UserId = "user_id" 63 """Column contains user IDs (for ranking/recsys).""" 64 ItemId = "item_id" 65 """Column contains item IDs (for ranking/recsys).""" 66 Input = "input" 67 """Column is an input to the model.""" 68 Context = "context" 69 """Column contains context information.""" 70 Example = "example" 71 """Column contains example data.""" 72 73 74 @dataclasses.dataclass 75 class ColumnInfo: 76 """Information about a column's type and role.""" 77 78 type: ColumnType 79 """Column type (numerical, categorical, text, etc.).""" 80 role: ColumnRole = ColumnRole.Unset 81 """Column role (target, feature, etc.).""" 82 83 84 @dataclasses.dataclass 85 class BinaryClassification: 86 """Configuration for binary classification evaluation tasks. 87 88 Maps columns containing target labels and predictions for binary classification. 89 Used in `DataDefinition` to specify which columns contain classification data. 
90 91 Example: 92 ```python 93 definition = DataDefinition( 94 classification=[BinaryClassification( 95 target="target", 96 prediction_labels="prediction" 97 )] 98 ) 99 ``` 100 """ 101 102 name: str 103 """Identifier for this classification task.""" 104 target: str 105 """Column name with true binary labels.""" 106 prediction_labels: Optional[str] 107 """Column name with predicted binary labels.""" 108 prediction_probas: Optional[str] 109 """Column name with predicted probabilities.""" 110 pos_label: Label 111 """Value representing the positive class.""" 112 labels: Optional[Dict[Label, str]] 113 """Optional mapping of label values to display names.""" 114 115 def __init__( 116 self, 117 *, 118 name: str = "default", 119 target: Optional[str] = None, 120 prediction_labels: Optional[str] = None, 121 prediction_probas: Optional[str] = None, 122 pos_label: Optional[str] = None, 123 labels: Optional[Dict[Label, str]] = None, 124 ): 125 """Initialize binary classification configuration. 126 127 If no arguments are provided, defaults to `target="target"` and `prediction_probas="prediction"`. 128 Otherwise, requires `target` and at least one of `prediction_labels` or `prediction_probas`. 
129 """ 130 self.name = name 131 if ( 132 target is None 133 and prediction_labels is None 134 and prediction_probas is None 135 and pos_label is None 136 and labels is None 137 ): 138 self.target = "target" 139 self.prediction_labels = None 140 self.prediction_probas = "prediction" 141 self.pos_label = 1 142 self.labels = None 143 return 144 if target is None or (prediction_labels is None and prediction_probas is None): 145 raise ValueError( 146 "Invalid BinaryClassification configuration:" " target and one of (labels or probas) should be set" 147 ) 148 self.target = target 149 self.prediction_labels = prediction_labels 150 self.prediction_probas = prediction_probas 151 self.pos_label = pos_label if pos_label is not None else 1 152 self.labels = labels 153 154 155 @dataclasses.dataclass 156 class MulticlassClassification: 157 """Configuration for multiclass classification evaluation tasks. 158 159 Maps columns containing target labels and predictions for multiclass classification. 160 Used in `DataDefinition` to specify which columns contain classification data. 
161 162 Example: 163 ```python 164 definition = DataDefinition( 165 classification=[MulticlassClassification( 166 target="target", 167 prediction_labels="prediction", 168 prediction_probas=["0", "1", "2"] 169 )] 170 ) 171 ``` 172 """ 173 174 name: str = "default" 175 """Identifier for this classification task.""" 176 target: str = "target" 177 """Column name with true class labels.""" 178 prediction_labels: Optional[str] = "prediction" 179 """Column name with predicted class labels.""" 180 prediction_probas: Optional[List[str]] = None 181 """List of column names with predicted probabilities per class.""" 182 labels: Optional[Dict[Label, str]] = None 183 """Optional mapping of label values to display names.""" 184 185 def __init__( 186 self, 187 *, 188 name: str = "default", 189 target: Optional[str] = None, 190 prediction_labels: Optional[str] = None, 191 prediction_probas: Optional[List[str]] = None, 192 labels: Optional[Dict[Label, str]] = None, 193 ): 194 """Initialize multiclass classification configuration. 195 196 If no arguments are provided, defaults to `target="target"` and `prediction_labels="prediction"`. 197 Otherwise, requires `target` and at least one of `prediction_labels` or `prediction_probas`. 
198 """ 199 self.name = name 200 if target is None and prediction_labels is None and prediction_probas is None and labels is None: 201 self.target = "target" 202 self.prediction_labels = "prediction" 203 self.prediction_probas = None 204 self.labels = None 205 return 206 if target is None or (prediction_labels is None and prediction_probas is None): 207 raise ValueError( 208 "Invalid MulticlassClassification configuration:" " target and one of (labels or probas) should be set" 209 ) 210 self.target = target 211 self.prediction_labels = prediction_labels 212 self.prediction_probas = prediction_probas 213 self.labels = labels 214 215 216 Classification = Union[BinaryClassification, MulticlassClassification] 217 218 219 @dataclasses.dataclass 220 class Regression: 221 """Configuration for regression evaluation tasks. 222 223 Maps columns containing target values and predictions for regression. 224 Used in `DataDefinition` to specify which columns contain regression data. 225 226 Example: 227 ```python 228 definition = DataDefinition( 229 regression=[Regression(target="y_true", prediction="y_pred")] 230 ) 231 ``` 232 """ 233 234 name: str = "default" 235 """Identifier for this regression task.""" 236 target: str = "target" 237 """Column name with actual/true values.""" 238 prediction: str = "prediction" 239 """Column name with predicted values.""" 240 241 242 @dataclasses.dataclass 243 class Recsys: 244 """Configuration for recommender systems and ranking evaluation tasks. 245 246 Maps columns for evaluating recommendation systems, including user-item interactions 247 and relevance scores. Used in `DataDefinition` to specify ranking/recsys data structure. 
248 249 Example: 250 ```python 251 definition = DataDefinition( 252 ranking=[Recsys()] 253 ) 254 ``` 255 """ 256 257 name: str = "default" 258 """Identifier for this ranking task.""" 259 user_id: str = "user_id" 260 """Column name with user identifiers.""" 261 item_id: str = "item_id" 262 """Column name with item identifiers.""" 263 target: str = "target" 264 """Column name with relevance labels/scores.""" 265 prediction: str = "prediction" 266 """Column name with predicted scores or ranks.""" 267 recommendations_type: str = "score" 268 """Type of prediction - "score" or "rank".""" 269 270 271 @dataclasses.dataclass 272 class Completion: 273 pass 274 275 276 @dataclasses.dataclass 277 class RAG: 278 pass 279 280 281 @dataclasses.dataclass 282 class LLMClassification: 283 """Configuration for LLM classification evaluation tasks. 284 285 Maps columns containing LLM inputs, outputs, and optional reasoning for LLM evaluation. 286 Used in `DataDefinition` to specify which columns contain LLM interaction data. 287 288 Example: 289 ```python 290 definition = DataDefinition( 291 llm=LLMClassification( 292 input="question", 293 target="expected_answer", 294 predictions="model_answer" 295 ) 296 ) 297 ``` 298 """ 299 300 input: str 301 """Column name with LLM input/prompt text.""" 302 target: str 303 """Column name with expected/ground truth output.""" 304 predictions: Optional[str] = None 305 """Column name with LLM-generated output.""" 306 reasoning: Optional[str] = None 307 """Column name with reasoning text.""" 308 prediction_reasoning: Optional[str] = None 309 """Column name with reasoning for predictions.""" 310 name: str = "llm_default" 311 """Identifier for this LLM task.""" 312 313 314 class SpecialColumnInfo(AutoAliasMixin, EvidentlyBaseModel): 315 """Base class for special column information. 316 317 Used to define special columns that require custom handling or metrics. 318 Subclasses can provide custom metrics and column type information. 
319 """ 320 321 __alias_type__: ClassVar = "special_column_info" 322 """Alias type for serialization.""" 323 324 class Config: 325 is_base_type = True 326 327 def get_metrics(self) -> List["MetricOrContainer"]: 328 """Get metrics associated with this special column. 329 330 Returns: 331 * List of metrics or metric containers. 332 """ 333 return [] 334 335 def get_column_type(self, column_name: str) -> Optional[ColumnType]: 336 """Get the column type for a column name. 337 338 Args: 339 * `column_name`: Name of the column. 340 341 Returns: 342 * `ColumnType` if known, `None` otherwise. 343 """ 344 return None 345 346 347 LLMDefinition = Union[Completion, RAG, LLMClassification] 348 349 350 DEFAULT_TRACE_LINK_COLUMN = "_evidently_trace_link" 351 352 353 class ServiceColumns(BaseModel): 354 """Service columns for special functionality. 355 356 Defines columns used for special features like trace linking and human feedback. 357 """ 358 359 trace_link: Optional[str] = None 360 """Optional column name for trace links.""" 361 human_feedback_label: Optional[str] = None 362 """Optional column name for human feedback labels.""" 363 human_feedback_comment: Optional[str] = None 364 """Optional column name for human feedback comments.""" 365 366 367 class DataDefinition(BaseModel): 368 """Maps column types and roles in your dataset for correct evaluation processing. 369 370 `DataDefinition` maps: 371 - Column types (e.g., categorical, numerical, text) 372 - Column roles (e.g., id, prediction, target, timestamp) 373 - Task-specific configurations (classification, regression, ranking, LLM) 374 375 This allows Evidently to process the data correctly. Some evaluations need specific 376 columns and will fail if they're missing. 377 378 **Documentation**: See [Data Definition Guide](https://docs.evidentlyai.com/docs/library/data_definition) for detailed mapping options. 
379 380 Auto-mapping (empty DataDefinition): 381 ```python 382 dataset = Dataset.from_pandas(df, data_definition=DataDefinition()) 383 ``` 384 385 Manual mapping: 386 ```python 387 definition = DataDefinition( 388 numerical_columns=["Age", "Salary"], 389 categorical_columns=["Department"], 390 classification=[BinaryClassification(target="target", prediction_labels="prediction")] 391 ) 392 ``` 393 """ 394 395 id_column: Optional[str] = None 396 """Column name with unique identifiers.""" 397 timestamp: Optional[str] = None 398 """Column name with timestamp values.""" 399 service_columns: Optional[ServiceColumns] = None 400 """Service columns like trace links.""" 401 numerical_columns: Optional[List[str]] = None 402 """List of numerical column names.""" 403 categorical_columns: Optional[List[str]] = None 404 """List of categorical column names.""" 405 text_columns: Optional[List[str]] = None 406 """List of text column names.""" 407 datetime_columns: Optional[List[str]] = None 408 """List of datetime column names.""" 409 unknown_columns: Optional[List[str]] = None 410 """List of unknown/unclassified column names.""" 411 list_columns: Optional[List[str]] = None 412 """List of list/array column names.""" 413 classification: Optional[List[Classification]] = None 414 """List of classification task configurations (`BinaryClassification` or `MulticlassClassification`).""" 415 regression: Optional[List[Regression]] = None 416 """List of regression task configurations (`Regression`).""" 417 llm: Optional[LLMDefinition] = None 418 """LLM task configuration (`LLMClassification`).""" 419 numerical_descriptors: List[str] = [] 420 """List of numerical descriptor column names.""" 421 categorical_descriptors: List[str] = [] 422 """List of categorical descriptor column names.""" 423 test_descriptors: Optional[List[str]] = None 424 """List of test descriptor column names.""" 425 ranking: Optional[List[Recsys]] = None 426 """List of ranking/recsys task configurations (`Recsys`).""" 427 
special_columns: List[SpecialColumnInfo] = [] 428 """Additional special column configurations.""" 429 embeddings: Optional[Dict[str, List[str]]] = None 430 """Embeddings columns definitions: mapping of embedding name to list of columns""" 431 432 def __init__( 433 self, 434 id_column: Optional[str] = None, 435 timestamp: Optional[str] = None, 436 numerical_columns: Optional[List[str]] = None, 437 categorical_columns: Optional[List[str]] = None, 438 text_columns: Optional[List[str]] = None, 439 datetime_columns: Optional[List[str]] = None, 440 classification: Optional[List[Classification]] = None, 441 regression: Optional[List[Regression]] = None, 442 llm: Optional[LLMDefinition] = None, 443 numerical_descriptors: Optional[List[str]] = None, 444 categorical_descriptors: Optional[List[str]] = None, 445 unknown_columns: Optional[List[str]] = None, 446 list_columns: Optional[List[str]] = None, 447 test_descriptors: Optional[List[str]] = None, 448 ranking: Optional[List[Recsys]] = None, 449 service_columns: Optional[ServiceColumns] = None, 450 special_columns: Optional[List[SpecialColumnInfo]] = None, 451 embeddings: Optional[Dict[str, List[str]]] = None, 452 ): 453 """Initialize DataDefinition with column mappings. 454 455 The constructor maps all parameters directly to the corresponding model fields. 456 If `numerical_descriptors` or `categorical_descriptors` are not provided, they default to empty lists. 
457 """ 458 super().__init__( 459 id_column=id_column, 460 timestamp=timestamp, 461 numerical_columns=numerical_columns, 462 categorical_columns=categorical_columns, 463 text_columns=text_columns, 464 datetime_columns=datetime_columns, 465 unknown_columns=unknown_columns, 466 list_columns=list_columns, 467 # classification=classification, 468 # regression=regression, 469 # llm=llm, 470 numerical_descriptors=numerical_descriptors if numerical_descriptors is not None else [], 471 categorical_descriptors=categorical_descriptors if categorical_descriptors is not None else [], 472 test_descriptors=test_descriptors, 473 # ranking=ranking, 474 service_columns=service_columns, 475 special_columns=special_columns if special_columns is not None else [], 476 ) 477 self.classification = classification 478 self.regression = regression 479 self.llm = llm 480 self.ranking = ranking 481 self.embeddings = embeddings 482 483 def get_numerical_columns(self): 484 """Get all numerical columns including descriptors. 485 486 Returns: 487 * List of numerical column names (includes both explicitly mapped and descriptor columns) 488 """ 489 return (self.numerical_columns or []) + (self.numerical_descriptors or []) 490 491 def get_categorical_columns(self): 492 """Get all categorical columns including descriptors. 493 494 Returns: 495 * List of categorical column names (includes both explicitly mapped and descriptor columns) 496 """ 497 return (self.categorical_columns or []) + (self.categorical_descriptors or []) 498 499 def get_text_columns(self): 500 """Get all text columns. 501 502 Returns: 503 * List of text column names 504 """ 505 return self.text_columns or [] 506 507 def get_datetime_columns(self): 508 """Get all datetime columns. 509 510 Returns: 511 * List of datetime column names 512 """ 513 return self.datetime_columns or [] 514 515 def get_unknown_columns(self): 516 """Get all unknown/unclassified columns. 
517 518 Returns: 519 * List of unknown column names 520 """ 521 return self.unknown_columns or [] 522 523 def get_list_columns(self): 524 """Get all list/array columns. 525 526 Returns: 527 * List of list column names 528 """ 529 return self.list_columns or [] 530 531 def get_column_type(self, column_name: str) -> ColumnType: 532 """Get the column type for a specific column. 533 534 Args: 535 * `column_name`: Name of the column to check 536 537 Returns: 538 * `evidently.legacy.core.ColumnType` enum value for the column 539 """ 540 if column_name in self.get_numerical_columns(): 541 return ColumnType.Numerical 542 if column_name in self.get_categorical_columns(): 543 return ColumnType.Categorical 544 if column_name in self.get_text_columns(): 545 return ColumnType.Text 546 if column_name in self.get_datetime_columns(): 547 return ColumnType.Datetime 548 if column_name in self.get_unknown_columns(): 549 return ColumnType.Unknown 550 if column_name in self.get_list_columns(): 551 return ColumnType.List 552 if column_name == self.timestamp: 553 return ColumnType.Date 554 if column_name == self.id_column: 555 return ColumnType.Id 556 for special_column in self.special_columns: 557 ct = special_column.get_column_type(column_name) 558 if ct is not None: 559 return ct 560 return ColumnType.Unknown 561 562 def get_classification(self, classification_id: str) -> Optional[Classification]: 563 """Get classification configuration by ID. 
564 565 Args: 566 * `classification_id`: Name/ID of the classification task 567 568 Returns: 569 * `BinaryClassification` or `MulticlassClassification` configuration or None if not found 570 """ 571 item_list = list(filter(lambda x: x.name == classification_id, self.classification or [])) 572 if len(item_list) == 0: 573 return None 574 if len(item_list) > 1: 575 raise ValueError("More than one classification with id {}".format(classification_id)) 576 return item_list[0] 577 578 def get_ranking(self, ranking_id: str) -> Optional[Recsys]: 579 """Get ranking/recsys configuration by ID. 580 581 Args: 582 * `ranking_id`: Name/ID of the ranking task 583 584 Returns: 585 * `Recsys` configuration or None if not found 586 """ 587 item_list = list(filter(lambda x: x.name == ranking_id, self.ranking or [])) 588 if len(item_list) == 0: 589 return None 590 if len(item_list) > 1: 591 raise ValueError("More than one ranking with id {}".format(ranking_id)) 592 return item_list[0] 593 594 def get_columns(self, types: List[ColumnType]) -> Generator[str, None, None]: 595 """Get column names of specified types. 596 597 Args: 598 * `types`: List of `evidently.legacy.core.ColumnType` values to filter by 599 600 Returns: 601 * Generator yielding column names matching the specified types 602 """ 603 if ColumnType.Numerical in types: 604 yield from self.get_numerical_columns() 605 if ColumnType.Categorical in types: 606 yield from self.get_categorical_columns() 607 if ColumnType.Text in types: 608 yield from self.get_text_columns() 609 if ColumnType.Datetime in types: 610 yield from self.get_datetime_columns() 611 if ColumnType.Unknown in types: 612 yield from self.get_unknown_columns() 613 if ColumnType.List in types: 614 yield from self.get_list_columns() 615 616 def get_regression(self, regression_id: str) -> Optional[Regression]: 617 """Get regression configuration by ID. 
618 619 Args: 620 * `regression_id`: Name/ID of the regression task 621 622 Returns: 623 * `Regression` configuration or None if not found 624 """ 625 item_list = list(filter(lambda x: x.name == regression_id, self.regression or [])) 626 if len(item_list) == 0: 627 return None 628 if len(item_list) > 1: 629 raise ValueError("More than one regression with id {}".format(regression_id)) 630 return item_list[0] 631 632 633 class DatasetColumn: 634 """Wrapper for a single column in a dataset. 635 636 Contains the column type and the actual data as a pandas Series. 637 Used internally to access column data with type information. 638 """ 639 640 type: ColumnType 641 """Column type (numerical, categorical, text, etc.).""" 642 data: pd.Series 643 """Pandas Series containing the column data.""" 644 645 def __init__(self, type: Union[str, ColumnType], data: pd.Series) -> None: 646 """Initialize a dataset column. 647 648 Args: 649 * `type`: `ColumnType` or string name of the column type. 650 * `data`: `pandas.Series` containing the column data. 651 """ 652 self.type = ColumnType(type) 653 self.data = data 654 655 656 class ColumnCondition(AutoAliasMixin, EvidentlyBaseModel, abc.ABC): 657 """Base class for column value conditions. 658 659 Used to define conditions that check values in a column (e.g., greater than, 660 in range, matches pattern). Used in descriptor tests and column filters. 661 """ 662 663 __alias_type__: ClassVar[str] = "column_condition" 664 """Alias type for serialization.""" 665 666 class Config: 667 is_base_type = True 668 669 @abstractmethod 670 def check(self, value: Any) -> bool: 671 """Check if a value satisfies the condition. 672 673 Args: 674 * `value`: Value to check. 675 676 Returns: 677 * `True` if condition is satisfied, `False` otherwise. 678 """ 679 raise NotImplementedError 680 681 @abstractmethod 682 def get_default_alias(self, column: str) -> str: 683 """Get default alias name for this condition. 
684 685 Args: 686 * `column`: Column name this condition applies to. 687 688 Returns: 689 * Default alias string. 690 """ 691 raise NotImplementedError 692 693 694 class DescriptorTest(BaseModel): 695 """Test condition for a descriptor column. 696 697 Defines a condition to test values in a descriptor column. Can be used 698 to create derived descriptors based on test results. 699 """ 700 701 condition: ColumnCondition 702 """Column condition to apply.""" 703 column: Optional[str] = None 704 """Optional column name (uses parent descriptor column if None).""" 705 alias: Optional[str] = None 706 """Optional alias name for the test result.""" 707 708 def __init__( 709 self, 710 condition: Union[ColumnCondition, GenericTest], 711 column: Optional[str] = None, 712 alias: Optional[str] = None, 713 **data: Any, 714 ) -> None: 715 c: ColumnCondition = condition.for_descriptor().condition if isinstance(condition, GenericTest) else condition 716 super().__init__(alias=alias, column=column, condition=c, **data) 717 718 def to_descriptor(self, descriptor: Optional["Descriptor"] = None) -> "Descriptor": 719 if self.column is None: 720 if descriptor is None: 721 raise ValueError("Parent descriptor is required for test without column") 722 descriptor_columns = descriptor.list_output_columns() 723 if len(descriptor_columns) == 1: 724 column = descriptor_columns[0] 725 else: 726 raise ValueError( 727 f"Column is required for test with multiple columns in parent descriptor: [{', '.join(descriptor_columns)}]" 728 ) 729 else: 730 column = self.column 731 return ColumnTest(column, self.condition, self.alias or self.condition.get_default_alias(column)) 732 733 734 AnyDescriptorTest = Union["DescriptorTest", "GenericTest"] 735 736 737 class Descriptor(AutoAliasMixin, EvidentlyBaseModel, abc.ABC): 738 """Base class for descriptors that compute row-level features. 
739 740 Descriptors compute additional columns from existing data (e.g., text length, 741 sentiment score, custom transformations). Used to enrich datasets with 742 computed features for evaluation. 743 """ 744 745 class Config: 746 is_base_type = True 747 748 __alias_type__: ClassVar = "descriptor_v2" 749 """Alias type for serialization.""" 750 751 alias: str 752 """Name for the descriptor output column.""" 753 tests: List[DescriptorTest] = [] 754 """List of test conditions to apply to descriptor values.""" 755 756 def __init__(self, alias: str, tests: Optional[List[AnyDescriptorTest]] = None, **data: Any) -> None: 757 self.alias = alias 758 self.tests = [t.for_descriptor() if isinstance(t, GenericTest) else t for t in (tests or [])] 759 super().__init__(**data) 760 761 @abc.abstractmethod 762 def generate_data( 763 self, dataset: "Dataset", options: Options 764 ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: 765 raise NotImplementedError() 766 767 def validate_input(self, data_definition: DataDefinition) -> None: 768 input_columns = self.list_input_columns() 769 if input_columns is not None: 770 all_columns = set(data_definition.get_columns(list(ColumnType))) 771 for column in input_columns: 772 if column not in all_columns: 773 raise ValueError( 774 f"Column '{column}' is not found in dataset. Available columns: [{', '.join(all_columns)}]" 775 ) 776 777 def list_output_columns(self) -> List[str]: # todo: also types? 
778 return [self.alias] 779 780 def list_input_columns(self) -> Optional[List[str]]: # todo: make not optional 781 return None 782 783 def get_sub_descriptors(self) -> List["Descriptor"]: 784 return [t.to_descriptor(self) for t in self.tests] 785 786 def get_special_columns_info(self, rename: Dict[str, str]) -> List[SpecialColumnInfo]: 787 return [] 788 789 def add_to_descriptors_list(self) -> bool: 790 return True 791 792 793 class SingleInputDescriptor(Descriptor, abc.ABC): 794 """Base class for descriptors that operate on a single input column. 795 796 Simplifies descriptor implementation for descriptors that only need one 797 input column. Subclasses only need to implement `generate_data()`. 798 """ 799 800 column: str 801 """Name of the input column to process.""" 802 803 def list_input_columns(self) -> List[str]: 804 """Get the list of input columns. 805 806 Returns: 807 * List containing the single input column name. 808 """ 809 return [self.column] 810 811 812 class ColumnTest(SingleInputDescriptor): 813 """Descriptor that tests values in a column against a condition. 814 815 Creates a boolean descriptor column indicating whether each value in the 816 input column satisfies the condition. Useful for filtering or flagging rows. 
817 """ 818 819 column: str 820 """Name of the input column to test.""" 821 condition: ColumnCondition 822 """Column condition to apply.""" 823 824 def __init__( 825 self, column: str, condition: Union[ColumnCondition, GenericTest], alias: Optional[str] = None, **data: Any 826 ) -> None: 827 self.column = column 828 if isinstance(condition, dict): 829 condition = parse_obj_as(ColumnCondition, condition) # type: ignore[type-abstract] 830 descriptor_condition: ColumnCondition = ( 831 condition if isinstance(condition, ColumnCondition) else condition.for_descriptor().condition 832 ) 833 self.condition = descriptor_condition 834 super().__init__(alias=alias or descriptor_condition.get_default_alias(column), **data) 835 836 def generate_data( 837 self, dataset: "Dataset", options: Options 838 ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: 839 """Generate a boolean column indicating which rows satisfy the condition. 840 841 Args: 842 * `dataset`: `Dataset` to process. 843 * `options`: Processing options. 844 845 Returns: 846 * `DatasetColumn` with boolean values (True if condition passes, False otherwise). 847 """ 848 data = dataset.column(self.column) 849 res = data.data.apply(self.condition.check) 850 return DatasetColumn(ColumnType.Categorical, res) 851 852 853 class TestSummaryInfo(SpecialColumnInfo): 854 """Special column information for test summary aggregation. 855 856 Defines columns that aggregate test results across multiple descriptors, 857 providing summary statistics like "all tests pass", "any test fails", etc. 
858 """ 859 860 all_column: Optional[str] = None 861 """Optional column name for 'all tests pass' indicator.""" 862 any_column: Optional[str] = None 863 """Optional column name for 'any test fails' indicator.""" 864 count_column: Optional[str] = None 865 """Optional column name for test failure count.""" 866 rate_column: Optional[str] = None 867 """Optional column name for test failure rate.""" 868 score_column: Optional[str] = None 869 """Optional column name for weighted test score.""" 870 score_weights: Optional[Dict[str, float]] = None 871 """Optional dictionary mapping test names to weights.""" 872 873 @property 874 def has_all(self): 875 """Check if 'all' column is configured. 876 877 Returns: 878 * `True` if `any_column` is set, `False` otherwise. 879 """ 880 return self.any_column is not None 881 882 @property 883 def has_any(self): 884 """Check if 'any' column is configured. 885 886 Returns: 887 * `True` if `any_column` is set, `False` otherwise. 888 """ 889 return self.any_column is not None 890 891 @property 892 def has_count(self): 893 """Check if 'count' column is configured. 894 895 Returns: 896 * `True` if `count_column` is set, `False` otherwise. 897 """ 898 return self.count_column is not None 899 900 @property 901 def has_rate(self): 902 """Check if 'rate' column is configured. 903 904 Returns: 905 * `True` if `rate_column` is set, `False` otherwise. 906 """ 907 return self.rate_column is not None 908 909 @property 910 def has_score(self): 911 """Check if 'score' column is configured. 912 913 Returns: 914 * `True` if `score_column` is set, `False` otherwise. 915 """ 916 return self.score_column is not None 917 918 def get_metrics(self) -> List["MetricOrContainer"]: 919 """Get metrics for aggregating test summary columns. 920 921 Returns: 922 * List containing a `TestSummaryInfoPreset` metric. 
923 """ 924 from evidently.presets.special import TestSummaryInfoPreset 925 926 return [TestSummaryInfoPreset(column_info=self)] 927 928 def get_column_type(self, column_name: str) -> Optional[ColumnType]: 929 """Get the column type for a summary column name. 930 931 Args: 932 * `column_name`: Name of the column to check. 933 934 Returns: 935 * `ColumnType.Categorical` for all/any columns, `ColumnType.Numerical` for count/rate/score columns, or `None` if not found. 936 """ 937 if column_name in (self.all_column, self.any_column): 938 return ColumnType.Categorical 939 if column_name in (self.count_column, self.rate_column, self.score_column): 940 return ColumnType.Numerical 941 return None 942 943 944 class TestSummary(Descriptor): 945 """Descriptor that aggregates test results across multiple test descriptors. 946 947 Computes summary statistics from boolean test result columns, such as: 948 - Whether all tests pass for each row 949 - Whether any test fails for each row 950 - Count and rate of passing tests 951 - Weighted score across tests 952 """ 953 954 success_all: bool = True 955 """Whether to compute 'all tests pass' indicator.""" 956 success_any: bool = False 957 """Whether to compute 'any test fails' indicator.""" 958 success_count: bool = False 959 """Whether to compute count of passing tests.""" 960 success_rate: bool = False 961 """Whether to compute proportion of passing tests.""" 962 score: bool = False 963 """Whether to compute weighted score across tests.""" 964 score_weights: Optional[Dict[str, float]] = None 965 """Optional dictionary mapping test names to weights for scoring.""" 966 normalize_scores: bool = True 967 """Whether to normalize scores by total weight.""" 968 969 def __init__( 970 self, 971 success_all: bool = True, 972 success_any: bool = False, 973 success_count: bool = False, 974 success_rate: bool = False, 975 score: bool = False, 976 score_weights: Optional[Dict[str, float]] = None, 977 alias: Optional[str] = None, 978 
normalize_scores: bool = True, 979 **data: Any, 980 ): 981 self.success_all = success_all 982 self.success_any = success_any 983 self.success_count = success_count 984 self.success_rate = success_rate 985 self.score = score 986 self.score_weights = score_weights 987 self.normalize_scores = normalize_scores 988 super().__init__(alias=alias or "summary", **data) 989 990 def generate_data( 991 self, dataset: "Dataset", options: Options 992 ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: 993 """Generate summary columns from test result columns. 994 995 Aggregates boolean test results into summary statistics based on 996 configured flags (success_all, success_any, success_count, etc.). 997 998 Args: 999 * `dataset`: `Dataset` containing test result columns. 1000 * `options`: Processing options. 1001 1002 Returns: 1003 * Dictionary of summary columns, or single column if only one is generated. 1004 1005 Raises: 1006 * `ValueError`: If no tests are specified or no summary columns are configured. 
1007 """ 1008 tests = dataset.data_definition.test_descriptors or [] 1009 if len(tests) == 0: 1010 raise ValueError("No tests specified") 1011 summary_columns = {} 1012 test_results = dataset.as_dataframe()[tests] 1013 if self.success_count: 1014 summary_columns["success_count"] = (ColumnType.Numerical, test_results.sum(axis=1)) 1015 if self.success_rate: 1016 summary_columns["success_rate"] = (ColumnType.Numerical, test_results.sum(axis=1) / len(tests)) 1017 if self.success_all: 1018 summary_columns["success_all"] = (ColumnType.Categorical, test_results.all(axis=1)) 1019 if self.success_any: 1020 summary_columns["success_any"] = (ColumnType.Categorical, test_results.any(axis=1)) 1021 if self.score: 1022 weights = self.score_weights or {t: 1 for t in tests} 1023 total_weight = sum(weights.values()) if self.normalize_scores else 1 1024 summary_columns["score"] = ( # type: ignore[assignment] 1025 ColumnType.Numerical, 1026 sum(test_results[col] * weight / total_weight for col, weight in weights.items()), 1027 ) 1028 alias = self.alias or "summary" 1029 result = {f"{alias}_{key}": DatasetColumn(ct, value) for key, (ct, value) in summary_columns.items()} 1030 if len(tests) == 0: 1031 raise ValueError("No summary columns specified") 1032 if len(result) == 1: 1033 return {alias: list(result.values())[0]} 1034 return result 1035 1036 def list_input_columns(self) -> Optional[List[str]]: 1037 """Get list of input columns needed for this descriptor. 1038 1039 Returns: 1040 * List of test column names if score weights are specified, `None` otherwise. 1041 """ 1042 if self.score and self.score_weights is not None: 1043 return list(self.score_weights.keys()) 1044 return None 1045 1046 def get_special_columns_info(self, rename: Dict[str, str]) -> List[SpecialColumnInfo]: 1047 """Get special column information for test summary aggregation. 1048 1049 Args: 1050 * `rename`: Dictionary mapping internal column names to final names. 
1051 1052 Returns: 1053 * List of `TestSummaryInfo` objects describing the summary columns. 1054 """ 1055 alias = self.alias or "summary" 1056 if len(rename) == 1: 1057 return [ 1058 TestSummaryInfo( 1059 all_column=rename[alias] if self.success_all else None, 1060 any_column=rename[alias] if self.success_any else None, 1061 count_column=rename[alias] if self.success_count else None, 1062 rate_column=rename[alias] if self.success_rate else None, 1063 score_column=rename[alias] if self.score else None, 1064 ) 1065 ] 1066 1067 return [ 1068 TestSummaryInfo( 1069 all_column=rename[f"{alias}_success_all"] if self.success_all else None, 1070 any_column=rename[f"{alias}_success_any"] if self.success_any else None, 1071 count_column=rename[f"{alias}_success_count"] if self.success_count else None, 1072 rate_column=rename[f"{alias}_success_rate"] if self.success_rate else None, 1073 score_column=rename[f"{alias}_score"] if self.score else None, 1074 score_weights=self.score_weights, 1075 ) 1076 ] 1077 1078 def add_to_descriptors_list(self) -> bool: 1079 return False 1080 1081 1082 class FeatureDescriptor(Descriptor): 1083 feature: GeneratedFeatures 1084 1085 def __init__( 1086 self, feature: GeneratedFeatures, alias: Optional[str] = None, tests: Optional[List[AnyDescriptorTest]] = None 1087 ): 1088 # this is needed because we try to access it before super call 1089 feature = feature if isinstance(feature, GeneratedFeatures) else parse_obj_as(GeneratedFeatures, feature) # type: ignore[type-abstract] 1090 feature_columns = feature.list_columns() 1091 super().__init__(feature=feature, alias=alias or f"{feature_columns[0].display_name}", tests=tests) 1092 1093 def get_dataset_column(self, column_name: str, values: pd.Series) -> DatasetColumn: 1094 column_type = self.feature.get_type(column_name) 1095 if column_type == ColumnType.Numerical: 1096 values = pd.to_numeric(values, errors="coerce") 1097 dataset_column = DatasetColumn(type=column_type, data=values) 1098 return 
dataset_column 1099 1100 def generate_data( 1101 self, dataset: "Dataset", options: Options 1102 ) -> Union[DatasetColumn, Dict[DisplayName, DatasetColumn]]: 1103 feature = self.feature.generate_features_renamed( 1104 dataset.as_dataframe(), 1105 create_data_definition(None, dataset.as_dataframe(), ColumnMapping()), 1106 options, 1107 ) 1108 return { 1109 col.display_name: self.get_dataset_column(col.name, feature[col.name]) 1110 for col in self.feature.list_columns() 1111 } 1112 1113 def list_output_columns(self) -> List[str]: 1114 return [c.display_name for c in self.feature.list_columns()] 1115 1116 1117 def _determine_descriptor_column_name(alias: str, columns: List[str]): 1118 index = 1 1119 key = alias 1120 while key in columns: 1121 key = f"{alias}_{index}" 1122 index += 1 1123 return key 1124 1125 1126 @dataclasses.dataclass 1127 class StatCountValue: 1128 count: int 1129 share: float 1130 1131 1132 @dataclasses.dataclass 1133 class GeneralColumnStats: 1134 missing_values: StatCountValue 1135 1136 1137 @dataclasses.dataclass 1138 class NumericalColumnStats: 1139 max: Numeric 1140 min: Numeric 1141 mean: Numeric 1142 std: Numeric 1143 quantiles: Dict[str, Numeric] 1144 infinite: StatCountValue 1145 1146 1147 @dataclasses.dataclass 1148 class LabelStats: 1149 count: StatCountValue 1150 1151 1152 @dataclasses.dataclass 1153 class CategoricalColumnStats: 1154 unique_count: int 1155 label_stats: Dict[Label, LabelStats] 1156 1157 @property 1158 def most_common(self) -> Optional[Tuple[Label, LabelStats]]: 1159 most_common = None 1160 for key, value in self.label_stats.items(): 1161 if most_common is None: 1162 most_common = key 1163 continue 1164 if self.label_stats[most_common].count < value.count: 1165 most_common = key 1166 if most_common is None: 1167 return None 1168 return most_common, self.label_stats[most_common] 1169 1170 1171 @dataclasses.dataclass 1172 class ColumnStats: 1173 general_stats: GeneralColumnStats 1174 numerical_stats: 
Optional[NumericalColumnStats] 1175 categorical_stats: Optional[CategoricalColumnStats] 1176 1177 1178 @dataclasses.dataclass 1179 class DatasetStats: 1180 """Statistics summary for a dataset. 1181 1182 Contains overall dataset statistics including row count, column count, 1183 and per-column statistics. 1184 """ 1185 1186 row_count: int 1187 """Total number of rows in the dataset.""" 1188 column_count: int 1189 """Total number of columns in the dataset.""" 1190 column_stats: Dict[str, ColumnStats] 1191 """Dictionary mapping column names to their ColumnStats.""" 1192 1193 1194 PossibleDatasetTypes = Union["Dataset", pd.DataFrame] 1195 1196 1197 class Dataset: 1198 """Dataset object that wraps your data with metadata and data definition. 1199 1200 `Dataset` is the main data structure in Evidently. It wraps a `pandas.DataFrame` 1201 with additional metadata including: 1202 - `DataDefinition`: column types and roles mapping 1203 - Descriptors: computed row-level scores (for text/LLM data) 1204 - Metadata and tags: additional information about the dataset 1205 1206 You typically create a `Dataset` from a `pandas.DataFrame` using `Dataset.from_pandas()`. 1207 Use `Dataset` objects with `Report.run()` to perform evaluations. 1208 1209 **Documentation**: See [Data Definition Guide](https://docs.evidentlyai.com/docs/library/data_definition) for column mapping. 
1210 1211 Create from pandas DataFrame: 1212 ```python 1213 from evidently import Dataset, DataDefinition 1214 1215 dataset = Dataset.from_pandas( 1216 source_df, 1217 data_definition=DataDefinition() 1218 ) 1219 ``` 1220 1221 Add descriptors for text evaluation: 1222 ```python 1223 from evidently.descriptors import TextLength 1224 1225 dataset.add_descriptors([TextLength(column="text")]) 1226 ``` 1227 1228 Use in a Report: 1229 ```python 1230 from evidently import Report 1231 from evidently.presets import DataSummaryPreset 1232 1233 report = Report([DataSummaryPreset()]) 1234 snapshot = report.run(dataset, None) 1235 ``` 1236 """ 1237 1238 _data_definition: DataDefinition 1239 _metadata: Dict[str, MetadataValueType] 1240 _tags: List[str] 1241 1242 @classmethod 1243 def from_pandas( 1244 cls, 1245 data: pd.DataFrame, 1246 data_definition: Optional[DataDefinition] = None, 1247 descriptors: Optional[List[Descriptor]] = None, 1248 options: AnyOptions = None, 1249 metadata: Optional[Dict[str, MetadataValueType]] = None, 1250 tags: Optional[List[str]] = None, 1251 ) -> "Dataset": 1252 """Create a `Dataset` from a `pandas.DataFrame`. 1253 1254 Args: 1255 * `data`: `pandas.DataFrame` with your data 1256 * `data_definition`: Optional `DataDefinition` for column mapping (auto-inferred if None). 1257 Use `DataDefinition()` for automatic mapping or provide explicit column mappings. 
1258 * `descriptors`: Optional list of descriptors to compute and add to dataset 1259 * `options`: Optional options for descriptor computation 1260 * `metadata`: Optional metadata dictionary 1261 * `tags`: Optional list of tags 1262 1263 Returns: 1264 * `Dataset` object ready for use with `Report.run()` 1265 1266 Example: 1267 ```python 1268 from evidently import Dataset, DataDefinition 1269 1270 dataset = Dataset.from_pandas(df, data_definition=DataDefinition()) 1271 ``` 1272 """ 1273 dataset = PandasDataset(data, data_definition, metadata=metadata, tags=tags) 1274 if descriptors is not None: 1275 dataset.add_descriptors(descriptors, options) 1276 return dataset 1277 1278 @staticmethod 1279 def from_any(dataset: PossibleDatasetTypes) -> "Dataset": 1280 """Convert various dataset types to a `Dataset` object. 1281 1282 Args: 1283 * `dataset`: `pandas.DataFrame` or `Dataset` object 1284 1285 Returns: 1286 * `Dataset` object (converts DataFrame if needed) 1287 1288 Raises: 1289 * ValueError if dataset type is not supported 1290 """ 1291 if isinstance(dataset, Dataset): 1292 return dataset 1293 if isinstance(dataset, pd.DataFrame): 1294 return Dataset.from_pandas(dataset) 1295 raise ValueError(f"Unsupported dataset type: {type(dataset)}") 1296 1297 @abstractmethod 1298 def as_dataframe(self) -> pd.DataFrame: 1299 """Get the underlying `pandas.DataFrame`. 1300 1301 Returns: 1302 * `pandas.DataFrame` with all data including computed descriptors 1303 """ 1304 raise NotImplementedError() 1305 1306 @abstractmethod 1307 def column(self, column_name: str) -> DatasetColumn: 1308 """Get a specific column from the dataset. 1309 1310 Args: 1311 * `column_name`: Name of the column to retrieve 1312 1313 Returns: 1314 * `DatasetColumn` object with column data and type information 1315 """ 1316 raise NotImplementedError() 1317 1318 @abstractmethod 1319 def subdataset(self, column_name: str, label: object) -> "Dataset": 1320 """Create a filtered subdataset matching a column value. 

        Args:
        * `column_name`: Column to filter by
        * `label`: Value to filter for

        Returns:
        * New `Dataset` containing only rows where column equals label
        """
        raise NotImplementedError()

    @abstractmethod
    def stats(self) -> DatasetStats:
        """Get statistical summary of the dataset.

        Returns:
        * `DatasetStats` object with row count, column count, and per-column statistics
        """
        raise NotImplementedError()

    @property
    def data_definition(self) -> DataDefinition:
        """Get the `DataDefinition` mapping for this dataset.

        Returns:
        * `DataDefinition` object with column type and role mappings
        """
        return self._data_definition

    @property
    def metadata(self) -> Dict[str, MetadataValueType]:
        """Get metadata associated with this dataset.

        Returns:
        * Dictionary of metadata key-value pairs
        """
        return self._metadata

    @property
    def tags(self) -> List[str]:
        """Get tags associated with this dataset.

        Returns:
        * List of tag strings
        """
        return self._tags

    @abstractmethod
    def add_descriptor(self, descriptor: Descriptor, options: AnyOptions = None):
        """Add a descriptor to compute row-level scores.

        Args:
        * `descriptor`: `Descriptor` object to compute
        * `options`: Optional options for descriptor computation
        """
        raise NotImplementedError

    def add_descriptors(self, descriptors: List[Descriptor], options: AnyOptions = None):
        """Add multiple descriptors to the dataset.
1380 1381 Args: 1382 * `descriptors`: List of `Descriptor` objects to compute 1383 * `options`: Optional options for descriptor computation 1384 """ 1385 for descriptor in descriptors: 1386 self.add_descriptor(descriptor, options) 1387 1388 @abstractmethod 1389 def save(self, uri: str): 1390 """Save the dataset to a file. 1391 1392 Args: 1393 * `uri`: File path to save the dataset (supports .evidently_dataset format) 1394 """ 1395 raise NotImplementedError 1396 1397 @classmethod 1398 @abstractmethod 1399 def _can_load(cls, uri: str) -> bool: 1400 raise NotImplementedError 1401 1402 @classmethod 1403 @abstractmethod 1404 def _load(cls, uri: str) -> "Dataset": 1405 raise NotImplementedError 1406 1407 @classmethod 1408 def load(cls, uri: str) -> "Dataset": 1409 """Load a dataset from a file. 1410 1411 Args: 1412 * `uri`: File path to load from (supports CSV, Parquet, and .evidently_dataset formats) 1413 1414 Returns: 1415 * `Dataset` object loaded from file 1416 1417 Raises: 1418 * Exception if dataset cannot be loaded 1419 """ 1420 for subclass in cls.__subclasses__(): 1421 if subclass._can_load(uri): 1422 return subclass._load(uri) 1423 raise Exception(f"Dataset {uri} could not be loaded") 1424 1425 1426 INTEGER_CARDINALITY_LIMIT = 10 1427 1428 1429 def infer_column_type(column_data: pd.Series) -> ColumnType: 1430 if column_data.dtype.name.startswith("float"): 1431 return ColumnType.Numerical 1432 if column_data.dtype.name.startswith("int"): 1433 if column_data.nunique() <= INTEGER_CARDINALITY_LIMIT: 1434 return ColumnType.Categorical 1435 else: 1436 return ColumnType.Numerical 1437 if column_data.dtype.name in ["string", "str"]: 1438 if column_data.nunique() > (column_data.count() * 0.5): 1439 return ColumnType.Text 1440 else: 1441 return ColumnType.Categorical 1442 if column_data.dtype.name == "object": 1443 without_na = column_data.dropna() 1444 if without_na.count() == 0: 1445 return ColumnType.Unknown 1446 if isinstance(without_na.iloc[0], str) and 
isinstance(without_na.iloc[-1], str): 1447 if column_data.nunique() > (column_data.count() * 0.5): 1448 return ColumnType.Text 1449 else: 1450 return ColumnType.Categorical 1451 elif isinstance(without_na.iloc[0], (list, tuple)) and isinstance(without_na.iloc[-1], (list, tuple)): 1452 return ColumnType.List 1453 return ColumnType.Unknown 1454 if column_data.dtype.name in ["bool", "category"]: 1455 return ColumnType.Categorical 1456 if column_data.dtype.name.startswith("datetime"): 1457 return ColumnType.Datetime 1458 return ColumnType.Unknown 1459 1460 1461 MARKER_CONTENT = """{"version": "1.0"}""" 1462 MARKER_FILENAME = ".evidently_dataset" 1463 DATA_FILENAME = "data.parquet" 1464 META_FILENAME = "dataset.json" 1465 1466 1467 def _write_evidently_dataset(dataset: Dataset, uri: str): 1468 with tarfile.open(uri, "w") as tar: # todo: use fsspec location 1469 # Add marker file 1470 marker_data = MARKER_CONTENT.encode("utf-8") 1471 marker_info = tarfile.TarInfo(MARKER_FILENAME) 1472 marker_info.size = len(marker_data) 1473 tar.addfile(marker_info, io.BytesIO(marker_data)) 1474 1475 # Add dataframe as parquet 1476 buffer = io.BytesIO() 1477 dataset.as_dataframe().to_parquet(buffer, index=False) 1478 buffer.seek(0) 1479 data_info = tarfile.TarInfo(DATA_FILENAME) 1480 data_info.size = len(buffer.getbuffer()) 1481 tar.addfile(data_info, buffer) 1482 1483 # Add metadata as JSON 1484 metadata = { 1485 "tags": dataset.tags, 1486 "metadata": dataset.metadata, 1487 "data_definition": dataset.data_definition.dict(), 1488 } 1489 meta_bytes = json.dumps(metadata, indent=2).encode("utf-8") 1490 meta_info = tarfile.TarInfo(META_FILENAME) 1491 meta_info.size = len(meta_bytes) 1492 tar.addfile(meta_info, io.BytesIO(meta_bytes)) 1493 1494 1495 def _read_evidently_dataset(uri: str) -> Dataset: 1496 with tarfile.open(uri, "r") as tar: 1497 names = tar.getnames() 1498 1499 # Check marker 1500 if MARKER_FILENAME not in names: 1501 raise ValueError("Not a valid Evidently dataset: missing 
marker") 1502 marker_file = tar.extractfile(MARKER_FILENAME) 1503 if marker_file is None or marker_file.read().decode("utf-8") != MARKER_CONTENT: 1504 raise ValueError("Invalid Evidently dataset marker content") 1505 1506 # Load dataframe 1507 if DATA_FILENAME not in names: 1508 raise ValueError("Missing data file in Evidently dataset") 1509 data_file = tar.extractfile(DATA_FILENAME) 1510 if data_file is None: 1511 raise ValueError("Missing data file in Evidently dataset") 1512 df = pd.read_parquet(data_file) 1513 1514 # Load metadata 1515 if META_FILENAME not in names: 1516 raise ValueError("Missing metadata file in Evidently dataset") 1517 meta_file = tar.extractfile(META_FILENAME) 1518 if meta_file is None: 1519 raise ValueError("Missing metadata file in Evidently dataset") 1520 metadata = json.load(meta_file) 1521 1522 return Dataset.from_pandas( 1523 df, 1524 data_definition=DataDefinition.parse_obj(metadata["data_definition"]), 1525 metadata=metadata["metadata"], 1526 tags=metadata["tags"], 1527 ) 1528 1529 1530 class PandasDataset(Dataset): 1531 SUPPORTED_FORMATS = {"csv": pd.read_csv, "parquet": pd.read_parquet, EVIDENTLY_DATASET_EXT: _read_evidently_dataset} 1532 _data: pd.DataFrame 1533 _data_definition: DataDefinition 1534 _dataset_stats: DatasetStats 1535 _metadata: Dict[str, MetadataValueType] 1536 _tags: List[str] 1537 1538 def __init__( 1539 self, 1540 data: pd.DataFrame, 1541 data_definition: Optional[DataDefinition] = None, 1542 metadata: Optional[Dict[str, MetadataValueType]] = None, 1543 tags: Optional[List[str]] = None, 1544 ): 1545 self._data = data.copy() 1546 if ( 1547 data_definition is None 1548 or data_definition.datetime_columns is None 1549 or data_definition.categorical_columns is None 1550 or data_definition.text_columns is None 1551 or data_definition.numerical_columns is None 1552 or data_definition.unknown_columns is None 1553 or data_definition.list_columns is None 1554 ): 1555 reserved_fields = [] 1556 if data_definition is not 
None: 1557 if data_definition.service_columns is not None: 1558 if data_definition.service_columns.trace_link is not None: 1559 reserved_fields.append(data_definition.service_columns.trace_link) 1560 if data_definition.timestamp is not None: 1561 reserved_fields.append(data_definition.timestamp) 1562 if data_definition.id_column is not None: 1563 reserved_fields.append(data_definition.id_column) 1564 if data_definition.numerical_columns is not None: 1565 reserved_fields.extend(data_definition.numerical_columns) 1566 if data_definition.categorical_columns is not None: 1567 reserved_fields.extend(data_definition.categorical_columns) 1568 if data_definition.datetime_columns is not None: 1569 reserved_fields.extend(data_definition.datetime_columns) 1570 if data_definition.text_columns is not None: 1571 reserved_fields.extend(data_definition.text_columns) 1572 if data_definition.unknown_columns is not None: 1573 reserved_fields.extend(data_definition.unknown_columns) 1574 if data_definition.list_columns is not None: 1575 reserved_fields.extend(data_definition.list_columns) 1576 if data_definition.numerical_descriptors is not None: 1577 reserved_fields.extend(data_definition.numerical_descriptors) 1578 if data_definition.categorical_descriptors is not None: 1579 reserved_fields.extend(data_definition.categorical_descriptors) 1580 generated_data_definition = self._generate_data_definition( 1581 data, 1582 reserved_fields, 1583 data_definition.service_columns if data_definition is not None else None, 1584 ) 1585 if data_definition is None: 1586 self._data_definition = generated_data_definition 1587 else: 1588 self._data_definition = copy.deepcopy(data_definition) 1589 if self._data_definition.datetime_columns is None: 1590 if self._data_definition.timestamp is not None and generated_data_definition.timestamp is not None: 1591 self._data_definition.datetime_columns = [generated_data_definition.timestamp] 1592 else: 1593 self._data_definition.datetime_columns = 
generated_data_definition.datetime_columns 1594 if self._data_definition.numerical_columns is None: 1595 self._data_definition.numerical_columns = generated_data_definition.numerical_columns 1596 if self._data_definition.categorical_columns is None: 1597 self._data_definition.categorical_columns = generated_data_definition.categorical_columns 1598 if self._data_definition.text_columns is None: 1599 self._data_definition.text_columns = generated_data_definition.text_columns 1600 if self._data_definition.unknown_columns is None: 1601 self._data_definition.unknown_columns = generated_data_definition.unknown_columns 1602 if self._data_definition.list_columns is None: 1603 self._data_definition.list_columns = generated_data_definition.list_columns 1604 if self._data_definition.timestamp is None and generated_data_definition.timestamp is not None: 1605 self._data_definition.timestamp = generated_data_definition.timestamp 1606 if ( 1607 self._data_definition.service_columns is None 1608 and generated_data_definition.service_columns is not None 1609 ): 1610 self._data_definition.service_columns = generated_data_definition.service_columns 1611 else: 1612 self._data_definition = copy.deepcopy(data_definition) 1613 (rows, columns) = data.shape 1614 1615 column_stats = {} 1616 for column in data.columns: 1617 column_stats[column] = self._collect_stats(self._data_definition.get_column_type(column), data[column]) 1618 self._dataset_stats = DatasetStats(rows, columns, column_stats) 1619 self._metadata = metadata or {} 1620 self._tags = tags or [] 1621 1622 def as_dataframe(self) -> pd.DataFrame: 1623 return self._data 1624 1625 def column(self, column_name: str) -> DatasetColumn: 1626 return DatasetColumn(self._data_definition.get_column_type(column_name), self._data[column_name]) 1627 1628 def subdataset(self, column_name: str, label: object): 1629 return PandasDataset(self._data[self._data[column_name] == label], self._data_definition) 1630 1631 def _generate_data_definition( 
1632 self, 1633 data: pd.DataFrame, 1634 reserved_fields: List[str], 1635 service_columns: Optional[ServiceColumns] = None, 1636 ) -> DataDefinition: 1637 numerical = [] 1638 categorical = [] 1639 text = [] 1640 datetime = [] 1641 unknown = [] 1642 list_columns = [] 1643 service = None 1644 for column in data.columns: 1645 if column in reserved_fields: 1646 continue 1647 if service_columns is None and column == DEFAULT_TRACE_LINK_COLUMN: 1648 if service is None: 1649 service = ServiceColumns(trace_link=column) 1650 else: 1651 service.trace_link = column 1652 continue 1653 column_type = infer_column_type(data[column]) 1654 if column_type == ColumnType.Numerical: 1655 numerical.append(column) 1656 if column_type == ColumnType.Categorical: 1657 categorical.append(column) 1658 if column_type == ColumnType.Datetime: 1659 datetime.append(column) 1660 if column_type == ColumnType.Text: 1661 text.append(column) 1662 if column_type == ColumnType.Unknown: 1663 unknown.append(column) 1664 if column_type == ColumnType.List: 1665 list_columns.append(column) 1666 1667 return DataDefinition( 1668 timestamp=datetime[0] if len(datetime) == 1 else None, 1669 service_columns=service, 1670 numerical_columns=numerical, 1671 categorical_columns=categorical, 1672 unknown_columns=unknown, 1673 list_columns=list_columns, 1674 datetime_columns=datetime if len(datetime) != 1 else [], 1675 text_columns=text, 1676 ) 1677 1678 def stats(self) -> DatasetStats: 1679 return self._dataset_stats 1680 1681 def add_column(self, key: str, data: DatasetColumn, add_to_descriptor_list: bool = True): 1682 self._dataset_stats.column_count += 1 1683 self._dataset_stats.column_stats[key] = self._collect_stats(data.type, data.data) 1684 self._data[key] = data.data 1685 if add_to_descriptor_list and data.type == ColumnType.Numerical: 1686 self._data_definition.numerical_descriptors.append(key) 1687 if add_to_descriptor_list and data.type == ColumnType.Categorical: 1688 
self._data_definition.categorical_descriptors.append(key) 1689 1690 def add_descriptor(self, descriptor: Descriptor, options: AnyOptions = None): 1691 descriptor.validate_input(self._data_definition) 1692 new_columns = descriptor.generate_data(self, Options.from_any_options(options)) 1693 if isinstance(new_columns, DatasetColumn): 1694 new_columns = {descriptor.alias: new_columns} 1695 rename = {} 1696 for col, value in new_columns.items(): 1697 name = _determine_descriptor_column_name(col, self._data.columns.tolist()) 1698 rename[col] = name 1699 self.add_column(name, value, descriptor.add_to_descriptors_list()) 1700 if isinstance(descriptor, ColumnTest): 1701 if self._data_definition.test_descriptors is None: 1702 self._data_definition.test_descriptors = [] 1703 self._data_definition.test_descriptors.append(name) 1704 self.data_definition.special_columns.extend(descriptor.get_special_columns_info(rename)) 1705 for sub in descriptor.get_sub_descriptors(): 1706 self.add_descriptor(sub, options) 1707 1708 def _collect_stats(self, column_type: ColumnType, data: pd.Series): 1709 numerical_stats = None 1710 if column_type == ColumnType.Numerical: 1711 numerical_stats = _collect_numerical_stats(data) 1712 1713 categorical_stats = None 1714 if column_type == ColumnType.Categorical: 1715 categorical_stats = _collect_categorical_stats(data) 1716 1717 return ColumnStats( 1718 general_stats=GeneralColumnStats(missing_values=StatCountValue(0, 0)), 1719 numerical_stats=numerical_stats, 1720 categorical_stats=categorical_stats, 1721 ) 1722 1723 def save(self, uri: str): 1724 if not uri.endswith(f".{EVIDENTLY_DATASET_EXT}"): 1725 uri += f".{EVIDENTLY_DATASET_EXT}" 1726 _write_evidently_dataset(self, uri) 1727 1728 @classmethod 1729 def _can_load(cls, uri: str) -> bool: 1730 split = uri.split(".")[-1] 1731 return split in cls.SUPPORTED_FORMATS or os.path.exists(f"{uri}.{EVIDENTLY_DATASET_EXT}") 1732 1733 @classmethod 1734 def _load(cls, uri: str) -> "Dataset": 1735 ext = 
uri.split(".")[-1] 1736 if ext not in cls.SUPPORTED_FORMATS: 1737 if os.path.exists(f"{uri}.{EVIDENTLY_DATASET_EXT}"): 1738 ext = EVIDENTLY_DATASET_EXT 1739 uri = f"{uri}.{ext}" 1740 else: 1741 raise ValueError(f"Unsupported format: {ext}") 1742 # todo: load from fsspec stream instead 1743 data = cls.SUPPORTED_FORMATS[ext](uri) # type: ignore[operator] 1744 if isinstance(data, Dataset): 1745 return data 1746 return Dataset.from_pandas(data) 1747 1748 1749 def _collect_numerical_stats(data: pd.Series): 1750 infinite_count = data.groupby(np.isinf(data)).count().get(True, 0) 1751 return NumericalColumnStats( 1752 max=data.max(), 1753 min=data.min(), 1754 mean=data.mean(), 1755 std=data.std(), 1756 quantiles={ 1757 "p25": data.quantile(0.25), 1758 "p75": data.quantile(0.75), 1759 }, 1760 infinite=StatCountValue(infinite_count, infinite_count / data.count()), 1761 ) 1762 1763 1764 def _collect_categorical_stats(data: pd.Series): 1765 total_count = data.count() 1766 return CategoricalColumnStats( 1767 unique_count=data.nunique(), 1768 label_stats={ 1769 label: LabelStats(count=StatCountValue(count, count / total_count)) 1770 for label, count in data.value_counts().items() 1771 }, 1772 )