_text_length.py
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Union

import numpy as np
import pandas as pd

from evidently.core.datasets import AnyDescriptorTest
from evidently.core.datasets import Dataset
from evidently.core.datasets import DatasetColumn
from evidently.core.datasets import Descriptor
from evidently.legacy.core import ColumnType
from evidently.legacy.options.base import Options


class TextLength(Descriptor):
    """Compute the length of text in each row of a column.

    The produced column is numerical. Missing cells (``None``, float NaN,
    ``pd.NA``) are counted as length 0.
    """

    column_name: str
    """Name of the text column to measure."""

    def __init__(
        self,
        column_name: str,
        alias: Optional[str] = None,
        tests: Optional[List[AnyDescriptorTest]] = None,
    ):
        """Create the descriptor.

        Args:
            column_name: Name of the column whose values are measured.
            alias: Name passed to the base descriptor; falls back to
                ``"text_length"`` when not given (or empty).
            tests: Optional descriptor tests forwarded to the base class.
        """
        self.column_name: str = column_name
        super().__init__(alias=alias or "text_length", tests=tests)

    def generate_data(self, dataset: "Dataset", options: Options) -> Union[DatasetColumn, Dict[str, DatasetColumn]]:
        """Compute text length for each row.

        Args:
            dataset: Dataset providing the dataframe that holds ``column_name``.
            options: Not used by this descriptor.

        Returns:
            A numerical ``DatasetColumn`` holding one length per row.
        """
        source_column = dataset.as_dataframe()[self.column_name]
        column_items_lengths = source_column.apply(_apply)
        return DatasetColumn(type=ColumnType.Numerical, data=column_items_lengths)

    def list_input_columns(self) -> Optional[List[str]]:
        """Return list of required input column names."""
        return [self.column_name]


def _apply(value: Any) -> int:
    """Compute the length of ``value``, returning 0 for missing values.

    ``None``, float NaN and ``pd.NA`` are treated as missing. ``pd.NA`` is what
    pandas' nullable ``string`` dtype yields for empty cells; it has no
    ``len()``, so it must be caught before the ``len`` call.

    Raises:
        TypeError: If ``value`` is not missing and does not support ``len()``.
    """
    if value is None or value is pd.NA:
        return 0
    if isinstance(value, float) and np.isnan(value):
        return 0
    return len(value)