/ src / evidently / descriptors / _text_length.py
_text_length.py
 1  from typing import Any
 2  from typing import Dict
 3  from typing import List
 4  from typing import Optional
 5  from typing import Union
 6  
 7  import numpy as np
 8  
 9  from evidently.core.datasets import AnyDescriptorTest
10  from evidently.core.datasets import Dataset
11  from evidently.core.datasets import DatasetColumn
12  from evidently.core.datasets import Descriptor
13  from evidently.legacy.core import ColumnType
14  from evidently.legacy.options.base import Options
15  
16  
class TextLength(Descriptor):
    """Descriptor producing, for every row, the length of the value in one column."""

    column_name: str
    """Column whose values are measured."""

    def __init__(
        self,
        column_name: str,
        alias: Optional[str] = None,
        tests: Optional[List[AnyDescriptorTest]] = None,
    ):
        # The column name is stored before delegating to the base initializer.
        self.column_name: str = column_name
        # Fall back to the default alias when none (or an empty one) is given.
        resolved_alias = alias or "text_length"
        super().__init__(alias=resolved_alias, tests=tests)

    def generate_data(self, dataset: "Dataset", options: Options) -> Union[DatasetColumn, Dict[str, DatasetColumn]]:
        """Build a numerical column holding the per-row length of ``column_name``.

        ``options`` is accepted as part of the signature but is not used here.
        """
        frame = dataset.as_dataframe()
        source_values = frame[self.column_name]
        # ``_apply`` maps each cell to its length (0 for None / NaN).
        lengths = source_values.apply(_apply)
        return DatasetColumn(type=ColumnType.Numerical, data=lengths)

    def list_input_columns(self) -> Optional[List[str]]:
        """Report the single input column this descriptor reads."""
        required_columns = [self.column_name]
        return required_columns
35  
36  
def _apply(value: Any) -> int:
    """Return the length of ``value``, counting missing values as length 0.

    Args:
        value: A single cell value. ``None`` and floating-point NaN
            (Python ``float`` or any NumPy floating scalar) are treated
            as missing.

    Returns:
        0 for ``None`` or NaN, otherwise ``len(value)``.

    Raises:
        TypeError: If ``value`` is not missing and does not support ``len()``.
    """
    if value is None:
        return 0
    # np.floating also covers NumPy scalars such as np.float32 that are not
    # subclasses of the built-in float; without it a float32 NaN would reach
    # len() and raise TypeError.
    if isinstance(value, (float, np.floating)) and np.isnan(value):
        return 0
    return len(value)