/ tests / future / descriptors / test_descriptors.py
test_descriptors.py
  1  import json
  2  from inspect import isabstract
  3  from typing import ClassVar
  4  from typing import Dict
  5  from typing import List
  6  from typing import Optional
  7  from typing import Tuple
  8  from typing import Union
  9  
 10  import pandas as pd
 11  import pytest
 12  
 13  from evidently import ColumnType
 14  from evidently._pydantic_compat import parse_obj_as
 15  from evidently.core.datasets import ColumnTest
 16  from evidently.core.datasets import Dataset
 17  from evidently.core.datasets import DatasetColumn
 18  from evidently.core.datasets import Descriptor
 19  from evidently.core.datasets import FeatureDescriptor
 20  from evidently.core.datasets import TestSummary
 21  from evidently.descriptors import ContextRelevance
 22  from evidently.descriptors import CustomColumnDescriptor
 23  from evidently.descriptors import CustomDescriptor
 24  from evidently.descriptors import LLMJudge
 25  from evidently.descriptors import TextLength
 26  from evidently.descriptors import TextMatch
 27  from evidently.descriptors.llm_judges import GenericLLMDescriptor
 28  from evidently.descriptors.llm_judges import LLMEval
 29  from evidently.legacy.options.base import Options
 30  from evidently.legacy.utils.llm.base import LLMMessage
 31  from evidently.legacy.utils.llm.wrapper import LLMResult
 32  from evidently.legacy.utils.llm.wrapper import LLMWrapper
 33  from evidently.legacy.utils.llm.wrapper import llm_provider
 34  from evidently.llm.prompts.content import TemplatePromptContent
 35  from evidently.llm.templates import BaseLLMPromptTemplate
 36  from evidently.llm.utils.blocks import PromptBlock
 37  from evidently.tests import eq
 38  from tests.conftest import load_all_subtypes
 39  
 40  from .test_feature_descriptors import MockGeneratedFeature
 41  
 42  int_data = pd.Series([1, 2, 3], name="int")
 43  str_data = pd.Series(["a", "b", "c"], name="str")
 44  
 45  
 46  @llm_provider("mock_d", None)
 47  class MockLLMWrapper(LLMWrapper):
 48      def __init__(self, model: str, options: Options):
 49          self.model = model
 50  
 51      async def complete(self, messages: List[LLMMessage], seed: Optional[int] = None) -> LLMResult[str]:
 52          return LLMResult("\n".join(m.content for m in messages), 0, 0)
 53  
 54  
 55  def custom_descr(dataset: Dataset) -> DatasetColumn:
 56      return DatasetColumn(ColumnType.Numerical, pd.Series([1] * len(dataset.as_dataframe())))
 57  
 58  
 59  def custom_col_descr(col: DatasetColumn) -> DatasetColumn:
 60      return DatasetColumn(ColumnType.Numerical, col.data)
 61  
 62  
 63  class MockTemplate(BaseLLMPromptTemplate):
 64      blocks: ClassVar = [PromptBlock.simple("{data}")]
 65  
 66      def list_output_columns(self) -> List[str]:
 67          return ["res"]
 68  
 69      def get_type(self, subcolumn: Optional[str]) -> ColumnType:
 70          return ColumnType.Text
 71  
 72      def get_main_output_column(self) -> str:
 73          return "res"
 74  
 75  
 76  class MockTemplateMulticolumn(BaseLLMPromptTemplate):
 77      blocks: ClassVar = [PromptBlock.simple("{data}"), PromptBlock.json_output(**{"res1": "", "res2": ""})]
 78  
 79      def list_output_columns(self) -> List[str]:
 80          return ["res1", "res2"]
 81  
 82      def get_type(self, subcolumn: Optional[str]) -> ColumnType:
 83          return ColumnType.Text
 84  
 85      def get_main_output_column(self) -> str:
 86          return "res1"
 87  
 88  
 89  @pytest.fixture(autouse=True)
 90  def mock_semantic_scoring(mocker):
 91      from evidently.descriptors._context_relevance import MeanAggregation
 92      from evidently.descriptors._context_relevance import semantic_similarity_scoring as sss
 93  
 94      def semantic_scoring_mock(question: DatasetColumn, context: DatasetColumn, options) -> DatasetColumn:
 95          return DatasetColumn(ColumnType.Numerical, pd.Series([1] * len(question.data)))
 96  
 97      mocker.patch(f"{sss.__module__}.{sss.__name__}", new=semantic_scoring_mock)
 98      mocker.patch(
 99          f"{sss.__module__}.METHODS",
100          new={
101              "semantic_similarity": (semantic_scoring_mock, MeanAggregation),
102          },
103      )
104  
105  
# Parametrization table for ``test_descriptors``. Each entry is a tuple of
# (descriptor under test, input Series/DataFrame, expected output columns keyed
# by column name). ``test_descriptors_tested`` compares the descriptor types
# listed here against the concrete ``Descriptor`` subclasses.
all_descriptors: List[Tuple[Descriptor, Union[pd.Series, pd.DataFrame], Dict[str, pd.Series]]] = [
    # Feature-backed descriptor. NOTE(review): the expected column key is not the
    # alias but a hash-like name suffixed with the source column — presumably
    # generated by the feature; confirm against MockGeneratedFeature.
    (
        FeatureDescriptor(feature=MockGeneratedFeature(column="str", field="a"), alias="res"),
        str_data,
        {"a1702de9f83a993ea3cb4701ca9d17f7.str": pd.Series(["aa", "ba", "ca"])},
    ),
    # LLM judge on the "mock_d" provider (MockLLMWrapper), which returns the
    # prompt text; the expected output equals the input values.
    (
        LLMJudge(provider="mock_d", model="", template=MockTemplate(), input_columns={"aaa": "data"}, alias="res"),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    # Every value in ``str_data`` is a single character, hence length 1.
    (TextLength(column_name="str", alias="res"), str_data, {"res": pd.Series([1, 1, 1])}),
    # ``custom_col_descr`` passes the column data through unchanged.
    (CustomColumnDescriptor(column_name="int", func=custom_col_descr, alias="res"), int_data, {"res": int_data}),
    # ``custom_descr`` yields a 1 for every row.
    (CustomDescriptor(func=custom_descr, alias="res"), int_data, {"res": pd.Series([1, 1, 1])}),
    # ``test_descriptors`` adds a ``ColumnTest(..., eq(1))`` on the first column
    # before this descriptor; only the first value of ``int_data`` equals 1.
    (TestSummary(alias="res"), int_data, {"res": pd.Series([1, 0, 0])}),
    # Scoring is replaced by the autouse ``mock_semantic_scoring`` fixture, whose
    # stub returns 1 per row.
    (
        ContextRelevance(alias="res", input="i", contexts="c"),
        pd.DataFrame({"i": ["input"], "c": ["context"]}),
        {"res": pd.Series([1])},
    ),
    # Raw message-list prompt: the mock provider joins message contents with a
    # newline, giving "<system>\n<user>".
    (
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=[{"role": "system", "content": "a"}, {"role": "user", "content": "{data}"}],
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["a\nx", "a\ny"])},
    ),
    # Template-based prompt with a single output column.
    (
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=TemplatePromptContent(template=MockTemplate()),
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    # Template-based prompt with two output columns: the inputs are JSON strings
    # and the expected columns are named "<alias> <subcolumn>".
    (
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=TemplatePromptContent(template=MockTemplateMulticolumn()),
        ),
        pd.DataFrame(
            {
                "aaa": [
                    json.dumps({"res1": 1, "res2": "a"}),
                    json.dumps({"res1": 2, "res2": "b"}),
                ]
            }
        ),
        {
            "res res1": pd.Series([1, 2]),
            "res res2": pd.Series(["a", "b"]),
        },
    ),
    # ``LLMEval`` with the single-column mock template; same expectation as the
    # ``LLMJudge`` case above.
    (
        LLMEval(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            template=MockTemplate(),
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    # Only the first value of ``str_data`` ("a") is in ``match_items``.
    (TextMatch(alias="res", column_name="str", match_items=["a"]), str_data, {"res": pd.Series([True, False, False])}),
]
182  
183  
def test_descriptors_tested():
    """Check that ``all_descriptors`` covers exactly the concrete direct ``Descriptor`` subclasses.

    The tested types are collected first, then ``load_all_subtypes(Descriptor)``
    is called (presumably to import modules that define subclasses — confirm in
    ``tests.conftest``) before ``Descriptor.__subclasses__()`` is read.

    The failure message reports both directions of a mismatch: subclasses with
    no test case, and tested types that are not concrete direct subclasses.
    Previously only the first direction was reported, so the second kind of
    failure produced an empty "Missing tests" list.
    """
    tested_desc_set = {type(descriptor) for descriptor, _, _ in all_descriptors}
    load_all_subtypes(Descriptor)
    all_desc_types = {subtype for subtype in Descriptor.__subclasses__() if not isabstract(subtype)}

    # Sort by class name so the failure message is stable between runs.
    untested = sorted(all_desc_types - tested_desc_set, key=lambda t: t.__name__)
    unexpected = sorted(tested_desc_set - all_desc_types, key=lambda t: t.__name__)

    problems = []
    if untested:
        # Each item is a ready-to-paste skeleton entry for ``all_descriptors``.
        problems.append(
            "Missing tests for descriptors "
            + ", ".join(f'({t.__name__}(alias="res"), pd.Series(), {{"res":pd.Series()}})' for t in untested)
        )
    if unexpected:
        problems.append(
            "Tested types that are not concrete direct subclasses of Descriptor: "
            + ", ".join(t.__name__ for t in unexpected)
        )

    assert tested_desc_set == all_desc_types, "; ".join(problems)
191  
192  
@pytest.mark.parametrize("descriptor,data,result", all_descriptors)
def test_descriptors(descriptor: Descriptor, data: Union[pd.Series, pd.DataFrame], result: Dict[str, pd.Series]):
    """Apply ``descriptor`` to ``data``, compare the produced columns with ``result``, then round-trip it via JSON."""
    # A bare Series is wrapped in a single-column frame; frames are used as given.
    if isinstance(data, pd.DataFrame):
        df = data
    else:
        df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)

    # TestSummary gets a column test added first, on the first input column.
    if isinstance(descriptor, TestSummary):
        first_column = str(df.columns[0])
        dataset.add_descriptor(ColumnTest(first_column, eq(1)))
    dataset.add_descriptor(descriptor)

    res_df = dataset.as_dataframe()
    produced_columns = set(res_df.columns)
    for col, value in result.items():
        assert col in produced_columns, f"no column {col}, cols: {res_df.columns}"
        assert res_df[col].tolist() == value.tolist()

    # Serialise to JSON and parse back through the base type; the result must equal the original.
    serialized = descriptor.json()
    restored = parse_obj_as(Descriptor, json.loads(serialized))
    assert restored == descriptor
207      descriptor2 = parse_obj_as(Descriptor, payload)
208      assert descriptor2 == descriptor