# test_descriptors.py
"""Tests for evidently descriptors.

Every concrete (non-abstract) ``Descriptor`` subclass must appear in the
``all_descriptors`` registry below with a sample input and the exact output
columns it is expected to produce; ``test_descriptors_tested`` enforces this,
and ``test_descriptors`` runs each entry end-to-end and round-trips the
descriptor through JSON serialization.
"""

import json
from inspect import isabstract
from typing import ClassVar
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union

import pandas as pd
import pytest

from evidently import ColumnType
from evidently._pydantic_compat import parse_obj_as
from evidently.core.datasets import ColumnTest
from evidently.core.datasets import Dataset
from evidently.core.datasets import DatasetColumn
from evidently.core.datasets import Descriptor
from evidently.core.datasets import FeatureDescriptor
from evidently.core.datasets import TestSummary
from evidently.descriptors import ContextRelevance
from evidently.descriptors import CustomColumnDescriptor
from evidently.descriptors import CustomDescriptor
from evidently.descriptors import LLMJudge
from evidently.descriptors import TextLength
from evidently.descriptors import TextMatch
from evidently.descriptors.llm_judges import GenericLLMDescriptor
from evidently.descriptors.llm_judges import LLMEval
from evidently.legacy.options.base import Options
from evidently.legacy.utils.llm.base import LLMMessage
from evidently.legacy.utils.llm.wrapper import LLMResult
from evidently.legacy.utils.llm.wrapper import LLMWrapper
from evidently.legacy.utils.llm.wrapper import llm_provider
from evidently.llm.prompts.content import TemplatePromptContent
from evidently.llm.templates import BaseLLMPromptTemplate
from evidently.llm.utils.blocks import PromptBlock
from evidently.tests import eq
from tests.conftest import load_all_subtypes

from .test_feature_descriptors import MockGeneratedFeature

# Shared sample columns reused across registry entries below.
int_data = pd.Series([1, 2, 3], name="int")
str_data = pd.Series(["a", "b", "c"], name="str")


@llm_provider("mock_d", None)
class MockLLMWrapper(LLMWrapper):
    """Fake LLM registered under provider name "mock_d".

    ``complete`` simply echoes the prompt: it returns all message contents
    joined with newlines, so tests can predict the "LLM" output exactly.
    """

    def __init__(self, model: str, options: Options):
        self.model = model

    async def complete(self, messages: List[LLMMessage], seed: Optional[int] = None) -> LLMResult[str]:
        return LLMResult("\n".join(m.content for m in messages), 0, 0)


def custom_descr(dataset: Dataset) -> DatasetColumn:
    # Dataset-level custom descriptor: constant 1 for every row.
    return DatasetColumn(ColumnType.Numerical, pd.Series([1] * len(dataset.as_dataframe())))


def custom_col_descr(col: DatasetColumn) -> DatasetColumn:
    # Column-level custom descriptor: passes the column data through unchanged.
    return DatasetColumn(ColumnType.Numerical, col.data)


class MockTemplate(BaseLLMPromptTemplate):
    """Minimal single-output prompt template: renders "{data}" into one "res" column."""

    blocks: ClassVar = [PromptBlock.simple("{data}")]

    def list_output_columns(self) -> List[str]:
        return ["res"]

    def get_type(self, subcolumn: Optional[str]) -> ColumnType:
        return ColumnType.Text

    def get_main_output_column(self) -> str:
        return "res"


class MockTemplateMulticolumn(BaseLLMPromptTemplate):
    """Prompt template with two JSON output fields ("res1", "res2"); "res1" is the main one."""

    blocks: ClassVar = [PromptBlock.simple("{data}"), PromptBlock.json_output(**{"res1": "", "res2": ""})]

    def list_output_columns(self) -> List[str]:
        return ["res1", "res2"]

    def get_type(self, subcolumn: Optional[str]) -> ColumnType:
        return ColumnType.Text

    def get_main_output_column(self) -> str:
        return "res1"


@pytest.fixture(autouse=True)
def mock_semantic_scoring(mocker):
    """Autouse fixture: replace semantic-similarity scoring with a constant-1 stub.

    Patches both the scoring function itself and the METHODS dispatch table in
    ``evidently.descriptors._context_relevance`` so ContextRelevance entries run
    without a real embedding model.
    """
    from evidently.descriptors._context_relevance import MeanAggregation
    from evidently.descriptors._context_relevance import semantic_similarity_scoring as sss

    def semantic_scoring_mock(question: DatasetColumn, context: DatasetColumn, options) -> DatasetColumn:
        return DatasetColumn(ColumnType.Numerical, pd.Series([1] * len(question.data)))

    mocker.patch(f"{sss.__module__}.{sss.__name__}", new=semantic_scoring_mock)
    mocker.patch(
        f"{sss.__module__}.METHODS",
        new={
            "semantic_similarity": (semantic_scoring_mock, MeanAggregation),
        },
    )


# Registry of (descriptor instance, input data, expected output columns).
# ``test_descriptors_tested`` requires one entry per concrete Descriptor subclass.
all_descriptors: List[Tuple[Descriptor, Union[pd.Series, pd.DataFrame], Dict[str, pd.Series]]] = [
    (
        FeatureDescriptor(feature=MockGeneratedFeature(column="str", field="a"), alias="res"),
        str_data,
        # NOTE(review): the key looks like a generated feature id prefixing the
        # column name — presumably a fingerprint hash; verify against
        # MockGeneratedFeature if it ever changes.
        {"a1702de9f83a993ea3cb4701ca9d17f7.str": pd.Series(["aa", "ba", "ca"])},
    ),
    (
        # MockLLMWrapper echoes the single user message "{data}" rendered per row.
        LLMJudge(provider="mock_d", model="", template=MockTemplate(), input_columns={"aaa": "data"}, alias="res"),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    (TextLength(column_name="str", alias="res"), str_data, {"res": pd.Series([1, 1, 1])}),
    (CustomColumnDescriptor(column_name="int", func=custom_col_descr, alias="res"), int_data, {"res": int_data}),
    (CustomDescriptor(func=custom_descr, alias="res"), int_data, {"res": pd.Series([1, 1, 1])}),
    # TestSummary summarizes column tests; test_descriptors adds an eq(1) test first,
    # which passes only for the first row of int_data.
    (TestSummary(alias="res"), int_data, {"res": pd.Series([1, 0, 0])}),
    (
        # Semantic scoring is stubbed to 1 by the mock_semantic_scoring fixture.
        ContextRelevance(alias="res", input="i", contexts="c"),
        pd.DataFrame({"i": ["input"], "c": ["context"]}),
        {"res": pd.Series([1])},
    ),
    (
        # Raw message-list prompt: the echo wrapper joins system ("a") and user
        # ("{data}" rendered) messages with a newline.
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=[{"role": "system", "content": "a"}, {"role": "user", "content": "{data}"}],
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["a\nx", "a\ny"])},
    ),
    (
        # Template-based prompt: single output column, echoed verbatim.
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=TemplatePromptContent(template=MockTemplate()),
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    (
        # Multicolumn template: input rows are JSON payloads that the echo wrapper
        # returns, and which get parsed into "res res1" / "res res2" subcolumns.
        GenericLLMDescriptor(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            prompt=TemplatePromptContent(template=MockTemplateMulticolumn()),
        ),
        pd.DataFrame(
            {
                "aaa": [
                    json.dumps({"res1": 1, "res2": "a"}),
                    json.dumps({"res1": 2, "res2": "b"}),
                ]
            }
        ),
        {
            "res res1": pd.Series([1, 2]),
            "res res2": pd.Series(["a", "b"]),
        },
    ),
    (
        LLMEval(
            alias="res",
            provider="mock_d",
            model="",
            input_columns={"aaa": "data"},
            template=MockTemplate(),
        ),
        pd.DataFrame({"aaa": ["x", "y"]}),
        {"res": pd.Series(["x", "y"])},
    ),
    (TextMatch(alias="res", column_name="str", match_items=["a"]), str_data, {"res": pd.Series([True, False, False])}),
]


def test_descriptors_tested():
    """Fail (with a ready-to-paste stub) if any concrete Descriptor has no registry entry."""
    tested_desc_set = {type(p) for p, _, _ in all_descriptors}
    # Import all descriptor modules so __subclasses__() sees every subtype.
    load_all_subtypes(Descriptor)
    all_desc_types = set(s for s in Descriptor.__subclasses__() if not isabstract(s))
    assert tested_desc_set == all_desc_types, "Missing tests for descriptors " + ", ".join(
        f'({t.__name__}(alias="res"), pd.Series(), {{"res":pd.Series()}})' for t in all_desc_types - tested_desc_set
    )


@pytest.mark.parametrize("descriptor,data,result", all_descriptors)
def test_descriptors(descriptor: Descriptor, data: Union[pd.Series, pd.DataFrame], result: Dict[str, pd.Series]):
    """Run each descriptor on its sample data, check output columns, and round-trip via JSON."""
    df = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)
    if isinstance(descriptor, TestSummary):
        # TestSummary needs at least one column test to summarize.
        dataset.add_descriptor(ColumnTest(str(df.columns[0]), eq(1)))
    dataset.add_descriptor(descriptor)

    res_df = dataset.as_dataframe()
    for col, value in result.items():
        assert col in set(res_df.columns), f"no column {col}, cols: {res_df.columns}"
        assert res_df[col].tolist() == value.tolist()

    # Serialization round-trip: descriptor must survive json() -> parse_obj_as.
    payload = json.loads(descriptor.json())
    descriptor2 = parse_obj_as(Descriptor, payload)
    assert descriptor2 == descriptor