# base.py
from abc import ABC
from abc import abstractmethod
from typing import ClassVar
from typing import Optional

import pandas as pd
from typing_extensions import TypeAlias

from evidently._pydantic_compat import PrivateAttr
from evidently.legacy.options.base import Options
from evidently.legacy.utils.sync import async_to_sync
from evidently.llm.utils.blocks import SimpleBlock
from evidently.llm.utils.prompt_render import prompt_command
from evidently.llm.utils.wrapper import LLMWrapper
from evidently.llm.utils.wrapper import get_llm_wrapper
from evidently.pydantic_utils import AutoAliasMixin
from evidently.pydantic_utils import EvidentlyBaseModel

# Result type shared by every generator in this module: a pandas DataFrame.
DatasetGeneratorResult: TypeAlias = pd.DataFrame


class BaseDatasetGenerator(AutoAliasMixin, EvidentlyBaseModel, ABC):
    """Abstract root of all dataset generators.

    A dataset generator builds a synthetic dataset by some method
    (LLM-based, rule-based, etc.). Concrete subclasses supply the
    asynchronous `agenerate()`; the synchronous `generate()` is derived
    from it and returns the same pandas DataFrame.
    """

    __alias_type__: ClassVar = "dataset_generator"

    class Config:
        is_base_type = True
        extra = "forbid"

    options: Options
    """Processing options."""

    @abstractmethod
    async def agenerate(self) -> DatasetGeneratorResult:
        """Produce the dataset asynchronously.

        Must be overridden by subclasses.

        Returns:
        * `pd.DataFrame` holding the generated data.
        """
        raise NotImplementedError

    def generate(self) -> DatasetGeneratorResult:
        """Produce the dataset synchronously.

        Creates the `agenerate()` coroutine and hands it to
        `async_to_sync`, returning whatever that call returns.

        Returns:
        * `pd.DataFrame` holding the generated data.
        """
        pending = self.agenerate()
        return async_to_sync(pending)


class BaseLLMDatasetGenerator(BaseDatasetGenerator, ABC):
    """Abstract root of dataset generators backed by a language model.

    Adds the provider/model configuration and a lazily created, cached
    LLM wrapper on top of `BaseDatasetGenerator`.
    """

    provider: str
    """LLM provider name."""
    model: str
    """LLM model name."""
    _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)

    def get_llm_wrapper(self, options: Options) -> LLMWrapper:
        """Return the cached LLM wrapper, creating it on first use.

        The wrapper is built once from `provider`, `model` and the given
        `options`; later calls return the cached instance regardless of
        the `options` passed to them.

        Args:
        * `options`: Processing options used when the wrapper is created.

        Returns:
        * `LLMWrapper` instance for the configured provider/model.
        """
        cached = self._llm_wrapper
        if cached is not None:
            return cached
        # Inside the method body this name resolves to the module-level
        # factory function, not to this method.
        self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
        return self._llm_wrapper

    @property
    def wrapper(self) -> LLMWrapper:
        """LLM wrapper obtained with this generator's own `options`.

        Returns:
        * `LLMWrapper` instance.
        """
        return self.get_llm_wrapper(self.options)


@prompt_command("datagen_instruction")
def datagen_instruction_block(number):
    """Build the standard instruction block for text generation prompts.

    Registered through `prompt_command` under the name
    "datagen_instruction".

    Args:
    * `number`: Count of texts the prompt asks for; interpolated into
      the last instruction line.

    Returns:
    * `SimpleBlock` whose value is the instruction text.
    """
    prompt_text = f"""Instructions:
• Make sure the sentence are not exactly repeats of each other.
• Remain faithful to the above context.
• Make sure you do not start sentence with hyphen sign.
• Make sure you do not end sentence with a newline.
• Avoid providing any preamble.
• Avoid providing any closing statement.
• Ensure the number of generated texts is exactly {number}"""
    return SimpleBlock(value=prompt_text)