/ src / evidently / llm / datagen / base.py
base.py
  1  from abc import ABC
  2  from abc import abstractmethod
  3  from typing import ClassVar
  4  from typing import Optional
  5  
  6  import pandas as pd
  7  from typing_extensions import TypeAlias
  8  
  9  from evidently._pydantic_compat import PrivateAttr
 10  from evidently.legacy.options.base import Options
 11  from evidently.legacy.utils.sync import async_to_sync
 12  from evidently.llm.utils.blocks import SimpleBlock
 13  from evidently.llm.utils.prompt_render import prompt_command
 14  from evidently.llm.utils.wrapper import LLMWrapper
 15  from evidently.llm.utils.wrapper import get_llm_wrapper
 16  from evidently.pydantic_utils import AutoAliasMixin
 17  from evidently.pydantic_utils import EvidentlyBaseModel
 18  
# Alias for the value dataset generators return: a pandas DataFrame.
DatasetGeneratorResult: TypeAlias = pd.DataFrame
 20  
 21  
 22  class BaseDatasetGenerator(AutoAliasMixin, EvidentlyBaseModel, ABC):
 23      """Base class for dataset generators.
 24  
 25      Dataset generators create synthetic datasets using various methods
 26      (LLM-based, rule-based, etc.). Subclasses implement `agenerate()` to
 27      produce a pandas DataFrame.
 28      """
 29  
 30      __alias_type__: ClassVar = "dataset_generator"
 31  
 32      class Config:
 33          is_base_type = True
 34          extra = "forbid"
 35  
 36      options: Options
 37      """Processing options."""
 38  
 39      @abstractmethod
 40      async def agenerate(self) -> DatasetGeneratorResult:
 41          """Generate dataset asynchronously.
 42  
 43          Returns:
 44          * `pd.DataFrame` with generated data.
 45          """
 46          raise NotImplementedError
 47  
 48      def generate(self) -> DatasetGeneratorResult:
 49          """Generate dataset synchronously.
 50  
 51          Wrapper around `agenerate()` that handles async execution.
 52  
 53          Returns:
 54          * `pd.DataFrame` with generated data.
 55          """
 56          return async_to_sync(self.agenerate())
 57  
 58  
 59  class BaseLLMDatasetGenerator(BaseDatasetGenerator, ABC):
 60      """Base class for LLM-based dataset generators.
 61  
 62      Provides LLM wrapper management for generators that use language models
 63      to create synthetic data.
 64      """
 65  
 66      provider: str
 67      """LLM provider name."""
 68      model: str
 69      """LLM model name."""
 70      _llm_wrapper: Optional[LLMWrapper] = PrivateAttr(None)
 71  
 72      def get_llm_wrapper(self, options: Options) -> LLMWrapper:
 73          """Get or create the LLM wrapper for this generator.
 74  
 75          Args:
 76          * `options`: Processing options.
 77  
 78          Returns:
 79          * `LLMWrapper` instance for the configured provider/model.
 80          """
 81          if self._llm_wrapper is None:
 82              self._llm_wrapper = get_llm_wrapper(self.provider, self.model, options)
 83          return self._llm_wrapper
 84  
 85      @property
 86      def wrapper(self):
 87          """Get the LLM wrapper using the generator's options.
 88  
 89          Returns:
 90          * `LLMWrapper` instance.
 91          """
 92          return self.get_llm_wrapper(self.options)
 93  
 94  
 95  @prompt_command("datagen_instruction")
 96  def datagen_instruction_block(number):
 97      instruction = f"""Instructions:
 98  •	Make sure the sentence are not exactly repeats of each other.
 99  •	Remain faithful to the above context.
100  •	Make sure you do not start sentence with hyphen sign.
101  •	Make sure you do not end sentence with a newline.
102  •	Avoid providing any preamble.
103  •	Avoid providing any closing statement.
104  •	Ensure the number of generated texts is exactly {number}"""
105      return SimpleBlock(value=instruction)