/ tests / future / test_ui / test_data_source.py
test_data_source.py
  1  from io import BytesIO
  2  from unittest.mock import AsyncMock
  3  
  4  import pandas as pd
  5  import pytest
  6  
  7  from evidently.legacy.core import new_id
  8  from evidently.ui.service.datasets.data_source import DatasetDataSource
  9  from evidently.ui.service.datasets.data_source import FileDataSource
 10  from evidently.ui.service.storage.local.base import FSSpecBlobStorage
 11  from evidently.ui.service.storage.local.dataset import DatasetFileStorage
 12  from evidently.ui.service.type_aliases import ZERO_UUID
 13  
 14  
 15  @pytest.fixture
 16  def tmp_path():
 17      """Create a temporary directory."""
 18      import tempfile
 19  
 20      with tempfile.TemporaryDirectory() as tmpdir:
 21          yield tmpdir
 22  
 23  
 24  @pytest.fixture
 25  def blob_storage(tmp_path):
 26      """Create blob storage."""
 27      return FSSpecBlobStorage(base_path=tmp_path)
 28  
 29  
 30  @pytest.fixture
 31  def dataset_file_storage(blob_storage):
 32      """Create dataset file storage."""
 33      return DatasetFileStorage(dataset_blob_storage=blob_storage)
 34  
 35  
 36  @pytest.fixture
 37  def test_user_id():
 38      """Create a test user ID."""
 39      return ZERO_UUID
 40  
 41  
 42  @pytest.fixture
 43  def test_project_id():
 44      """Create a test project ID."""
 45      return new_id()
 46  
 47  
 48  @pytest.fixture
 49  def test_dataset_id():
 50      """Create a test dataset ID."""
 51      return new_id()
 52  
 53  
 54  @pytest.fixture
 55  def sample_dataframe():
 56      """Create a sample dataframe."""
 57      return pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})
 58  
 59  
 60  @pytest.fixture
 61  def sample_parquet_data(sample_dataframe):
 62      """Create sample parquet data."""
 63      buf = BytesIO()
 64      sample_dataframe.to_parquet(buf, engine="pyarrow")
 65      return buf.getvalue()
 66  
 67  
 68  @pytest.fixture
 69  def dataset_manager_mock(
 70      dataset_file_storage, test_user_id, test_project_id, test_dataset_id, sample_parquet_data, sample_dataframe
 71  ):
 72      """Create a mock dataset manager."""
 73      from unittest.mock import MagicMock
 74  
 75      from evidently.ui.service.managers.datasets import DatasetManager
 76  
 77      manager = MagicMock(spec=DatasetManager)
 78      manager.dataset_file_storage = dataset_file_storage
 79  
 80      # Store the file in storage
 81      blob_id = dataset_file_storage.put_dataset(
 82          test_user_id, test_project_id, test_dataset_id, "test.parquet", sample_parquet_data
 83      )
 84  
 85      # Mock get_dataset_metadata to return metadata with the file source
 86      from evidently.core.datasets import Dataset
 87      from evidently.ui.service.datasets.metadata import DatasetMetadata
 88      from evidently.ui.service.datasets.metadata import DatasetOrigin
 89  
 90      df = sample_dataframe
 91      data_def = Dataset.from_pandas(df).data_definition
 92      metadata = DatasetMetadata(
 93          id=test_dataset_id,
 94          project_id=test_project_id,
 95          author_id=test_user_id,
 96          name="test_dataset",
 97          description="Test",
 98          source=FileDataSource(project_id=test_project_id, filename=blob_id),
 99          data_definition=data_def,
100          size_bytes=100,
101          row_count=len(df),
102          column_count=len(df.columns),
103          all_columns=list(df.columns),
104          is_draft=False,
105          draft_params=None,
106          origin=DatasetOrigin.file,
107          metadata={},
108          tags=[],
109      )
110  
111      async def get_dataset_metadata(user_id, dataset_id):
112          if dataset_id == test_dataset_id:
113              return metadata
114          return None
115  
116      manager.get_dataset_metadata = AsyncMock(side_effect=get_dataset_metadata)
117      return manager
118  
119  
@pytest.mark.asyncio
async def test_file_data_source_materialize(
    dataset_file_storage, test_user_id, test_project_id, test_dataset_id, sample_parquet_data, sample_dataframe
):
    """Check that ``FileDataSource.materialize()`` loads a stored parquet file as a dataframe."""
    from unittest.mock import MagicMock

    from evidently.ui.service.managers.datasets import DatasetManager
    from evidently.ui.service.managers.projects import ProjectManager

    # Put the parquet bytes into storage and remember the blob ID that comes back.
    stored_blob_id = dataset_file_storage.put_dataset(
        test_user_id,
        test_project_id,
        test_dataset_id,
        "test.parquet",
        sample_parquet_data,
    )

    # A real DatasetManager whose collaborators are mocked, except for file storage.
    manager = DatasetManager(
        project_manager=MagicMock(spec=ProjectManager),
        dataset_metadata=MagicMock(),
        dataset_file_storage=dataset_file_storage,
        tracing_storage=MagicMock(),
    )

    source = FileDataSource(project_id=test_project_id, filename=stored_blob_id)
    result = await source.materialize(manager)

    assert isinstance(result, pd.DataFrame)
    assert len(result) == 3
    assert list(result.columns) == ["col1", "col2"]
    pd.testing.assert_frame_equal(result, sample_dataframe)
154  
155  
@pytest.mark.asyncio
async def test_file_data_source_materialize_missing_file(dataset_file_storage, test_project_id):
    """Check that ``FileDataSource.materialize()`` raises ``DatasetReadError`` for a missing file."""
    from unittest.mock import MagicMock

    from evidently.ui.service.datasets.data_source import DatasetReadError
    from evidently.ui.service.managers.datasets import DatasetManager
    from evidently.ui.service.managers.projects import ProjectManager

    # Spec the project manager mock, as the successful-materialize test does,
    # so that access to attributes ProjectManager does not have fails loudly.
    project_manager = MagicMock(spec=ProjectManager)
    dataset_manager = DatasetManager(
        project_manager=project_manager,
        dataset_metadata=MagicMock(),
        dataset_file_storage=dataset_file_storage,
        tracing_storage=MagicMock(),
    )

    # Nothing was stored under this filename, so reading it must fail.
    file_source = FileDataSource(project_id=test_project_id, filename="nonexistent/file.parquet")

    with pytest.raises(DatasetReadError):
        await file_source.materialize(dataset_manager)
176  
177  
@pytest.mark.asyncio
async def test_dataset_data_source_materialize(dataset_manager_mock, test_user_id, test_dataset_id, sample_dataframe):
    """Check that ``DatasetDataSource.materialize()`` returns the dataset unchanged without filter or sort."""
    source = DatasetDataSource(user_id=test_user_id, dataset_id=test_dataset_id, filter_by=None, sort_by=None)

    result = await source.materialize(dataset_manager_mock)

    assert isinstance(result, pd.DataFrame)
    assert len(result) == 3
    assert list(result.columns) == ["col1", "col2"]
    pd.testing.assert_frame_equal(result, sample_dataframe)
194  
195  
@pytest.mark.asyncio
async def test_dataset_data_source_materialize_with_filters(dataset_manager_mock, test_user_id, test_dataset_id):
    """Check that ``DatasetDataSource.materialize()`` applies an ``EqualFilter`` on ``col1``."""
    from evidently.ui.service.datasets.filters import EqualFilter

    only_col1_equal_to_one = [EqualFilter(column="col1", value=1)]
    source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=test_dataset_id,
        filter_by=only_col1_equal_to_one,
        sort_by=None,
    )

    result = await source.materialize(dataset_manager_mock)

    assert isinstance(result, pd.DataFrame)
    assert len(result) == 1
    # Exactly one sample row has col1 == 1, and it is paired with "a".
    first_row = result.iloc[0]
    assert first_row["col1"] == 1
    assert first_row["col2"] == "a"
214  
215  
@pytest.mark.asyncio
async def test_dataset_data_source_materialize_with_sorting(dataset_manager_mock, test_user_id, test_dataset_id):
    """Check that ``DatasetDataSource.materialize()`` honours a descending sort on ``col1``."""
    from evidently.ui.service.datasets.data_source import SortBy

    descending_by_col1 = SortBy(column="col1", ascending=False)
    source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=test_dataset_id,
        filter_by=None,
        sort_by=descending_by_col1,
    )

    result = await source.materialize(dataset_manager_mock)

    assert isinstance(result, pd.DataFrame)
    assert len(result) == 3
    # Rows must come back in descending col1 order.
    assert result["col1"].tolist() == [3, 2, 1]
233  
234  
@pytest.mark.asyncio
async def test_dataset_data_source_materialize_missing_dataset(dataset_manager_mock, test_user_id):
    """Check that ``DatasetDataSource.materialize()`` raises ``DatasetNotFound`` for an unknown dataset ID."""
    from evidently.ui.service.errors import DatasetNotFound

    # ``new_id`` is already imported at module level. A freshly generated ID is
    # not the one registered in ``dataset_manager_mock``, so the metadata lookup
    # returns ``None``.
    dataset_source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=new_id(),
        filter_by=None,
        sort_by=None,
    )

    with pytest.raises(DatasetNotFound):
        await dataset_source.materialize(dataset_manager_mock)