test_data_source.py
import tempfile
from io import BytesIO
from unittest.mock import AsyncMock
from unittest.mock import MagicMock

import pandas as pd
import pytest

from evidently.legacy.core import new_id
from evidently.ui.service.datasets.data_source import DatasetDataSource
from evidently.ui.service.datasets.data_source import FileDataSource
from evidently.ui.service.storage.local.base import FSSpecBlobStorage
from evidently.ui.service.storage.local.dataset import DatasetFileStorage
from evidently.ui.service.type_aliases import ZERO_UUID


def _make_dataset_manager(dataset_file_storage):
    """Build a real ``DatasetManager`` on top of ``dataset_file_storage``.

    Every collaborator other than the file storage is replaced by a mock, so
    only the file-reading path is exercised with real code.
    """
    from evidently.ui.service.managers.datasets import DatasetManager
    from evidently.ui.service.managers.projects import ProjectManager

    return DatasetManager(
        project_manager=MagicMock(spec=ProjectManager),
        dataset_metadata=MagicMock(),
        dataset_file_storage=dataset_file_storage,
        tracing_storage=MagicMock(),
    )


@pytest.fixture
def tmp_path():
    """Yield the path of a temporary directory that is removed after the test.

    NOTE: within this module this fixture shadows pytest's built-in
    ``tmp_path`` fixture and yields a ``str`` instead of a ``pathlib.Path``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        yield tmpdir


@pytest.fixture
def blob_storage(tmp_path):
    """Create blob storage rooted at the temporary directory."""
    return FSSpecBlobStorage(base_path=tmp_path)


@pytest.fixture
def dataset_file_storage(blob_storage):
    """Create dataset file storage on top of the blob storage."""
    return DatasetFileStorage(dataset_blob_storage=blob_storage)


@pytest.fixture
def test_user_id():
    """Return the user ID used by the tests (``ZERO_UUID``)."""
    return ZERO_UUID


@pytest.fixture
def test_project_id():
    """Return a freshly generated project ID."""
    return new_id()


@pytest.fixture
def test_dataset_id():
    """Return a freshly generated dataset ID."""
    return new_id()


@pytest.fixture
def sample_dataframe():
    """Create a three-row sample dataframe with columns ``col1`` and ``col2``."""
    return pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})


@pytest.fixture
def sample_parquet_data(sample_dataframe):
    """Return the sample dataframe serialized to parquet bytes."""
    buf = BytesIO()
    sample_dataframe.to_parquet(buf, engine="pyarrow")
    return buf.getvalue()


@pytest.fixture
def dataset_manager_mock(
    dataset_file_storage, test_user_id, test_project_id, test_dataset_id, sample_parquet_data, sample_dataframe
):
    """Create a mock dataset manager backed by real file storage.

    The sample parquet file is written to ``dataset_file_storage`` and the
    mock's ``get_dataset_metadata`` returns metadata pointing at that file for
    ``test_dataset_id``; for any other dataset ID it returns ``None``.
    """
    from evidently.core.datasets import Dataset
    from evidently.ui.service.datasets.metadata import DatasetMetadata
    from evidently.ui.service.datasets.metadata import DatasetOrigin
    from evidently.ui.service.managers.datasets import DatasetManager

    manager = MagicMock(spec=DatasetManager)
    manager.dataset_file_storage = dataset_file_storage

    # Store the file in storage
    blob_id = dataset_file_storage.put_dataset(
        test_user_id, test_project_id, test_dataset_id, "test.parquet", sample_parquet_data
    )

    # Mock get_dataset_metadata to return metadata with the file source
    df = sample_dataframe
    data_def = Dataset.from_pandas(df).data_definition
    metadata = DatasetMetadata(
        id=test_dataset_id,
        project_id=test_project_id,
        author_id=test_user_id,
        name="test_dataset",
        description="Test",
        source=FileDataSource(project_id=test_project_id, filename=blob_id),
        data_definition=data_def,
        size_bytes=100,
        row_count=len(df),
        column_count=len(df.columns),
        all_columns=list(df.columns),
        is_draft=False,
        draft_params=None,
        origin=DatasetOrigin.file,
        metadata={},
        tags=[],
    )

    async def get_dataset_metadata(user_id, dataset_id):
        # Only the dataset stored above is known to the mock.
        if dataset_id == test_dataset_id:
            return metadata
        return None

    manager.get_dataset_metadata = AsyncMock(side_effect=get_dataset_metadata)
    return manager


@pytest.mark.asyncio
async def test_file_data_source_materialize(
    dataset_file_storage, test_user_id, test_project_id, test_dataset_id, sample_parquet_data, sample_dataframe
):
    """Test FileDataSource.materialize() returns the stored dataframe."""
    # Store the file
    blob_id = dataset_file_storage.put_dataset(
        test_user_id, test_project_id, test_dataset_id, "test.parquet", sample_parquet_data
    )

    # Real manager with mocked collaborators around the real file storage
    dataset_manager = _make_dataset_manager(dataset_file_storage)

    # Create FileDataSource pointing to the stored file
    file_source = FileDataSource(project_id=test_project_id, filename=blob_id)

    # Materialize the data source
    df = await file_source.materialize(dataset_manager)

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 3
    assert list(df.columns) == ["col1", "col2"]
    pd.testing.assert_frame_equal(df, sample_dataframe)


@pytest.mark.asyncio
async def test_file_data_source_materialize_missing_file(dataset_file_storage, test_project_id):
    """Test FileDataSource.materialize() raises DatasetReadError for a missing file."""
    from evidently.ui.service.datasets.data_source import DatasetReadError

    dataset_manager = _make_dataset_manager(dataset_file_storage)

    file_source = FileDataSource(project_id=test_project_id, filename="nonexistent/file.parquet")

    with pytest.raises(DatasetReadError):
        await file_source.materialize(dataset_manager)


@pytest.mark.asyncio
async def test_dataset_data_source_materialize(dataset_manager_mock, test_user_id, test_dataset_id, sample_dataframe):
    """Test DatasetDataSource.materialize() without filters or sorting."""
    dataset_source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=test_dataset_id,
        filter_by=None,
        sort_by=None,
    )

    df = await dataset_source.materialize(dataset_manager_mock)

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 3
    assert list(df.columns) == ["col1", "col2"]
    pd.testing.assert_frame_equal(df, sample_dataframe)


@pytest.mark.asyncio
async def test_dataset_data_source_materialize_with_filters(dataset_manager_mock, test_user_id, test_dataset_id):
    """Test DatasetDataSource.materialize() with filters."""
    from evidently.ui.service.datasets.filters import EqualFilter

    dataset_source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=test_dataset_id,
        filter_by=[EqualFilter(column="col1", value=1)],
        sort_by=None,
    )

    df = await dataset_source.materialize(dataset_manager_mock)

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 1
    assert df.iloc[0]["col1"] == 1
    assert df.iloc[0]["col2"] == "a"


@pytest.mark.asyncio
async def test_dataset_data_source_materialize_with_sorting(dataset_manager_mock, test_user_id, test_dataset_id):
    """Test DatasetDataSource.materialize() with sorting."""
    from evidently.ui.service.datasets.data_source import SortBy

    dataset_source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=test_dataset_id,
        filter_by=None,
        sort_by=SortBy(column="col1", ascending=False),
    )

    df = await dataset_source.materialize(dataset_manager_mock)

    assert isinstance(df, pd.DataFrame)
    assert len(df) == 3
    assert list(df["col1"].values) == [3, 2, 1]  # Sorted descending


@pytest.mark.asyncio
async def test_dataset_data_source_materialize_missing_dataset(dataset_manager_mock, test_user_id):
    """Test DatasetDataSource.materialize() raises DatasetNotFound for an unknown dataset ID."""
    from evidently.ui.service.errors import DatasetNotFound

    dataset_source = DatasetDataSource(
        user_id=test_user_id,
        dataset_id=new_id(),  # fresh ID that the mocked manager does not know
        filter_by=None,
        sort_by=None,
    )

    with pytest.raises(DatasetNotFound):
        await dataset_source.materialize(dataset_manager_mock)