# test_dataset_manager.py
"""Tests for ``DatasetManager`` wired to an in-memory metadata store and a temp-dir blob store."""

import datetime
import tempfile
from collections import defaultdict
from io import BytesIO
from typing import Dict
from unittest.mock import AsyncMock
from unittest.mock import MagicMock

import pandas as pd
import pytest
from litestar.datastructures import UploadFile

from evidently.core.datasets import Dataset
from evidently.legacy.core import new_id
from evidently.ui.service.datasets.metadata import DatasetMetadata
from evidently.ui.service.datasets.metadata import DatasetMetadataFull
from evidently.ui.service.datasets.metadata import DatasetMetadataStorage
from evidently.ui.service.datasets.metadata import DatasetOrigin
from evidently.ui.service.managers.datasets import DatasetManager
from evidently.ui.service.managers.projects import ProjectManager
from evidently.ui.service.storage.local.base import FSSpecBlobStorage
from evidently.ui.service.storage.local.dataset import DatasetFileStorage
from evidently.ui.service.type_aliases import ZERO_UUID
from evidently.ui.service.type_aliases import DatasetID
from evidently.ui.service.type_aliases import ProjectID
from evidently.ui.service.type_aliases import UserID


@pytest.fixture
def tmp_path():
    """Yield the path of a temporary directory that is removed after the test.

    NOTE(review): this has the same name as pytest's built-in ``tmp_path``
    fixture and so takes its place inside this module. It yields the path as a
    ``str`` (what ``tempfile.TemporaryDirectory`` hands out); it is kept because
    ``blob_storage`` is built from that value.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        yield tmpdir


@pytest.fixture
def blob_storage(tmp_path):
    """Create a blob storage rooted at the temporary directory."""
    return FSSpecBlobStorage(base_path=tmp_path)


@pytest.fixture
def dataset_file_storage(blob_storage):
    """Create a dataset file storage on top of the blob storage."""
    return DatasetFileStorage(dataset_blob_storage=blob_storage)


@pytest.fixture
def dataset_metadata_storage():
    """Create an in-memory ``DatasetMetadataStorage`` implementation for testing."""

    class InMemoryDatasetMetadataStorage(DatasetMetadataStorage):
        """Dict-backed metadata storage; nothing is persisted."""

        def __init__(self):
            # Live (non-deleted) datasets keyed by id.
            self._datasets: Dict[DatasetID, DatasetMetadata] = {}
            # Ids ever added to each project, in insertion order. Deleting a
            # dataset does not prune this index, so readers must check
            # ``_datasets`` for liveness.
            self._project_datasets: Dict[ProjectID, list[DatasetID]] = defaultdict(list)

        @staticmethod
        def _to_full(ds: DatasetMetadata) -> DatasetMetadataFull:
            """Expand stored metadata into ``DatasetMetadataFull`` with stub audit fields.

            Timestamps are taken at call time and the author name is fixed,
            because the in-memory store does not track them.
            """
            return DatasetMetadataFull(
                id=ds.id,
                project_id=ds.project_id,
                author_id=ds.author_id,
                name=ds.name,
                description=ds.description,
                data_definition=ds.data_definition,
                source=ds.source,
                size_bytes=ds.size_bytes,
                row_count=ds.row_count,
                column_count=ds.column_count,
                all_columns=ds.all_columns,
                is_draft=ds.is_draft,
                draft_params=ds.draft_params,
                origin=ds.origin,
                metadata=ds.metadata,
                tags=ds.tags,
                tracing_params=ds.tracing_params,
                created_at=datetime.datetime.now(),
                updated_at=datetime.datetime.now(),
                author_name="Test User",
            )

        async def add_dataset_metadata(
            self, user_id: UserID, project_id: ProjectID, dataset: DatasetMetadata
        ) -> DatasetID:
            """Store ``dataset`` under ``project_id`` and return its id (``user_id`` is unused)."""
            self._datasets[dataset.id] = dataset
            self._project_datasets[project_id].append(dataset.id)
            return dataset.id

        async def update_dataset_metadata(self, dataset_id: DatasetID, new_metadata: DatasetMetadata):
            """Copy the editable fields of ``new_metadata`` onto the stored entry.

            Unknown ids are ignored.
            """
            stored = self._datasets.get(dataset_id)
            if stored is None:
                return
            stored.name = new_metadata.name
            stored.description = new_metadata.description
            stored.data_definition = new_metadata.data_definition
            stored.metadata = new_metadata.metadata
            stored.tags = new_metadata.tags

        async def update_dataset_tracing_metadata(self, dataset_id: DatasetID, tracing_metadata):
            """No-op: tracing metadata is not modelled by this in-memory store."""

        async def get_dataset_metadata(self, dataset_id: DatasetID):
            """Return the full metadata for ``dataset_id``, or ``None`` if it is not stored."""
            ds = self._datasets.get(dataset_id)
            return None if ds is None else self._to_full(ds)

        async def mark_dataset_deleted(self, dataset_id: DatasetID):
            """Remove the dataset from the store; unknown ids are ignored."""
            self._datasets.pop(dataset_id, None)

        async def delete_dataset_metadata(self, dataset_id: DatasetID):
            """Remove the dataset from the store; unknown ids are ignored."""
            self._datasets.pop(dataset_id, None)

        async def list_datasets_metadata(self, project_id: ProjectID, limit, origin, draft) -> list:
            """List the project's live datasets, optionally filtered and truncated.

            A falsy ``origin`` or a ``draft`` of ``None`` disables the respective
            filter; a falsy ``limit`` returns every match.
            """
            result = []
            for ds_id in self._project_datasets.get(project_id, []):
                ds = self._datasets.get(ds_id)
                if ds is None:
                    # The id is still in the project index but the dataset was deleted.
                    continue
                if origin and ds.origin not in origin:
                    continue
                if draft is not None and ds.is_draft != draft:
                    continue
                result.append(self._to_full(ds))
            return result[:limit] if limit else result

        async def datasets_count(self, project_id: ProjectID) -> int:
            """Return the number of live datasets in the project.

            The project index keeps ids of deleted datasets, so only ids still
            present in ``_datasets`` are counted; this keeps the count in line
            with ``list_datasets_metadata``.
            """
            return sum(1 for ds_id in self._project_datasets.get(project_id, []) if ds_id in self._datasets)

    return InMemoryDatasetMetadataStorage()


@pytest.fixture
def mock_project_manager():
    """Create a mock project manager whose ``get_project`` resolves to ``None``."""
    pm = MagicMock(spec=ProjectManager)
    pm.get_project = AsyncMock(return_value=None)
    return pm


@pytest.fixture
def dataset_manager(dataset_metadata_storage, dataset_file_storage, mock_project_manager):
    """Create the dataset manager under test from the fixtures above."""
    return DatasetManager(
        project_manager=mock_project_manager,
        dataset_metadata=dataset_metadata_storage,
        dataset_file_storage=dataset_file_storage,
        tracing_storage=MagicMock(),
    )


@pytest.fixture
def sample_dataframe():
    """Create a sample dataframe with three rows and two columns."""
    return pd.DataFrame({"col1": [1, 2, 3], "col2": ["a", "b", "c"]})


@pytest.fixture
def sample_upload_file(sample_dataframe):
    """Create an upload file holding the sample dataframe serialised as CSV."""
    buf = BytesIO()
    sample_dataframe.to_csv(buf, index=False)
    buf.seek(0)
    return UploadFile(
        content_type="text/csv",
        filename="test.csv",
        file_data=buf.getvalue(),
    )


@pytest.fixture
def test_project_id():
    """Create a fresh project ID."""
    return new_id()


@pytest.fixture
def test_user_id():
    """Return the user ID used by the tests."""
    return ZERO_UUID


def _register_project(mock_project_manager, project_id):
    """Make the mocked project manager resolve to a project with ``project_id``."""
    # Imported lazily, as the original tests did.
    from evidently.ui.service.base import Project

    mock_project_manager.get_project = AsyncMock(return_value=Project(id=project_id, name="Test"))


async def _upload_test_dataset(dataset_manager, user_id, project_id, data, data_definition):
    """Upload ``data`` as "test_dataset" with file origin, empty metadata and no tags."""
    return await dataset_manager.upload_dataset(
        user_id,
        project_id,
        "test_dataset",
        "Test description",
        data,
        data_definition,
        DatasetOrigin.file,
        {},
        [],
    )


@pytest.mark.asyncio
async def test_upload_dataset_from_dataframe(
    dataset_manager, dataset_metadata_storage, sample_dataframe, test_user_id, test_project_id, mock_project_manager
):
    """Test uploading a dataset from a dataframe."""
    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    dataset = await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    assert dataset.name == "test_dataset"
    assert dataset.description == "Test description"
    assert dataset.row_count == len(sample_dataframe)


@pytest.mark.asyncio
async def test_upload_dataset_from_file(
    dataset_manager,
    dataset_metadata_storage,
    sample_upload_file,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test uploading a dataset from an upload file."""
    _register_project(mock_project_manager, test_project_id)
    # The data definition is derived from an empty frame here, not from the file contents.
    data_def = Dataset.from_pandas(pd.DataFrame()).data_definition
    dataset = await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_upload_file, data_def)
    assert dataset.name == "test_dataset"
    assert dataset.row_count == 3


@pytest.mark.asyncio
async def test_get_dataset_metadata_not_found(dataset_manager, test_user_id):
    """Test getting non-existent dataset metadata."""
    from evidently.ui.service.errors import DatasetNotFound

    with pytest.raises(DatasetNotFound):
        await dataset_manager.get_dataset_metadata(test_user_id, new_id())


@pytest.mark.asyncio
async def test_update_dataset(
    dataset_manager,
    dataset_metadata_storage,
    sample_dataframe,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test updating a dataset's name and description."""
    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    dataset = await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    await dataset_manager.update_dataset(
        test_user_id, dataset.id, "updated_name", "updated_description", None, None, None
    )
    updated = await dataset_manager.get_dataset_metadata(test_user_id, dataset.id)
    assert updated.name == "updated_name"
    assert updated.description == "updated_description"


@pytest.mark.asyncio
async def test_delete_dataset(
    dataset_manager,
    dataset_metadata_storage,
    sample_dataframe,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test that a deleted dataset can no longer be fetched."""
    from evidently.ui.service.errors import DatasetNotFound

    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    dataset = await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    await dataset_manager.delete_dataset(test_user_id, dataset.id)

    with pytest.raises(DatasetNotFound):
        await dataset_manager.get_dataset_metadata(test_user_id, dataset.id)


@pytest.mark.asyncio
async def test_list_datasets(
    dataset_manager,
    dataset_metadata_storage,
    sample_dataframe,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test listing datasets."""
    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    datasets = await dataset_manager.list_datasets(test_user_id, test_project_id, None, None, None)
    assert len(datasets) == 1


@pytest.mark.asyncio
async def test_get_dataset_pagination(
    dataset_manager,
    dataset_metadata_storage,
    sample_dataframe,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test getting paginated dataset data."""
    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    dataset = await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    pagination = await dataset_manager.get_dataset_pagination(test_user_id, dataset.id, 2, 1, None, None)
    # Three rows with a page size of two: a full first page and two pages overall.
    assert pagination.page_size == 2
    assert pagination.current_page == 1
    assert len(pagination.items) == 2
    assert pagination.total_pages == 2


@pytest.mark.asyncio
async def test_datasets_count(
    dataset_manager,
    dataset_metadata_storage,
    sample_dataframe,
    test_user_id,
    test_project_id,
    mock_project_manager,
):
    """Test counting datasets."""
    _register_project(mock_project_manager, test_project_id)
    data_def = Dataset.from_pandas(sample_dataframe).data_definition
    await _upload_test_dataset(dataset_manager, test_user_id, test_project_id, sample_dataframe, data_def)
    count = await dataset_manager.datasets_count(test_user_id, test_project_id)
    assert count == 1