# test_document_cache.py
"""
Tests for document cache utilities in gateway/platforms/base.py.

Covers: get_document_cache_dir, cache_document_from_bytes,
cleanup_document_cache, SUPPORTED_DOCUMENT_TYPES.
"""

import os
import time
from pathlib import Path

import pytest

from gateway.platforms.base import (
    SUPPORTED_DOCUMENT_TYPES,
    cache_document_from_bytes,
    cleanup_document_cache,
    get_document_cache_dir,
)

# ---------------------------------------------------------------------------
# Fixture: redirect DOCUMENT_CACHE_DIR to a temp directory for every test
# ---------------------------------------------------------------------------

@pytest.fixture(autouse=True)
def _redirect_cache(tmp_path, monkeypatch):
    """Point the module-level DOCUMENT_CACHE_DIR to a fresh tmp_path.

    Declared ``autouse`` so every test in this module gets the redirect
    without requesting it; ``monkeypatch`` restores the original value at
    teardown. Only the attribute is replaced here -- the ``doc_cache``
    directory itself is not created by this fixture.
    """
    monkeypatch.setattr(
        "gateway.platforms.base.DOCUMENT_CACHE_DIR", tmp_path / "doc_cache"
    )


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _backdate(path, hours):
    """Set *path*'s access and modification times to *hours* hours ago."""
    stamp = time.time() - hours * 3600
    os.utime(path, (stamp, stamp))


# ---------------------------------------------------------------------------
# TestGetDocumentCacheDir
# ---------------------------------------------------------------------------

class TestGetDocumentCacheDir:
    """Checks on the directory returned by get_document_cache_dir()."""

    def test_creates_directory(self):
        """The returned path exists on disk and is a directory."""
        cache_dir = get_document_cache_dir()
        assert cache_dir.exists()
        assert cache_dir.is_dir()

    def test_returns_existing_directory(self):
        """Two consecutive calls return the same, existing path."""
        first = get_document_cache_dir()
        second = get_document_cache_dir()
        assert first == second
        assert first.exists()


# ---------------------------------------------------------------------------
# TestCacheDocumentFromBytes
# ---------------------------------------------------------------------------

class TestCacheDocumentFromBytes:
    """Checks on the path returned by cache_document_from_bytes()."""

    def test_basic_caching(self):
        """The returned path exists and holds exactly the bytes passed in."""
        data = b"hello world"
        path = cache_document_from_bytes(data, "test.txt")
        assert os.path.exists(path)
        assert Path(path).read_bytes() == data

    def test_filename_preserved_in_path(self):
        """The supplied filename appears in the cached file's basename."""
        path = cache_document_from_bytes(b"data", "report.pdf")
        # Substring rather than equality: test_unique_filenames requires
        # same-named inputs to yield distinct paths, so the cached name may
        # carry extra characters.
        assert "report.pdf" in os.path.basename(path)

    def test_empty_filename_uses_fallback(self):
        """An empty filename yields a basename containing 'document'."""
        path = cache_document_from_bytes(b"data", "")
        assert "document" in os.path.basename(path)

    def test_unique_filenames(self):
        """Caching twice under the same filename returns two different paths."""
        p1 = cache_document_from_bytes(b"a", "same.txt")
        p2 = cache_document_from_bytes(b"b", "same.txt")
        assert p1 != p2

    def test_path_traversal_blocked(self):
        """Malicious directory components are stripped — only the leaf name survives."""
        path = cache_document_from_bytes(b"data", "../../etc/passwd")
        basename = os.path.basename(path)
        assert "passwd" in basename
        # No parent-directory component may survive in the leaf name
        assert ".." not in basename
        # File must reside inside the cache directory
        cache_dir = get_document_cache_dir()
        assert Path(path).resolve().is_relative_to(cache_dir.resolve())

    def test_null_bytes_stripped(self):
        """A NUL byte embedded in the filename does not reach the basename."""
        path = cache_document_from_bytes(b"data", "file\x00.pdf")
        basename = os.path.basename(path)
        assert "\x00" not in basename
        assert "file.pdf" in basename

    def test_dot_dot_filename_handled(self):
        """A filename that is literally '..' falls back to 'document'."""
        path = cache_document_from_bytes(b"data", "..")
        basename = os.path.basename(path)
        assert "document" in basename

    def test_none_filename_uses_fallback(self):
        """A None filename yields a basename containing 'document'."""
        path = cache_document_from_bytes(b"data", None)
        assert "document" in os.path.basename(path)


# ---------------------------------------------------------------------------
# TestCleanupDocumentCache
# ---------------------------------------------------------------------------

class TestCleanupDocumentCache:
    """Checks on cleanup_document_cache() and its returned removal count."""

    def test_removes_old_files(self):
        """A file with a 48-hour-old mtime is deleted under a 24-hour limit."""
        cache_dir = get_document_cache_dir()
        old_file = cache_dir / "old.txt"
        old_file.write_text("old")
        # Set modification time to 48 hours ago
        _backdate(old_file, hours=48)

        removed = cleanup_document_cache(max_age_hours=24)
        assert removed == 1
        assert not old_file.exists()

    def test_keeps_recent_files(self):
        """A freshly written file survives cleanup and is not counted."""
        cache_dir = get_document_cache_dir()
        recent = cache_dir / "recent.txt"
        recent.write_text("fresh")

        removed = cleanup_document_cache(max_age_hours=24)
        assert removed == 0
        assert recent.exists()

    def test_returns_removed_count(self):
        """The return value equals the number of stale files present."""
        cache_dir = get_document_cache_dir()
        for i in range(3):
            f = cache_dir / f"old_{i}.txt"
            f.write_text("x")
            _backdate(f, hours=48)

        assert cleanup_document_cache(max_age_hours=24) == 3

    def test_empty_cache_dir(self):
        """Cleanup with no files written by the test reports zero removals."""
        assert cleanup_document_cache(max_age_hours=24) == 0


# ---------------------------------------------------------------------------
# TestSupportedDocumentTypes
# ---------------------------------------------------------------------------

class TestSupportedDocumentTypes:
    """Shape checks on the SUPPORTED_DOCUMENT_TYPES extension -> MIME mapping."""

    def test_all_extensions_have_mime_types(self):
        """Every key starts with a dot and every value contains a slash."""
        for ext, mime in SUPPORTED_DOCUMENT_TYPES.items():
            assert ext.startswith("."), f"{ext} missing leading dot"
            assert "/" in mime, f"{mime} is not a valid MIME type"

    @pytest.mark.parametrize(
        "ext",
        [".pdf", ".md", ".txt", ".zip", ".docx", ".xlsx", ".pptx"],
    )
    def test_expected_extensions_present(self, ext):
        """Each extension in the parametrized list is a key of the mapping."""
        assert ext in SUPPORTED_DOCUMENT_TYPES