/ tests / gateway / test_document_cache.py
test_document_cache.py
  1  """
  2  Tests for document cache utilities in gateway/platforms/base.py.
  3  
  4  Covers: get_document_cache_dir, cache_document_from_bytes,
  5          cleanup_document_cache, SUPPORTED_DOCUMENT_TYPES.
  6  """
  7  
  8  import os
  9  import time
 10  from pathlib import Path
 11  
 12  import pytest
 13  
 14  from gateway.platforms.base import (
 15      SUPPORTED_DOCUMENT_TYPES,
 16      cache_document_from_bytes,
 17      cleanup_document_cache,
 18      get_document_cache_dir,
 19  )
 20  
 21  # ---------------------------------------------------------------------------
 22  # Fixture: redirect DOCUMENT_CACHE_DIR to a temp directory for every test
 23  # ---------------------------------------------------------------------------
 24  
 25  @pytest.fixture(autouse=True)
 26  def _redirect_cache(tmp_path, monkeypatch):
 27      """Point the module-level DOCUMENT_CACHE_DIR to a fresh tmp_path."""
 28      monkeypatch.setattr(
 29          "gateway.platforms.base.DOCUMENT_CACHE_DIR", tmp_path / "doc_cache"
 30      )
 31  
 32  
 33  # ---------------------------------------------------------------------------
 34  # TestGetDocumentCacheDir
 35  # ---------------------------------------------------------------------------
 36  
 37  class TestGetDocumentCacheDir:
 38      def test_creates_directory(self, tmp_path):
 39          cache_dir = get_document_cache_dir()
 40          assert cache_dir.exists()
 41          assert cache_dir.is_dir()
 42  
 43      def test_returns_existing_directory(self):
 44          first = get_document_cache_dir()
 45          second = get_document_cache_dir()
 46          assert first == second
 47          assert first.exists()
 48  
 49  
 50  # ---------------------------------------------------------------------------
 51  # TestCacheDocumentFromBytes
 52  # ---------------------------------------------------------------------------
 53  
 54  class TestCacheDocumentFromBytes:
 55      def test_basic_caching(self):
 56          data = b"hello world"
 57          path = cache_document_from_bytes(data, "test.txt")
 58          assert os.path.exists(path)
 59          assert Path(path).read_bytes() == data
 60  
 61      def test_filename_preserved_in_path(self):
 62          path = cache_document_from_bytes(b"data", "report.pdf")
 63          assert "report.pdf" in os.path.basename(path)
 64  
 65      def test_empty_filename_uses_fallback(self):
 66          path = cache_document_from_bytes(b"data", "")
 67          assert "document" in os.path.basename(path)
 68  
 69      def test_unique_filenames(self):
 70          p1 = cache_document_from_bytes(b"a", "same.txt")
 71          p2 = cache_document_from_bytes(b"b", "same.txt")
 72          assert p1 != p2
 73  
 74      def test_path_traversal_blocked(self):
 75          """Malicious directory components are stripped — only the leaf name survives."""
 76          path = cache_document_from_bytes(b"data", "../../etc/passwd")
 77          basename = os.path.basename(path)
 78          assert "passwd" in basename
 79          # Must NOT contain directory separators
 80          assert ".." not in basename
 81          # File must reside inside the cache directory
 82          cache_dir = get_document_cache_dir()
 83          assert Path(path).resolve().is_relative_to(cache_dir.resolve())
 84  
 85      def test_null_bytes_stripped(self):
 86          path = cache_document_from_bytes(b"data", "file\x00.pdf")
 87          basename = os.path.basename(path)
 88          assert "\x00" not in basename
 89          assert "file.pdf" in basename
 90  
 91      def test_dot_dot_filename_handled(self):
 92          """A filename that is literally '..' falls back to 'document'."""
 93          path = cache_document_from_bytes(b"data", "..")
 94          basename = os.path.basename(path)
 95          assert "document" in basename
 96  
 97      def test_none_filename_uses_fallback(self):
 98          path = cache_document_from_bytes(b"data", None)
 99          assert "document" in os.path.basename(path)
100  
101  
102  # ---------------------------------------------------------------------------
103  # TestCleanupDocumentCache
104  # ---------------------------------------------------------------------------
105  
class TestCleanupDocumentCache:
    """Tests for cleanup_document_cache(max_age_hours=...)."""

    def test_removes_old_files(self, tmp_path):
        """A file last modified 48 hours ago is removed by a 24-hour cleanup."""
        stale = get_document_cache_dir() / "old.txt"
        stale.write_text("old")
        # Backdate both access and modification time to 48 hours ago.
        two_days_ago = time.time() - 48 * 3600
        os.utime(stale, (two_days_ago, two_days_ago))

        removed_count = cleanup_document_cache(max_age_hours=24)

        assert removed_count == 1
        assert not stale.exists()

    def test_keeps_recent_files(self):
        """A freshly written file survives a 24-hour cleanup."""
        fresh = get_document_cache_dir() / "recent.txt"
        fresh.write_text("fresh")

        removed_count = cleanup_document_cache(max_age_hours=24)

        assert removed_count == 0
        assert fresh.exists()

    def test_returns_removed_count(self):
        """The return value equals the number of backdated files created."""
        target_dir = get_document_cache_dir()
        backdated = time.time() - 48 * 3600
        stale_files = [target_dir / f"old_{index}.txt" for index in range(3)]
        for stale in stale_files:
            stale.write_text("x")
            os.utime(stale, (backdated, backdated))

        removed_count = cleanup_document_cache(max_age_hours=24)
        assert removed_count == 3

    def test_empty_cache_dir(self):
        """With nothing cached, cleanup reports zero removals."""
        removed_count = cleanup_document_cache(max_age_hours=24)
        assert removed_count == 0
140  
141  
142  # ---------------------------------------------------------------------------
143  # TestSupportedDocumentTypes
144  # ---------------------------------------------------------------------------
145  
class TestSupportedDocumentTypes:
    """Sanity checks on the SUPPORTED_DOCUMENT_TYPES extension -> MIME mapping."""

    def test_all_extensions_have_mime_types(self):
        """Every key starts with a dot and every value contains a slash."""
        for extension, mime_type in SUPPORTED_DOCUMENT_TYPES.items():
            has_leading_dot = extension.startswith(".")
            assert has_leading_dot, f"{extension} missing leading dot"
            has_slash = "/" in mime_type
            assert has_slash, f"{mime_type} is not a valid MIME type"

    @pytest.mark.parametrize(
        "ext",
        [
            ".pdf",
            ".md",
            ".txt",
            ".zip",
            ".docx",
            ".xlsx",
            ".pptx",
        ],
    )
    def test_expected_extensions_present(self, ext):
        """Each extension in the parametrized list is a key of the mapping."""
        assert ext in SUPPORTED_DOCUMENT_TYPES