/ tests / test_grep_files.py
test_grep_files.py
  1  # System Packages
  2  import pytest
  3  import logging
  4  
  5  from khoj.database.adapters import FileObjectAdapters
  6  from khoj.database.models import KhojUser
  7  from khoj.routers.helpers import grep_files
  8  
  9  logger = logging.getLogger(__name__)
 10  
 11  
 12  @pytest.fixture
 13  @pytest.mark.django_db
 14  @pytest.mark.asyncio
 15  async def default_user():
 16      user, _ = await KhojUser.objects.aget_or_create(
 17          username="test_user",
 18          password="test_password",
 19          email="test@example.com",
 20      )
 21      return user
 22  
 23  
 24  @pytest.mark.django_db
 25  @pytest.mark.asyncio
 26  async def test_grep_files_simple_match(default_user: KhojUser):
 27      user = await default_user
 28      await FileObjectAdapters.adelete_all_file_objects(user=user)
 29      # Arrange
 30      await FileObjectAdapters.acreate_file_object(
 31          user=user,
 32          file_name="test.txt",
 33          raw_text="hello world\nthis is a test\nhello again",
 34      )
 35  
 36      # Act
 37      results = [
 38          result
 39          async for result in grep_files(
 40              regex_pattern="hello",
 41              user=user,
 42          )
 43      ]
 44  
 45      # Assert
 46      assert len(results) == 1
 47      result = results[0]
 48      assert "Found 2 matches for 'hello' in 1 documents" in result["query"]
 49      assert "test.txt:1: hello world" in result["compiled"]
 50      assert "test.txt:3: hello again" in result["compiled"]
 51  
 52  
 53  @pytest.mark.django_db
 54  @pytest.mark.asyncio
 55  async def test_grep_files_no_match(default_user: KhojUser):
 56      user = await default_user
 57      await FileObjectAdapters.adelete_all_file_objects(user=user)
 58      # Arrange
 59      await FileObjectAdapters.acreate_file_object(
 60          user=user,
 61          file_name="test.txt",
 62          raw_text="this is a test",
 63      )
 64  
 65      # Act
 66      results = [
 67          result
 68          async for result in grep_files(
 69              regex_pattern="nonexistent",
 70              user=user,
 71          )
 72      ]
 73  
 74      # Assert
 75      assert len(results) == 1
 76      result = results[0]
 77      assert "Found 0 matches for 'nonexistent' in 0 documents" in result["query"]
 78      assert "No matches found." in result["compiled"]
 79  
 80  
 81  @pytest.mark.django_db
 82  @pytest.mark.asyncio
 83  async def test_grep_files_with_path_prefix(default_user: KhojUser):
 84      user = await default_user
 85      await FileObjectAdapters.adelete_all_file_objects(user=user)
 86      # Arrange
 87      await FileObjectAdapters.acreate_file_object(
 88          user=user,
 89          file_name="dir1/test1.txt",
 90          raw_text="hello from dir1",
 91      )
 92      await FileObjectAdapters.acreate_file_object(
 93          user=user,
 94          file_name="dir2/test2.txt",
 95          raw_text="hello from dir2",
 96      )
 97  
 98      # Act
 99      results = [
100          result
101          async for result in grep_files(
102              regex_pattern="hello",
103              path_prefix="dir1/",
104              user=user,
105          )
106      ]
107  
108      # Assert
109      assert len(results) == 1
110      result = results[0]
111      assert "Found 1 matches for 'hello' in 1 documents" in result["query"]
112      assert "in dir1/" in result["query"]
113      assert "dir1/test1.txt:1: hello from dir1" in result["compiled"]
114      assert "dir2/test2.txt" not in result["compiled"]
115  
116  
@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_with_context(default_user: KhojUser):
    """``lines_before``/``lines_after`` add exactly the requested neighbouring lines around a match."""
    # The fixture value is awaited to obtain the actual user object.
    owner = await default_user
    # Drop any file objects this user already has before arranging the test data.
    await FileObjectAdapters.adelete_all_file_objects(user=owner)

    # Arrange: five lines with the only match in the middle.
    await FileObjectAdapters.acreate_file_object(
        user=owner,
        file_name="test.txt",
        raw_text="line 1\nline 2\nline 3 (match)\nline 4\nline 5",
    )

    # Act: request one line of context on each side of the match.
    responses = []
    async for response in grep_files(
        regex_pattern="match",
        lines_before=1,
        lines_after=1,
        user=owner,
    ):
        responses.append(response)

    # Assert: the match plus lines 2 and 4 are shown; lines 1 and 5 are not.
    assert len(responses) == 1
    outcome = responses[0]
    summary = outcome["query"]
    assert "Found 1 matches for 'match' in 1 documents" in summary
    assert "Showing 1 lines before and 1 lines after" in summary
    listing = outcome["compiled"]
    assert "test.txt-2-  line 2" in listing
    assert "test.txt:3: line 3 (match)" in listing
    assert "test.txt-4-  line 4" in listing
    assert "line 1" not in listing
    assert "line 5" not in listing
150  
151  
@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_invalid_regex(default_user: KhojUser):
    """A pattern that is not a valid regex is reported in the output as an invalid pattern."""
    # The fixture value is awaited to obtain the actual user object.
    owner = await default_user
    # Drop any file objects this user already has; no files are needed for this case.
    await FileObjectAdapters.adelete_all_file_objects(user=owner)

    # Act: "[" is an unterminated character class.
    responses = []
    async for response in grep_files(regex_pattern="[", user=owner):
        responses.append(response)

    # Assert: one response whose body carries the error text.
    assert len(responses) == 1
    outcome = responses[0]
    assert "Invalid regex pattern" in outcome["compiled"]
170  
171  
@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_multiple_files(default_user: KhojUser):
    """Matches spread over several files are counted together and listed per file."""
    # The fixture value is awaited to obtain the actual user object.
    owner = await default_user
    # Drop any file objects this user already has before arranging the test data.
    await FileObjectAdapters.adelete_all_file_objects(user=owner)

    # Arrange: two files, each with one "hello" line.
    await FileObjectAdapters.acreate_file_object(
        user=owner,
        file_name="file1.txt",
        raw_text="hello from file1",
    )
    await FileObjectAdapters.acreate_file_object(
        user=owner,
        file_name="file2.txt",
        raw_text="hello from file2",
    )

    # Act: drain the async generator into a list.
    responses = []
    async for response in grep_files(regex_pattern="hello", user=owner):
        responses.append(response)

    # Assert: a single response covering both documents.
    assert len(responses) == 1
    outcome = responses[0]
    summary = outcome["query"]
    assert "Found 2 matches for 'hello' in 2 documents" in summary
    listing = outcome["compiled"]
    assert "file1.txt:1: hello from file1" in listing
    assert "file2.txt:1: hello from file2" in listing
204  
205  
@pytest.mark.parametrize(
    "regex_pattern,expected_matches,test_description",
    [
        # Inline (?im) flags combined with a ^ anchor
        (r"(?im)^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "inline flags with anchor"),
        # Inline (?i) flag combined with a ^ anchor
        (r"(?i)^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "case insensitive with anchor"),
        # Inline (?i) flag, no anchor
        (
            r"(?i)\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)",
            1,
            "case insensitive without anchor",
        ),
        # Bare ^ anchor, no inline flags
        (r"^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "anchor only"),
        # Neither anchors nor flags, all-lowercase alternatives; the original note says
        # this is expected to work because grep_files applies re.IGNORECASE itself
        # (not visible from this file - confirm against grep_files).
        (r"\d{4}-\d{2}-\d{2}.*(sailing|sail|center for boats|captain sailor)", 1, "no flags or anchors"),
    ],
)
@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_financial_entries_regex_patterns(
    default_user: KhojUser, regex_pattern: str, expected_matches: int, test_description: str
):
    """Each date-anchored regex variant finds the single sailing entry in a ledger file.

    The ledger starts with a free-text line, so the dated entries do not begin
    at the top of the document; the sailing entry sits on line 8.
    """
    # The fixture value is awaited to obtain the actual user object.
    owner = await default_user
    # Drop any file objects this user already has before arranging the test data.
    await FileObjectAdapters.adelete_all_file_objects(user=owner)

    # Arrange: a ledger with a prose header followed by three dated entries.
    ledger_content = """This is a financial ledger file

1984-06-23 * "Al Zaheer, Mediteranean" "Chicken Gyro Plate, Falafel Sandwhich for Bob" #bob
  Expenses:Food:Dining                                             11.55 USD
  Liabilities:People:Bob                                          11.55 USD
  Liabilities:CreditCard:Chase                            -23.10 USD

1984-06-24 * "Center for Boats" "Sailing" #bob
  Expenses:Sports                                                     30 USD
  Liabilities:People:Bob                                           30.0 USD
  Liabilities:CreditCard:Chase                             -60 USD

1984-06-24 * "Safeway" "Groceries" #bob
  Expenses:Food:Groceries                                          11.20 USD
  Liabilities:People:Bob                                          11.20 USD
  Liabilities:CreditCard:Chase                          -22.40 USD"""

    await FileObjectAdapters.acreate_file_object(
        user=owner,
        file_name="ledger.txt",
        raw_text=ledger_content,
    )

    # Act: run the parametrized pattern and drain the async generator.
    responses = []
    async for response in grep_files(regex_pattern=regex_pattern, user=owner):
        responses.append(response)

    # Assert: exactly one response; log its content to help debug a failing variant.
    assert len(responses) == 1
    outcome = responses[0]
    logger.info(f"Testing {test_description}: {regex_pattern}")
    summary = outcome["query"]
    logger.info(f"Query: {summary}")
    listing = outcome["compiled"]
    logger.info(f"Compiled: {listing}")

    # Whatever the variant, the sailing entry on line 8 must be reported.
    assert f"Found {expected_matches} matches" in summary
    assert 'ledger.txt:8: 1984-06-24 * "Center for Boats" "Sailing" #bob' in listing