test_grep_files.py
# System Packages
import logging

import pytest

from khoj.database.adapters import FileObjectAdapters
from khoj.database.models import KhojUser
from khoj.routers.helpers import grep_files

logger = logging.getLogger(__name__)


# NOTE: no pytest marks on the fixture itself. Marks applied to fixture
# functions have no effect and are deprecated by pytest; database and asyncio
# handling come from the marks on each test that awaits this fixture.
@pytest.fixture
async def default_user():
    """Get or create the shared `test_user` account used by every test here.

    NOTE(review): the tests `await` this fixture's value, which presumes the
    fixture is handed over as an un-awaited coroutine (pytest-asyncio strict
    mode) - confirm against the project's pytest configuration.
    """
    user, _ = await KhojUser.objects.aget_or_create(
        username="test_user",
        password="test_password",
        email="test@example.com",
    )
    return user


async def _collect_grep_results(**grep_kwargs):
    """Drain the `grep_files` async generator into a list of its results."""
    return [result async for result in grep_files(**grep_kwargs)]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_simple_match(default_user: KhojUser):
    """A plain pattern reports every matching line with file name and line number."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)
    # Arrange
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="test.txt",
        raw_text="hello world\nthis is a test\nhello again",
    )

    # Act
    results = await _collect_grep_results(regex_pattern="hello", user=user)

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Found 2 matches for 'hello' in 1 documents" in result["query"]
    assert "test.txt:1: hello world" in result["compiled"]
    assert "test.txt:3: hello again" in result["compiled"]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_no_match(default_user: KhojUser):
    """A pattern that matches nothing still yields one result saying so."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)
    # Arrange
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="test.txt",
        raw_text="this is a test",
    )

    # Act
    results = await _collect_grep_results(regex_pattern="nonexistent", user=user)

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Found 0 matches for 'nonexistent' in 0 documents" in result["query"]
    assert "No matches found." in result["compiled"]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_with_path_prefix(default_user: KhojUser):
    """`path_prefix` restricts the search to files whose name starts with it."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)
    # Arrange
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="dir1/test1.txt",
        raw_text="hello from dir1",
    )
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="dir2/test2.txt",
        raw_text="hello from dir2",
    )

    # Act
    results = await _collect_grep_results(
        regex_pattern="hello",
        path_prefix="dir1/",
        user=user,
    )

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Found 1 matches for 'hello' in 1 documents" in result["query"]
    assert "in dir1/" in result["query"]
    assert "dir1/test1.txt:1: hello from dir1" in result["compiled"]
    assert "dir2/test2.txt" not in result["compiled"]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_with_context(default_user: KhojUser):
    """`lines_before` / `lines_after` add only the requested context lines."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)
    # Arrange
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="test.txt",
        raw_text="line 1\nline 2\nline 3 (match)\nline 4\nline 5",
    )

    # Act
    results = await _collect_grep_results(
        regex_pattern="match",
        lines_before=1,
        lines_after=1,
        user=user,
    )

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Found 1 matches for 'match' in 1 documents" in result["query"]
    assert "Showing 1 lines before and 1 lines after" in result["query"]
    # Context lines use `-` separators, the matching line uses `:`.
    assert "test.txt-2- line 2" in result["compiled"]
    assert "test.txt:3: line 3 (match)" in result["compiled"]
    assert "test.txt-4- line 4" in result["compiled"]
    # Lines outside the one-line context window must not appear.
    assert "line 1" not in result["compiled"]
    assert "line 5" not in result["compiled"]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_invalid_regex(default_user: KhojUser):
    """An uncompilable pattern is reported in the result instead of raising."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)

    # Act
    results = await _collect_grep_results(regex_pattern="[", user=user)

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Invalid regex pattern" in result["compiled"]


@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_multiple_files(default_user: KhojUser):
    """Matches from several files are combined into a single result."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)
    # Arrange
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="file1.txt",
        raw_text="hello from file1",
    )
    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="file2.txt",
        raw_text="hello from file2",
    )

    # Act
    results = await _collect_grep_results(regex_pattern="hello", user=user)

    # Assert
    assert len(results) == 1
    result = results[0]
    assert "Found 2 matches for 'hello' in 2 documents" in result["query"]
    assert "file1.txt:1: hello from file1" in result["compiled"]
    assert "file2.txt:1: hello from file2" in result["compiled"]


@pytest.mark.parametrize(
    "regex_pattern,expected_matches,test_description",
    [
        # Test with (?im) inline flags and ^ anchor
        (r"(?im)^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "inline flags with anchor"),
        # Test with (?i) flag and ^ anchor
        (r"(?i)^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "case insensitive with anchor"),
        # Test without any anchors
        (
            r"(?i)\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)",
            1,
            "case insensitive without anchor",
        ),
        # Test with just the ^ anchor (no inline flags)
        (r"^\d{4}-\d{2}-\d{2}.*(sailing|sail|Center for Boats|Captain Sailor)", 1, "anchor only"),
        # Test without anchors or flags (should still work due to re.IGNORECASE in function)
        (r"\d{4}-\d{2}-\d{2}.*(sailing|sail|center for boats|captain sailor)", 1, "no flags or anchors"),
    ],
)
@pytest.mark.django_db
@pytest.mark.asyncio
async def test_grep_files_financial_entries_regex_patterns(
    default_user: KhojUser, regex_pattern: str, expected_matches: int, test_description: str
):
    """Each pattern variant finds the single sailing entry on line 8 of the ledger."""
    user = await default_user
    await FileObjectAdapters.adelete_all_file_objects(user=user)

    # Arrange - Create file with financial ledger content that has prefix text
    ledger_content = """This is a financial ledger file

1984-06-23 * "Al Zaheer, Mediteranean" "Chicken Gyro Plate, Falafel Sandwhich for Bob" #bob
Expenses:Food:Dining 11.55 USD
Liabilities:People:Bob 11.55 USD
Liabilities:CreditCard:Chase -23.10 USD

1984-06-24 * "Center for Boats" "Sailing" #bob
Expenses:Sports 30 USD
Liabilities:People:Bob 30.0 USD
Liabilities:CreditCard:Chase -60 USD

1984-06-24 * "Safeway" "Groceries" #bob
Expenses:Food:Groceries 11.20 USD
Liabilities:People:Bob 11.20 USD
Liabilities:CreditCard:Chase -22.40 USD"""

    await FileObjectAdapters.acreate_file_object(
        user=user,
        file_name="ledger.txt",
        raw_text=ledger_content,
    )

    # Act - Test the regex pattern
    results = await _collect_grep_results(regex_pattern=regex_pattern, user=user)

    # Assert
    assert len(results) == 1
    result = results[0]
    # Lazy %-style arguments: the message is only formatted if the record is emitted.
    logger.info("Testing %s: %s", test_description, regex_pattern)
    logger.info("Query: %s", result["query"])
    logger.info("Compiled: %s", result["compiled"])

    # All patterns should find the sailing entry
    assert f"Found {expected_matches} matches" in result["query"]
    assert 'ledger.txt:8: 1984-06-24 * "Center for Boats" "Sailing" #bob' in result["compiled"]