evaluate_test.go
1 package evaluate 2 3 import ( 4 "bytes" 5 "errors" 6 "os" 7 "path/filepath" 8 "sort" 9 "strings" 10 "testing" 11 "time" 12 13 "github.com/stretchr/testify/assert" 14 "github.com/stretchr/testify/mock" 15 "github.com/stretchr/testify/require" 16 "github.com/zimmski/osutil" 17 "github.com/zimmski/osutil/bytesutil" 18 19 "github.com/symflower/eval-dev-quality/evaluate/metrics" 20 metricstesting "github.com/symflower/eval-dev-quality/evaluate/metrics/testing" 21 evaluatetask "github.com/symflower/eval-dev-quality/evaluate/task" 22 "github.com/symflower/eval-dev-quality/language" 23 "github.com/symflower/eval-dev-quality/language/golang" 24 "github.com/symflower/eval-dev-quality/log" 25 "github.com/symflower/eval-dev-quality/model" 26 evalmodel "github.com/symflower/eval-dev-quality/model" 27 "github.com/symflower/eval-dev-quality/model/llm" 28 modeltesting "github.com/symflower/eval-dev-quality/model/testing" 29 "github.com/symflower/eval-dev-quality/provider" 30 providertesting "github.com/symflower/eval-dev-quality/provider/testing" 31 "github.com/symflower/eval-dev-quality/task" 32 ) 33 34 var ( 35 // ErrEmptyResponseFromModel indicates the model returned an empty response. 36 ErrEmptyResponseFromModel = errors.New("empty response from model") 37 ) 38 39 // file represents a file with path and content. 40 type file struct { 41 Path string 42 Content string 43 } 44 45 // testFiles holds common test files. 46 var testFiles = map[string]file{ 47 "plain": file{ 48 Path: "plain_test.go", 49 Content: bytesutil.StringTrimIndentations(` 50 package plain 51 52 import "testing" 53 54 func TestFunction(t *testing.T){} 55 `), 56 }, 57 "plain-with-assert": file{ 58 Path: "plain_test.go", 59 Content: bytesutil.StringTrimIndentations(` 60 package plain 61 62 import ( 63 "testing" 64 65 "github.com/stretchr/testify/assert" 66 ) 67 68 func TestFunction(t *testing.T){ 69 assert.True(t, true) 70 } 71 `), 72 }, 73 } 74 75 func TestEvaluate(t *testing.T) { 76 type testCase struct { 77 Name string 78 79 Before func(t *testing.T, logger *log.Logger, resultPath string) 80 After func(t *testing.T, logger *log.Logger, resultPath string) 81 82 Context *Context 83 84 ExpectedAssessments metricstesting.AssessmentTuples 85 ExpectedTotalScore uint64 86 ExpectedOutputValidate func(t *testing.T, output string, resultPath string) 87 ExpectedResultFiles map[string]func(t *testing.T, filePath string, data string) 88 } 89 90 validate := func(t *testing.T, tc *testCase) { 91 t.Run(tc.Name, func(t *testing.T) { 92 temporaryPath := t.TempDir() 93 94 logOutput, logger := log.Buffer() 95 defer func() { 96 log.CloseOpenLogFiles() 97 98 if t.Failed() { 99 t.Logf("Logging output: %s", logOutput.String()) 100 } 101 }() 102 103 resultPath := temporaryPath 104 logger = logger.With(log.AttributeKeyResultPath, resultPath) 105 106 tc.Context.Log = logger 107 if tc.Context.QueryAttempts == 0 { 108 tc.Context.QueryAttempts = 1 109 } 110 tc.Context.ResultPath = resultPath 111 if tc.Context.TestdataPath == "" { 112 tc.Context.TestdataPath = filepath.Join("..", "testdata") 113 } 114 if tc.Context.Runs == 0 { 115 tc.Context.Runs = 1 116 } 117 118 if tc.Before != nil { 119 tc.Before(t, logger, temporaryPath) 120 } 121 if tc.After != nil { 122 defer tc.After(t, logger, temporaryPath) 123 } 124 125 assessmentStore, actualTotalScore := Evaluate(tc.Context) 126 127 var actualAssessments metricstesting.AssessmentTuples 128 require.NoError(t, assessmentStore.Walk(func(m evalmodel.Model, l language.Language, r string, ti task.Identifier, a metrics.Assessments) error { 129 // Normalize assessments. 130 if v, ok := a[metrics.AssessmentKeyProcessingTime]; ok { 131 if assert.Greater(t, v, uint64(0)) { 132 delete(a, metrics.AssessmentKeyProcessingTime) 133 } 134 } 135 136 actualAssessments = append(actualAssessments, &metricstesting.AssessmentTuple{ 137 Model: m, 138 Language: l, 139 RepositoryPath: r, 140 Task: ti, 141 Assessment: a, 142 }) 143 144 return nil 145 })) 146 147 assert.ElementsMatch(t, tc.ExpectedAssessments, actualAssessments) 148 assert.Equal(t, tc.ExpectedTotalScore, actualTotalScore) 149 150 if tc.ExpectedOutputValidate != nil { 151 tc.ExpectedOutputValidate(t, logOutput.String(), temporaryPath) 152 } 153 154 actualResultFiles, err := osutil.FilesRecursive(temporaryPath) 155 require.NoError(t, err) 156 for i, p := range actualResultFiles { 157 actualResultFiles[i], err = filepath.Rel(temporaryPath, p) 158 require.NoError(t, err) 159 } 160 sort.Strings(actualResultFiles) 161 expectedResultFiles := make([]string, 0, len(tc.ExpectedResultFiles)) 162 for filePath, validate := range tc.ExpectedResultFiles { 163 expectedResultFiles = append(expectedResultFiles, filePath) 164 165 if validate != nil { 166 data, err := os.ReadFile(filepath.Join(temporaryPath, filePath)) 167 if assert.NoError(t, err) { 168 validate(t, filePath, string(data)) 169 } 170 } 171 } 172 sort.Strings(expectedResultFiles) 173 assert.Equal(t, expectedResultFiles, actualResultFiles) 174 }) 175 } 176 177 { 178 languageGolang := &golang.Language{} 179 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, "empty-response-model") 180 repositoryPath := filepath.Join("golang", "plain") 181 182 validate(t, &testCase{ 183 Name: "Empty model responses are errors", 184 185 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 186 // Set up mocks, when test is running. 187 mockedModel.MockCapabilityWriteTests.On("WriteTests", mock.Anything).Return(nil, ErrEmptyResponseFromModel) 188 }, 189 190 Context: &Context{ 191 Languages: []language.Language{ 192 &golang.Language{}, 193 }, 194 195 Models: []evalmodel.Model{ 196 mockedModel, 197 }, 198 }, 199 200 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 201 &metricstesting.AssessmentTuple{ 202 Model: mockedModel, 203 Language: languageGolang, 204 RepositoryPath: repositoryPath, 205 Task: evaluatetask.IdentifierWriteTests, 206 Assessment: metrics.Assessments{ 207 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 208 }, 209 }, 210 &metricstesting.AssessmentTuple{ 211 Model: mockedModel, 212 Language: languageGolang, 213 RepositoryPath: repositoryPath, 214 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 215 Assessment: metrics.Assessments{ 216 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 217 }, 218 }, 219 }, 220 ExpectedTotalScore: 2, 221 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 222 "evaluation.log": nil, 223 filepath.Join(string(evaluatetask.IdentifierWriteTests), mockedModel.ID(), "golang", "golang", "plain", "evaluation.log"): nil, 224 "evaluation.csv": nil, 225 }, 226 }) 227 } 228 229 t.Run("Failing model queries", func(t *testing.T) { 230 { 231 languageGolang := &golang.Language{} 232 mockedModelID := "testing-provider/empty-response-model" 233 mockedQuery := providertesting.NewMockQuery(t) 234 mockedModel := llm.NewModel(mockedQuery, mockedModelID) 235 repositoryPath := filepath.Join("golang", "plain") 236 237 validate(t, &testCase{ 238 Name: "Single try fails", 239 240 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 241 // Set up mocks, when test is running. 242 mockedQuery.On("Query", mock.Anything, mockedModelID, mock.Anything).Return("", ErrEmptyResponseFromModel) 243 }, 244 After: func(t *testing.T, logger *log.Logger, resultPath string) { 245 mockedQuery.AssertNumberOfCalls(t, "Query", 1) 246 }, 247 248 Context: &Context{ 249 Languages: []language.Language{ 250 languageGolang, 251 }, 252 253 Models: []evalmodel.Model{ 254 mockedModel, 255 }, 256 QueryAttempts: 1, 257 }, 258 259 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 260 &metricstesting.AssessmentTuple{ 261 Model: mockedModel, 262 Language: languageGolang, 263 RepositoryPath: repositoryPath, 264 Task: evaluatetask.IdentifierWriteTests, 265 Assessment: metrics.Assessments{ 266 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 267 }, 268 }, 269 &metricstesting.AssessmentTuple{ 270 Model: mockedModel, 271 Language: languageGolang, 272 RepositoryPath: repositoryPath, 273 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 274 Assessment: metrics.Assessments{ 275 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 276 }, 277 }, 278 }, 279 ExpectedTotalScore: 2, 280 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 281 "evaluation.log": nil, 282 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) { 283 assert.Contains(t, data, ErrEmptyResponseFromModel.Error()) 284 }, 285 "evaluation.csv": nil, 286 }, 287 }) 288 } 289 { 290 languageGolang := &golang.Language{} 291 mockedModelID := "testing-provider/empty-response-model" 292 mockedQuery := providertesting.NewMockQuery(t) 293 mockedModel := llm.NewModel(mockedQuery, mockedModelID) 294 repositoryPath := filepath.Join("golang", "plain") 295 296 validate(t, &testCase{ 297 Name: "Success after retry", 298 299 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 300 // Set up mocks, when test is running. 301 mockedQuery.On("Query", mock.Anything, mockedModelID, mock.Anything).Return("", ErrEmptyResponseFromModel).Once() 302 mockedQuery.On("Query", mock.Anything, mockedModelID, mock.Anything).Return("model-response", nil).Once().After(10 * time.Millisecond) // Simulate a model response delay because our internal safety measures trigger when a query is done in 0 milliseconds. 303 }, 304 After: func(t *testing.T, logger *log.Logger, resultPath string) { 305 mockedQuery.AssertNumberOfCalls(t, "Query", 2) 306 }, 307 308 Context: &Context{ 309 Languages: []language.Language{ 310 &golang.Language{}, 311 }, 312 313 Models: []evalmodel.Model{ 314 mockedModel, 315 }, 316 QueryAttempts: 3, 317 318 RepositoryPaths: []string{ 319 repositoryPath, 320 }, 321 }, 322 323 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 324 &metricstesting.AssessmentTuple{ 325 Model: mockedModel, 326 Language: languageGolang, 327 RepositoryPath: repositoryPath, 328 Task: evaluatetask.IdentifierWriteTests, 329 Assessment: map[metrics.AssessmentKey]uint64{ 330 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 331 metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, 332 metrics.AssessmentKeyResponseCharacterCount: 14, 333 metrics.AssessmentKeyResponseNoError: 1, 334 }, 335 }, 336 &metricstesting.AssessmentTuple{ 337 Model: mockedModel, 338 Language: languageGolang, 339 RepositoryPath: repositoryPath, 340 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 341 Assessment: map[metrics.AssessmentKey]uint64{ 342 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 343 metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, 344 metrics.AssessmentKeyResponseCharacterCount: 14, 345 metrics.AssessmentKeyResponseNoError: 1, 346 }, 347 }, 348 }, 349 ExpectedTotalScore: 2, 350 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 351 "evaluation.log": nil, 352 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) { 353 assert.Contains(t, data, "Attempt 1/3: "+ErrEmptyResponseFromModel.Error()) 354 }, 355 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "response-1.log"): nil, 356 "evaluation.csv": nil, 357 }, 358 }) 359 } 360 { 361 languageGolang := &golang.Language{} 362 mockedModelID := "testing-provider/empty-response-model" 363 mockedQuery := providertesting.NewMockQuery(t) 364 mockedModel := llm.NewModel(mockedQuery, mockedModelID) 365 repositoryPath := filepath.Join("golang", "plain") 366 367 validate(t, &testCase{ 368 Name: "Immediate success", 369 370 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 371 // Set up mocks, when test is running. 372 mockedQuery.On("Query", mock.Anything, mockedModelID, mock.Anything).Return("model-response", nil).After(10 * time.Millisecond) // Simulate a model response delay because our internal safety measures trigger when a query is done in 0 milliseconds. 373 }, 374 After: func(t *testing.T, logger *log.Logger, resultPath string) { 375 mockedQuery.AssertNumberOfCalls(t, "Query", 1) 376 }, 377 378 Context: &Context{ 379 Languages: []language.Language{ 380 &golang.Language{}, 381 }, 382 383 Models: []evalmodel.Model{ 384 mockedModel, 385 }, 386 QueryAttempts: 3, 387 388 RepositoryPaths: []string{ 389 repositoryPath, 390 }, 391 }, 392 393 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 394 &metricstesting.AssessmentTuple{ 395 Model: mockedModel, 396 Language: languageGolang, 397 RepositoryPath: repositoryPath, 398 Task: evaluatetask.IdentifierWriteTests, 399 Assessment: map[metrics.AssessmentKey]uint64{ 400 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 401 metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, 402 metrics.AssessmentKeyResponseCharacterCount: 14, 403 metrics.AssessmentKeyResponseNoError: 1, 404 }, 405 }, 406 &metricstesting.AssessmentTuple{ 407 Model: mockedModel, 408 Language: languageGolang, 409 RepositoryPath: repositoryPath, 410 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 411 Assessment: map[metrics.AssessmentKey]uint64{ 412 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 413 metrics.AssessmentKeyGenerateTestsForFileCharacterCount: 14, 414 metrics.AssessmentKeyResponseCharacterCount: 14, 415 metrics.AssessmentKeyResponseNoError: 1, 416 }, 417 }, 418 }, 419 ExpectedTotalScore: 2, 420 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 421 "evaluation.log": nil, 422 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): func(t *testing.T, filePath, data string) { 423 assert.Contains(t, data, "DONE 0 tests, 1 error") 424 }, 425 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "response-1.log"): nil, 426 "evaluation.csv": nil, 427 }, 428 }) 429 } 430 }) 431 432 t.Run("Failing basic language checks should exclude model", func(t *testing.T) { 433 repositoryPlainPath := filepath.Join("golang", "plain") 434 repositoryNextPath := filepath.Join("golang", "next") 435 436 temporaryTestdataPath := t.TempDir() 437 assert.NoError(t, osutil.CopyTree(filepath.Join("..", "testdata", repositoryPlainPath), filepath.Join(temporaryTestdataPath, repositoryPlainPath))) 438 assert.NoError(t, osutil.CopyTree(filepath.Join("..", "testdata", repositoryPlainPath), filepath.Join(temporaryTestdataPath, repositoryNextPath))) 439 repositoryNextConfigPath := filepath.Join(temporaryTestdataPath, repositoryNextPath, "go.mod") 440 d, err := os.ReadFile(repositoryNextConfigPath) 441 require.NoError(t, err) 442 d = bytes.ReplaceAll(d, []byte("plain"), []byte("next")) 443 require.NoError(t, os.WriteFile(repositoryNextConfigPath, d, 0)) 444 445 generateTestsForFilePlainError := errors.New("generateTestsForFile error") 446 447 generateSuccess := func(mockedModel *modeltesting.MockModelCapabilityWriteTests) { 448 mockedModel.RegisterGenerateSuccess(t, testFiles["plain"].Path, testFiles["plain"].Content, metricstesting.AssessmentsWithProcessingTime).Once() 449 } 450 generateError := func(mockedModel *modeltesting.MockModelCapabilityWriteTests) { 451 mockedModel.RegisterGenerateError(generateTestsForFilePlainError).Once() 452 } 453 454 { 455 languageGolang := &golang.Language{} 456 mockedModelID := "mocked-generation-model" 457 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 458 459 validate(t, &testCase{ 460 Name: "Problems of previous runs shouldn't cancel successive runs", 461 462 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 463 // Set up mocks, when test is running. 464 { 465 // Succeed on both "plain" runs. 466 generateSuccess(mockedModel) 467 generateSuccess(mockedModel) 468 469 // Error on the first run for the "next" repository. 470 generateError(mockedModel) 471 // Succeed on the second run for the "next" repository. 472 generateSuccess(mockedModel) 473 } 474 }, 475 After: func(t *testing.T, logger *log.Logger, resultPath string) { 476 mockedModel.MockCapabilityWriteTests.AssertNumberOfCalls(t, "WriteTests", 4) 477 }, 478 479 Context: &Context{ 480 Languages: []language.Language{ 481 &golang.Language{}, 482 }, 483 484 Models: []evalmodel.Model{ 485 mockedModel, 486 }, 487 488 RepositoryPaths: []string{ 489 repositoryPlainPath, 490 repositoryNextPath, 491 }, 492 TestdataPath: temporaryTestdataPath, 493 494 Runs: 2, 495 }, 496 497 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 498 &metricstesting.AssessmentTuple{ 499 Model: mockedModel, 500 Language: languageGolang, 501 RepositoryPath: repositoryNextPath, 502 Task: evaluatetask.IdentifierWriteTests, 503 Assessment: map[metrics.AssessmentKey]uint64{ 504 metrics.AssessmentKeyCoverage: 0, 505 metrics.AssessmentKeyFilesExecuted: 1, 506 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 507 metrics.AssessmentKeyResponseNoError: 1, 508 }, 509 }, 510 &metricstesting.AssessmentTuple{ 511 Model: mockedModel, 512 Language: languageGolang, 513 RepositoryPath: repositoryNextPath, 514 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 515 Assessment: map[metrics.AssessmentKey]uint64{ 516 metrics.AssessmentKeyCoverage: 0, 517 metrics.AssessmentKeyFilesExecuted: 1, 518 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 519 metrics.AssessmentKeyResponseNoError: 1, 520 }, 521 }, 522 &metricstesting.AssessmentTuple{ 523 Model: mockedModel, 524 Language: languageGolang, 525 RepositoryPath: repositoryPlainPath, 526 Task: evaluatetask.IdentifierWriteTests, 527 Assessment: map[metrics.AssessmentKey]uint64{ 528 metrics.AssessmentKeyCoverage: 0, 529 metrics.AssessmentKeyFilesExecuted: 2, 530 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 531 metrics.AssessmentKeyResponseNoError: 2, 532 }, 533 }, 534 &metricstesting.AssessmentTuple{ 535 Model: mockedModel, 536 Language: languageGolang, 537 RepositoryPath: repositoryPlainPath, 538 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 539 Assessment: map[metrics.AssessmentKey]uint64{ 540 metrics.AssessmentKeyCoverage: 0, 541 metrics.AssessmentKeyFilesExecuted: 2, 542 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 543 metrics.AssessmentKeyResponseNoError: 2, 544 }, 545 }, 546 }, 547 ExpectedTotalScore: 0, 548 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 549 "evaluation.log": nil, 550 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 551 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next", "evaluation.log"): nil, 552 "evaluation.csv": nil, 553 }, 554 }) 555 } 556 { 557 languageGolang := &golang.Language{} 558 mockedModelID := "mocked-generation-model" 559 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 560 561 validate(t, &testCase{ 562 Name: "Solving basic checks once is enough", 563 564 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 565 // Set up mocks, when test is running. 566 { 567 // Succeed on only one "plain" run. 568 generateError(mockedModel) 569 generateSuccess(mockedModel) 570 571 // Succeed on both "next" runs. 572 generateSuccess(mockedModel) 573 generateSuccess(mockedModel) 574 } 575 }, 576 After: func(t *testing.T, logger *log.Logger, resultPath string) { 577 mockedModel.MockCapabilityWriteTests.AssertNumberOfCalls(t, "WriteTests", 4) 578 }, 579 580 Context: &Context{ 581 Languages: []language.Language{ 582 &golang.Language{}, 583 }, 584 585 Models: []evalmodel.Model{ 586 mockedModel, 587 }, 588 589 RepositoryPaths: []string{ 590 repositoryPlainPath, 591 repositoryNextPath, 592 }, 593 TestdataPath: temporaryTestdataPath, 594 595 Runs: 2, 596 }, 597 598 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 599 &metricstesting.AssessmentTuple{ 600 Model: mockedModel, 601 Language: languageGolang, 602 RepositoryPath: repositoryNextPath, 603 Task: evaluatetask.IdentifierWriteTests, 604 Assessment: map[metrics.AssessmentKey]uint64{ 605 metrics.AssessmentKeyCoverage: 0, 606 metrics.AssessmentKeyFilesExecuted: 2, 607 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 608 metrics.AssessmentKeyResponseNoError: 2, 609 }, 610 }, 611 &metricstesting.AssessmentTuple{ 612 Model: mockedModel, 613 Language: languageGolang, 614 RepositoryPath: repositoryNextPath, 615 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 616 Assessment: map[metrics.AssessmentKey]uint64{ 617 metrics.AssessmentKeyCoverage: 0, 618 metrics.AssessmentKeyFilesExecuted: 2, 619 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 620 metrics.AssessmentKeyResponseNoError: 2, 621 }, 622 }, 623 &metricstesting.AssessmentTuple{ 624 Model: mockedModel, 625 Language: languageGolang, 626 RepositoryPath: repositoryPlainPath, 627 Task: evaluatetask.IdentifierWriteTests, 628 Assessment: map[metrics.AssessmentKey]uint64{ 629 metrics.AssessmentKeyCoverage: 0, 630 metrics.AssessmentKeyFilesExecuted: 1, 631 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 632 metrics.AssessmentKeyResponseNoError: 1, 633 }, 634 }, 635 &metricstesting.AssessmentTuple{ 636 Model: mockedModel, 637 Language: languageGolang, 638 RepositoryPath: repositoryPlainPath, 639 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 640 Assessment: map[metrics.AssessmentKey]uint64{ 641 metrics.AssessmentKeyCoverage: 0, 642 metrics.AssessmentKeyFilesExecuted: 1, 643 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 644 metrics.AssessmentKeyResponseNoError: 1, 645 }, 646 }, 647 }, 648 ExpectedTotalScore: 0, 649 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 650 "evaluation.log": nil, 651 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 652 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "next", "evaluation.log"): nil, 653 "evaluation.csv": nil, 654 }, 655 }) 656 } 657 { 658 languageGolang := &golang.Language{} 659 mockedModelID := "mocked-generation-model" 660 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 661 662 validate(t, &testCase{ 663 Name: "Never solving basic checks leads to exclusion", 664 665 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 666 // Set up mocks, when test is running. 667 { 668 // Error on every "plain" run. 669 generateError(mockedModel) 670 generateError(mockedModel) 671 } 672 }, 673 After: func(t *testing.T, logger *log.Logger, resultPath string) { 674 mockedModel.MockCapabilityWriteTests.AssertNumberOfCalls(t, "WriteTests", 2) 675 }, 676 677 Context: &Context{ 678 Languages: []language.Language{ 679 &golang.Language{}, 680 }, 681 682 Models: []evalmodel.Model{ 683 mockedModel, 684 }, 685 686 RepositoryPaths: []string{ 687 repositoryPlainPath, 688 repositoryNextPath, 689 }, 690 TestdataPath: temporaryTestdataPath, 691 692 Runs: 2, 693 }, 694 695 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 696 &metricstesting.AssessmentTuple{ 697 Model: mockedModel, 698 Language: languageGolang, 699 RepositoryPath: repositoryPlainPath, 700 Task: evaluatetask.IdentifierWriteTests, 701 Assessment: map[metrics.AssessmentKey]uint64{ 702 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 703 }, 704 }, 705 &metricstesting.AssessmentTuple{ 706 Model: mockedModel, 707 Language: languageGolang, 708 RepositoryPath: repositoryPlainPath, 709 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 710 Assessment: map[metrics.AssessmentKey]uint64{ 711 metrics.AssessmentKeyFilesExecutedMaximumReachable: 2, 712 }, 713 }, 714 }, 715 ExpectedTotalScore: 0, 716 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 717 "evaluation.log": nil, 718 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 719 "evaluation.csv": nil, 720 }, 721 }) 722 } 723 }) 724 t.Run("Runs", func(t *testing.T) { 725 generateSuccess := func(mockedModel *modeltesting.MockModelCapabilityWriteTests) { 726 mockedModel.RegisterGenerateSuccess(t, testFiles["plain"].Path, testFiles["plain"].Content, metricstesting.AssessmentsWithProcessingTime) 727 } 728 { 729 languageGolang := &golang.Language{} 730 mockedModelID := "mocked-generation-model" 731 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 732 733 repositoryPath := filepath.Join("golang", "plain") 734 validate(t, &testCase{ 735 Name: "Interleaved", 736 737 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 738 generateSuccess(mockedModel) 739 }, 740 741 Context: &Context{ 742 Languages: []language.Language{ 743 &golang.Language{}, 744 }, 745 746 Models: []evalmodel.Model{ 747 mockedModel, 748 }, 749 750 RepositoryPaths: []string{ 751 repositoryPath, 752 }, 753 754 Runs: 3, 755 RunsSequential: false, 756 }, 757 758 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 759 &metricstesting.AssessmentTuple{ 760 Model: mockedModel, 761 Language: languageGolang, 762 RepositoryPath: repositoryPath, 763 Task: evaluatetask.IdentifierWriteTests, 764 Assessment: map[metrics.AssessmentKey]uint64{ 765 metrics.AssessmentKeyCoverage: 0, 766 metrics.AssessmentKeyFilesExecuted: 3, 767 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 768 metrics.AssessmentKeyResponseNoError: 3, 769 }, 770 }, 771 &metricstesting.AssessmentTuple{ 772 Model: mockedModel, 773 Language: languageGolang, 774 RepositoryPath: repositoryPath, 775 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 776 Assessment: map[metrics.AssessmentKey]uint64{ 777 metrics.AssessmentKeyCoverage: 0, 778 metrics.AssessmentKeyFilesExecuted: 3, 779 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 780 metrics.AssessmentKeyResponseNoError: 3, 781 }, 782 }, 783 }, 784 ExpectedTotalScore: 6, 785 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 786 "evaluation.log": nil, 787 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 788 "evaluation.csv": nil, 789 }, 790 ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { 791 assert.Contains(t, output, "Run 1/3") 792 assert.Contains(t, output, "Run 2/3") 793 assert.Contains(t, output, "Run 3/3") 794 assert.NotRegexp(t, `Run \d+/\d+ for model`, output) 795 796 assert.Equal(t, 1, strings.Count(output, "Creating temporary repository"), "create only one temporary repository") 797 }, 798 }) 799 } 800 { 801 languageGolang := &golang.Language{} 802 mockedModelID := "mocked-generation-model" 803 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 804 805 repositoryPath := filepath.Join("golang", "plain") 806 validate(t, &testCase{ 807 Name: "Sequential", 808 809 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 810 generateSuccess(mockedModel) 811 }, 812 813 Context: &Context{ 814 Languages: []language.Language{ 815 &golang.Language{}, 816 }, 817 818 Models: []evalmodel.Model{ 819 mockedModel, 820 }, 821 822 RepositoryPaths: []string{ 823 repositoryPath, 824 }, 825 826 Runs: 3, 827 RunsSequential: true, 828 }, 829 830 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 831 &metricstesting.AssessmentTuple{ 832 Model: mockedModel, 833 Language: languageGolang, 834 RepositoryPath: repositoryPath, 835 Task: evaluatetask.IdentifierWriteTests, 836 Assessment: map[metrics.AssessmentKey]uint64{ 837 metrics.AssessmentKeyCoverage: 0, 838 metrics.AssessmentKeyFilesExecuted: 3, 839 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 840 metrics.AssessmentKeyResponseNoError: 3, 841 }, 842 }, 843 &metricstesting.AssessmentTuple{ 844 Model: mockedModel, 845 Language: languageGolang, 846 RepositoryPath: repositoryPath, 847 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 848 Assessment: map[metrics.AssessmentKey]uint64{ 849 metrics.AssessmentKeyCoverage: 0, 850 metrics.AssessmentKeyFilesExecuted: 3, 851 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 852 metrics.AssessmentKeyResponseNoError: 3, 853 }, 854 }, 855 }, 856 ExpectedTotalScore: 6, 857 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 858 "evaluation.log": nil, 859 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 860 "evaluation.csv": nil, 861 }, 862 ExpectedOutputValidate: func(t *testing.T, output string, resultPath string) { 863 assert.Contains(t, output, "Run 1/3 for model") 864 assert.Contains(t, output, "Run 2/3 for model") 865 assert.Contains(t, output, "Run 3/3 for model") 866 assert.NotRegexp(t, `Run \d+/\d+$`, output) 867 868 assert.Equal(t, 1, strings.Count(output, "Creating temporary repository"), "create only one temporary repository") 869 }, 870 }) 871 } 872 }) 873 874 t.Run("Preloading", func(t *testing.T) { 875 generateSuccess := func(mockedModel *modeltesting.MockModelCapabilityWriteTests) { 876 mockedModel.RegisterGenerateSuccess(t, testFiles["plain"].Path, testFiles["plain"].Content, metricstesting.AssessmentsWithProcessingTime) 877 } 878 879 { 880 // Setup provider and model mocking. 881 languageGolang := &golang.Language{} 882 mockedModelID := "testing-provider/testing-model" 883 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 884 mockedProviderID := "testing-provider" 885 mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) 886 mockedLoader := providertesting.NewMockLoader(t) 887 embeddedProvider := &struct { 888 provider.Provider 889 provider.Loader 890 }{ 891 Provider: mockedProvider, 892 Loader: mockedLoader, 893 } 894 repositoryPath := filepath.Join("golang", "plain") 895 896 validate(t, &testCase{ 897 Name: "Once for combined runs", 898 899 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 900 generateSuccess(mockedModel) 901 mockedLoader.On("Load", mockedModelID).Return(nil) 902 mockedLoader.On("Unload", mockedModelID).Return(nil) 903 }, 904 After: func(t *testing.T, logger *log.Logger, resultPath string) { 905 delete(provider.Providers, mockedProviderID) 906 907 mockedLoader.AssertNumberOfCalls(t, "Load", 1) 908 mockedLoader.AssertNumberOfCalls(t, "Unload", 1) 909 }, 910 911 Context: &Context{ 912 Languages: []language.Language{ 913 languageGolang, 914 }, 915 916 Models: []evalmodel.Model{ 917 mockedModel, 918 }, 919 ProviderForModel: map[evalmodel.Model]provider.Provider{ 920 mockedModel: embeddedProvider, 921 }, 922 923 RepositoryPaths: []string{ 924 repositoryPath, 925 }, 926 927 Runs: 3, 928 RunsSequential: true, 929 }, 930 931 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 932 &metricstesting.AssessmentTuple{ 933 Model: mockedModel, 934 Language: languageGolang, 935 RepositoryPath: repositoryPath, 936 Task: evaluatetask.IdentifierWriteTests, 937 Assessment: map[metrics.AssessmentKey]uint64{ 938 metrics.AssessmentKeyCoverage: 0, 939 metrics.AssessmentKeyFilesExecuted: 3, 940 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 941 metrics.AssessmentKeyResponseNoError: 3, 942 }, 943 }, 944 &metricstesting.AssessmentTuple{ 945 Model: mockedModel, 946 Language: languageGolang, 947 RepositoryPath: repositoryPath, 948 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 949 Assessment: map[metrics.AssessmentKey]uint64{ 950 metrics.AssessmentKeyCoverage: 0, 951 metrics.AssessmentKeyFilesExecuted: 3, 952 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 953 metrics.AssessmentKeyResponseNoError: 3, 954 }, 955 }, 956 }, 957 ExpectedTotalScore: 6, 958 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 959 "evaluation.log": nil, 960 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 961 "evaluation.csv": nil, 962 }, 963 }) 964 } 965 { 966 // Setup provider and model mocking. 967 languageGolang := &golang.Language{} 968 mockedModelID := "testing-provider/testing-model" 969 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 970 mockedProviderID := "testing-provider" 971 mockedProvider := providertesting.NewMockProviderNamedWithModels(t, mockedProviderID, []model.Model{mockedModel}) 972 mockedLoader := providertesting.NewMockLoader(t) 973 embeddedProvider := &struct { 974 provider.Provider 975 provider.Loader 976 }{ 977 Provider: mockedProvider, 978 Loader: mockedLoader, 979 } 980 repositoryPath := filepath.Join("golang", "plain") 981 validate(t, &testCase{ 982 Name: "Multiple times for interleaved runs", 983 984 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 985 generateSuccess(mockedModel) 986 mockedLoader.On("Load", mockedModelID).Return(nil) 987 mockedLoader.On("Unload", mockedModelID).Return(nil) 988 }, 989 After: func(t *testing.T, logger *log.Logger, resultPath string) { 990 delete(provider.Providers, "testing-provider") 991 992 mockedLoader.AssertNumberOfCalls(t, "Load", 3) 993 mockedLoader.AssertNumberOfCalls(t, "Unload", 3) 994 }, 995 996 Context: &Context{ 997 Languages: []language.Language{ 998 languageGolang, 999 }, 1000 1001 Models: []evalmodel.Model{ 1002 mockedModel, 1003 }, 1004 ProviderForModel: map[evalmodel.Model]provider.Provider{ 1005 mockedModel: embeddedProvider, 1006 }, 1007 1008 RepositoryPaths: []string{ 1009 repositoryPath, 1010 }, 1011 1012 Runs: 3, 1013 }, 1014 1015 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 1016 &metricstesting.AssessmentTuple{ 1017 Model: mockedModel, 1018 Language: languageGolang, 1019 RepositoryPath: repositoryPath, 1020 Task: evaluatetask.IdentifierWriteTests, 1021 Assessment: map[metrics.AssessmentKey]uint64{ 1022 metrics.AssessmentKeyCoverage: 0, 1023 metrics.AssessmentKeyFilesExecuted: 3, 1024 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 1025 metrics.AssessmentKeyResponseNoError: 3, 1026 }, 1027 }, 1028 &metricstesting.AssessmentTuple{ 1029 Model: mockedModel, 1030 Language: languageGolang, 1031 RepositoryPath: repositoryPath, 1032 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 1033 Assessment: map[metrics.AssessmentKey]uint64{ 1034 metrics.AssessmentKeyCoverage: 0, 1035 metrics.AssessmentKeyFilesExecuted: 3, 1036 metrics.AssessmentKeyFilesExecutedMaximumReachable: 3, 1037 metrics.AssessmentKeyResponseNoError: 3, 1038 }, 1039 }, 1040 }, 1041 ExpectedTotalScore: 6, 1042 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 1043 "evaluation.log": nil, 1044 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 1045 "evaluation.csv": nil, 1046 }, 1047 }) 1048 } 1049 }) 1050 { 1051 // Setup provider and model mocking. 1052 languageGolang := &golang.Language{} 1053 mockedModelID := "testing-provider/testing-model" 1054 mockedModel := modeltesting.NewMockCapabilityWriteTestsNamed(t, mockedModelID) 1055 1056 repositoryPath := filepath.Join("golang", "plain") 1057 1058 validate(t, &testCase{ 1059 Name: "Download Go dependencies", 1060 1061 Before: func(t *testing.T, logger *log.Logger, resultPath string) { 1062 mockedModel.RegisterGenerateSuccess(t, testFiles["plain-with-assert"].Path, testFiles["plain-with-assert"].Content, metricstesting.AssessmentsWithProcessingTime) 1063 }, 1064 1065 Context: &Context{ 1066 Languages: []language.Language{ 1067 languageGolang, 1068 }, 1069 1070 Models: []evalmodel.Model{ 1071 mockedModel, 1072 }, 1073 1074 RepositoryPaths: []string{ 1075 repositoryPath, 1076 }, 1077 1078 Runs: 1, 1079 }, 1080 1081 ExpectedAssessments: []*metricstesting.AssessmentTuple{ 1082 &metricstesting.AssessmentTuple{ 1083 Model: mockedModel, 1084 Language: languageGolang, 1085 RepositoryPath: repositoryPath, 1086 Task: evaluatetask.IdentifierWriteTests, 1087 Assessment: map[metrics.AssessmentKey]uint64{ 1088 metrics.AssessmentKeyCoverage: 0, 1089 metrics.AssessmentKeyFilesExecuted: 1, 1090 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 1091 metrics.AssessmentKeyResponseNoError: 1, 1092 }, 1093 }, 1094 &metricstesting.AssessmentTuple{ 1095 Model: mockedModel, 1096 Language: languageGolang, 1097 RepositoryPath: repositoryPath, 1098 Task: evaluatetask.IdentifierWriteTestsSymflowerFix, 1099 Assessment: map[metrics.AssessmentKey]uint64{ 1100 metrics.AssessmentKeyCoverage: 0, 1101 metrics.AssessmentKeyFilesExecuted: 1, 1102 metrics.AssessmentKeyFilesExecutedMaximumReachable: 1, 1103 metrics.AssessmentKeyResponseNoError: 1, 1104 }, 1105 }, 1106 }, 1107 ExpectedTotalScore: 2, 1108 ExpectedResultFiles: map[string]func(t *testing.T, filePath string, data string){ 1109 "evaluation.log": nil, 1110 filepath.Join(string(evaluatetask.IdentifierWriteTests), log.CleanModelNameForFileSystem(mockedModelID), "golang", "golang", "plain", "evaluation.log"): nil, 1111 "evaluation.csv": nil, 1112 }, 1113 }) 1114 } 1115 }