version_set.cc
1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 5 #include "db/version_set.h" 6 7 #include <stdio.h> 8 9 #include <algorithm> 10 11 #include "db/filename.h" 12 #include "db/log_reader.h" 13 #include "db/log_writer.h" 14 #include "db/memtable.h" 15 #include "db/table_cache.h" 16 #include "leveldb/env.h" 17 #include "leveldb/table_builder.h" 18 #include "table/merger.h" 19 #include "table/two_level_iterator.h" 20 #include "util/coding.h" 21 #include "util/logging.h" 22 23 namespace leveldb { 24 25 static size_t TargetFileSize(const Options* options) { 26 return options->max_file_size; 27 } 28 29 // Maximum bytes of overlaps in grandparent (i.e., level+2) before we 30 // stop building a single file in a level->level+1 compaction. 31 static int64_t MaxGrandParentOverlapBytes(const Options* options) { 32 return 10 * TargetFileSize(options); 33 } 34 35 // Maximum number of bytes in all compacted files. We avoid expanding 36 // the lower level file set of a compaction if it would make the 37 // total compaction cover more than this many bytes. 38 static int64_t ExpandedCompactionByteSizeLimit(const Options* options) { 39 return 25 * TargetFileSize(options); 40 } 41 42 static double MaxBytesForLevel(const Options* options, int level) { 43 // Note: the result for level zero is not really used since we set 44 // the level-0 compaction threshold based on number of files. 45 46 // Result for both level-0 and level-1 47 double result = 10. * 1048576.0; 48 while (level > 1) { 49 result *= 10; 50 level--; 51 } 52 return result; 53 } 54 55 static uint64_t MaxFileSizeForLevel(const Options* options, int level) { 56 // We could vary per level to reduce number of files? 
57 return TargetFileSize(options); 58 } 59 60 static int64_t TotalFileSize(const std::vector<FileMetaData*>& files) { 61 int64_t sum = 0; 62 for (size_t i = 0; i < files.size(); i++) { 63 sum += files[i]->file_size; 64 } 65 return sum; 66 } 67 68 Version::~Version() { 69 assert(refs_ == 0); 70 71 // Remove from linked list 72 prev_->next_ = next_; 73 next_->prev_ = prev_; 74 75 // Drop references to files 76 for (int level = 0; level < config::kNumLevels; level++) { 77 for (size_t i = 0; i < files_[level].size(); i++) { 78 FileMetaData* f = files_[level][i]; 79 assert(f->refs > 0); 80 f->refs--; 81 if (f->refs <= 0) { 82 delete f; 83 } 84 } 85 } 86 } 87 88 int FindFile(const InternalKeyComparator& icmp, 89 const std::vector<FileMetaData*>& files, const Slice& key) { 90 uint32_t left = 0; 91 uint32_t right = files.size(); 92 while (left < right) { 93 uint32_t mid = (left + right) / 2; 94 const FileMetaData* f = files[mid]; 95 if (icmp.InternalKeyComparator::Compare(f->largest.Encode(), key) < 0) { 96 // Key at "mid.largest" is < "target". Therefore all 97 // files at or before "mid" are uninteresting. 98 left = mid + 1; 99 } else { 100 // Key at "mid.largest" is >= "target". Therefore all files 101 // after "mid" are uninteresting. 
102 right = mid; 103 } 104 } 105 return right; 106 } 107 108 static bool AfterFile(const Comparator* ucmp, const Slice* user_key, 109 const FileMetaData* f) { 110 // null user_key occurs before all keys and is therefore never after *f 111 return (user_key != nullptr && 112 ucmp->Compare(*user_key, f->largest.user_key()) > 0); 113 } 114 115 static bool BeforeFile(const Comparator* ucmp, const Slice* user_key, 116 const FileMetaData* f) { 117 // null user_key occurs after all keys and is therefore never before *f 118 return (user_key != nullptr && 119 ucmp->Compare(*user_key, f->smallest.user_key()) < 0); 120 } 121 122 bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, 123 bool disjoint_sorted_files, 124 const std::vector<FileMetaData*>& files, 125 const Slice* smallest_user_key, 126 const Slice* largest_user_key) { 127 const Comparator* ucmp = icmp.user_comparator(); 128 if (!disjoint_sorted_files) { 129 // Need to check against all files 130 for (size_t i = 0; i < files.size(); i++) { 131 const FileMetaData* f = files[i]; 132 if (AfterFile(ucmp, smallest_user_key, f) || 133 BeforeFile(ucmp, largest_user_key, f)) { 134 // No overlap 135 } else { 136 return true; // Overlap 137 } 138 } 139 return false; 140 } 141 142 // Binary search over file list 143 uint32_t index = 0; 144 if (smallest_user_key != nullptr) { 145 // Find the earliest possible internal key for smallest_user_key 146 InternalKey small_key(*smallest_user_key, kMaxSequenceNumber, 147 kValueTypeForSeek); 148 index = FindFile(icmp, files, small_key.Encode()); 149 } 150 151 if (index >= files.size()) { 152 // beginning of range is after all files, so no overlap. 153 return false; 154 } 155 156 return !BeforeFile(ucmp, largest_user_key, files[index]); 157 } 158 159 // An internal iterator. For a given version/level pair, yields 160 // information about the files in the level. 
For a given entry, key() 161 // is the largest key that occurs in the file, and value() is an 162 // 16-byte value containing the file number and file size, both 163 // encoded using EncodeFixed64. 164 class Version::LevelFileNumIterator : public Iterator { 165 public: 166 LevelFileNumIterator(const InternalKeyComparator& icmp, 167 const std::vector<FileMetaData*>* flist) 168 : icmp_(icmp), flist_(flist), index_(flist->size()) { // Marks as invalid 169 } 170 bool Valid() const override { return index_ < flist_->size(); } 171 void Seek(const Slice& target) override { 172 index_ = FindFile(icmp_, *flist_, target); 173 } 174 void SeekToFirst() override { index_ = 0; } 175 void SeekToLast() override { 176 index_ = flist_->empty() ? 0 : flist_->size() - 1; 177 } 178 void Next() override { 179 assert(Valid()); 180 index_++; 181 } 182 void Prev() override { 183 assert(Valid()); 184 if (index_ == 0) { 185 index_ = flist_->size(); // Marks as invalid 186 } else { 187 index_--; 188 } 189 } 190 Slice key() const override { 191 assert(Valid()); 192 return (*flist_)[index_]->largest.Encode(); 193 } 194 Slice value() const override { 195 assert(Valid()); 196 EncodeFixed64(value_buf_, (*flist_)[index_]->number); 197 EncodeFixed64(value_buf_ + 8, (*flist_)[index_]->file_size); 198 return Slice(value_buf_, sizeof(value_buf_)); 199 } 200 Status status() const override { return Status::OK(); } 201 202 private: 203 const InternalKeyComparator icmp_; 204 const std::vector<FileMetaData*>* const flist_; 205 uint32_t index_; 206 207 // Backing store for value(). Holds the file number and size. 
208 mutable char value_buf_[16]; 209 }; 210 211 static Iterator* GetFileIterator(void* arg, const ReadOptions& options, 212 const Slice& file_value) { 213 TableCache* cache = reinterpret_cast<TableCache*>(arg); 214 if (file_value.size() != 16) { 215 return NewErrorIterator( 216 Status::Corruption("FileReader invoked with unexpected value")); 217 } else { 218 return cache->NewIterator(options, DecodeFixed64(file_value.data()), 219 DecodeFixed64(file_value.data() + 8)); 220 } 221 } 222 223 Iterator* Version::NewConcatenatingIterator(const ReadOptions& options, 224 int level) const { 225 return NewTwoLevelIterator( 226 new LevelFileNumIterator(vset_->icmp_, &files_[level]), &GetFileIterator, 227 vset_->table_cache_, options); 228 } 229 230 void Version::AddIterators(const ReadOptions& options, 231 std::vector<Iterator*>* iters) { 232 // Merge all level zero files together since they may overlap 233 for (size_t i = 0; i < files_[0].size(); i++) { 234 iters->push_back(vset_->table_cache_->NewIterator( 235 options, files_[0][i]->number, files_[0][i]->file_size)); 236 } 237 238 // For levels > 0, we can use a concatenating iterator that sequentially 239 // walks through the non-overlapping files in the level, opening them 240 // lazily. 
241 for (int level = 1; level < config::kNumLevels; level++) { 242 if (!files_[level].empty()) { 243 iters->push_back(NewConcatenatingIterator(options, level)); 244 } 245 } 246 } 247 248 // Callback from TableCache::Get() 249 namespace { 250 enum SaverState { 251 kNotFound, 252 kFound, 253 kDeleted, 254 kCorrupt, 255 }; 256 struct Saver { 257 SaverState state; 258 const Comparator* ucmp; 259 Slice user_key; 260 std::string* value; 261 }; 262 } // namespace 263 static void SaveValue(void* arg, const Slice& ikey, const Slice& v) { 264 Saver* s = reinterpret_cast<Saver*>(arg); 265 ParsedInternalKey parsed_key; 266 if (!ParseInternalKey(ikey, &parsed_key)) { 267 s->state = kCorrupt; 268 } else { 269 if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { 270 s->state = (parsed_key.type == kTypeValue) ? kFound : kDeleted; 271 if (s->state == kFound) { 272 s->value->assign(v.data(), v.size()); 273 } 274 } 275 } 276 } 277 278 static bool NewestFirst(FileMetaData* a, FileMetaData* b) { 279 return a->number > b->number; 280 } 281 282 void Version::ForEachOverlapping(Slice user_key, Slice internal_key, void* arg, 283 bool (*func)(void*, int, FileMetaData*)) { 284 const Comparator* ucmp = vset_->icmp_.user_comparator(); 285 286 // Search level-0 in order from newest to oldest. 287 std::vector<FileMetaData*> tmp; 288 tmp.reserve(files_[0].size()); 289 for (uint32_t i = 0; i < files_[0].size(); i++) { 290 FileMetaData* f = files_[0][i]; 291 if (ucmp->Compare(user_key, f->smallest.user_key()) >= 0 && 292 ucmp->Compare(user_key, f->largest.user_key()) <= 0) { 293 tmp.push_back(f); 294 } 295 } 296 if (!tmp.empty()) { 297 std::sort(tmp.begin(), tmp.end(), NewestFirst); 298 for (uint32_t i = 0; i < tmp.size(); i++) { 299 if (!(*func)(arg, 0, tmp[i])) { 300 return; 301 } 302 } 303 } 304 305 // Search other levels. 
306 for (int level = 1; level < config::kNumLevels; level++) { 307 size_t num_files = files_[level].size(); 308 if (num_files == 0) continue; 309 310 // Binary search to find earliest index whose largest key >= internal_key. 311 uint32_t index = FindFile(vset_->icmp_, files_[level], internal_key); 312 if (index < num_files) { 313 FileMetaData* f = files_[level][index]; 314 if (ucmp->Compare(user_key, f->smallest.user_key()) < 0) { 315 // All of "f" is past any data for user_key 316 } else { 317 if (!(*func)(arg, level, f)) { 318 return; 319 } 320 } 321 } 322 } 323 } 324 325 Status Version::Get(const ReadOptions& options, const LookupKey& k, 326 std::string* value, GetStats* stats) { 327 stats->seek_file = nullptr; 328 stats->seek_file_level = -1; 329 330 struct State { 331 Saver saver; 332 GetStats* stats; 333 const ReadOptions* options; 334 Slice ikey; 335 FileMetaData* last_file_read; 336 int last_file_read_level; 337 338 VersionSet* vset; 339 Status s; 340 bool found; 341 342 static bool Match(void* arg, int level, FileMetaData* f) { 343 State* state = reinterpret_cast<State*>(arg); 344 345 if (state->stats->seek_file == nullptr && 346 state->last_file_read != nullptr) { 347 // We have had more than one seek for this read. Charge the 1st file. 
348 state->stats->seek_file = state->last_file_read; 349 state->stats->seek_file_level = state->last_file_read_level; 350 } 351 352 state->last_file_read = f; 353 state->last_file_read_level = level; 354 355 state->s = state->vset->table_cache_->Get(*state->options, f->number, 356 f->file_size, state->ikey, 357 &state->saver, SaveValue); 358 if (!state->s.ok()) { 359 state->found = true; 360 return false; 361 } 362 switch (state->saver.state) { 363 case kNotFound: 364 return true; // Keep searching in other files 365 case kFound: 366 state->found = true; 367 return false; 368 case kDeleted: 369 return false; 370 case kCorrupt: 371 state->s = 372 Status::Corruption("corrupted key for ", state->saver.user_key); 373 state->found = true; 374 return false; 375 } 376 377 // Not reached. Added to avoid false compilation warnings of 378 // "control reaches end of non-void function". 379 return false; 380 } 381 }; 382 383 State state; 384 state.found = false; 385 state.stats = stats; 386 state.last_file_read = nullptr; 387 state.last_file_read_level = -1; 388 389 state.options = &options; 390 state.ikey = k.internal_key(); 391 state.vset = vset_; 392 393 state.saver.state = kNotFound; 394 state.saver.ucmp = vset_->icmp_.user_comparator(); 395 state.saver.user_key = k.user_key(); 396 state.saver.value = value; 397 398 ForEachOverlapping(state.saver.user_key, state.ikey, &state, &State::Match); 399 400 return state.found ? 
state.s : Status::NotFound(Slice()); 401 } 402 403 bool Version::UpdateStats(const GetStats& stats) { 404 FileMetaData* f = stats.seek_file; 405 if (f != nullptr) { 406 f->allowed_seeks--; 407 if (f->allowed_seeks <= 0 && file_to_compact_ == nullptr) { 408 file_to_compact_ = f; 409 file_to_compact_level_ = stats.seek_file_level; 410 return true; 411 } 412 } 413 return false; 414 } 415 416 bool Version::RecordReadSample(Slice internal_key) { 417 ParsedInternalKey ikey; 418 if (!ParseInternalKey(internal_key, &ikey)) { 419 return false; 420 } 421 422 struct State { 423 GetStats stats; // Holds first matching file 424 int matches; 425 426 static bool Match(void* arg, int level, FileMetaData* f) { 427 State* state = reinterpret_cast<State*>(arg); 428 state->matches++; 429 if (state->matches == 1) { 430 // Remember first match. 431 state->stats.seek_file = f; 432 state->stats.seek_file_level = level; 433 } 434 // We can stop iterating once we have a second match. 435 return state->matches < 2; 436 } 437 }; 438 439 State state; 440 state.matches = 0; 441 ForEachOverlapping(ikey.user_key, internal_key, &state, &State::Match); 442 443 // Must have at least two matches since we want to merge across 444 // files. But what if we have a single file that contains many 445 // overwrites and deletions? Should we have another mechanism for 446 // finding such files? 447 if (state.matches >= 2) { 448 // 1MB cost is about 1 seek (see comment in Builder::Apply). 
449 return UpdateStats(state.stats); 450 } 451 return false; 452 } 453 454 void Version::Ref() { ++refs_; } 455 456 void Version::Unref() { 457 assert(this != &vset_->dummy_versions_); 458 assert(refs_ >= 1); 459 --refs_; 460 if (refs_ == 0) { 461 delete this; 462 } 463 } 464 465 bool Version::OverlapInLevel(int level, const Slice* smallest_user_key, 466 const Slice* largest_user_key) { 467 return SomeFileOverlapsRange(vset_->icmp_, (level > 0), files_[level], 468 smallest_user_key, largest_user_key); 469 } 470 471 int Version::PickLevelForMemTableOutput(const Slice& smallest_user_key, 472 const Slice& largest_user_key) { 473 int level = 0; 474 if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { 475 // Push to next level if there is no overlap in next level, 476 // and the #bytes overlapping in the level after that are limited. 477 InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); 478 InternalKey limit(largest_user_key, 0, static_cast<ValueType>(0)); 479 std::vector<FileMetaData*> overlaps; 480 while (level < config::kMaxMemCompactLevel) { 481 if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { 482 break; 483 } 484 if (level + 2 < config::kNumLevels) { 485 // Check that file does not overlap too many grandparent bytes. 
486 GetOverlappingInputs(level + 2, &start, &limit, &overlaps); 487 const int64_t sum = TotalFileSize(overlaps); 488 if (sum > MaxGrandParentOverlapBytes(vset_->options_)) { 489 break; 490 } 491 } 492 level++; 493 } 494 } 495 return level; 496 } 497 498 // Store in "*inputs" all files in "level" that overlap [begin,end] 499 void Version::GetOverlappingInputs(int level, const InternalKey* begin, 500 const InternalKey* end, 501 std::vector<FileMetaData*>* inputs) { 502 assert(level >= 0); 503 assert(level < config::kNumLevels); 504 inputs->clear(); 505 Slice user_begin, user_end; 506 if (begin != nullptr) { 507 user_begin = begin->user_key(); 508 } 509 if (end != nullptr) { 510 user_end = end->user_key(); 511 } 512 const Comparator* user_cmp = vset_->icmp_.user_comparator(); 513 for (size_t i = 0; i < files_[level].size();) { 514 FileMetaData* f = files_[level][i++]; 515 const Slice file_start = f->smallest.user_key(); 516 const Slice file_limit = f->largest.user_key(); 517 if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { 518 // "f" is completely before specified range; skip it 519 } else if (end != nullptr && user_cmp->Compare(file_start, user_end) > 0) { 520 // "f" is completely after specified range; skip it 521 } else { 522 inputs->push_back(f); 523 if (level == 0) { 524 // Level-0 files may overlap each other. So check if the newly 525 // added file has expanded the range. If so, restart search. 526 if (begin != nullptr && user_cmp->Compare(file_start, user_begin) < 0) { 527 user_begin = file_start; 528 inputs->clear(); 529 i = 0; 530 } else if (end != nullptr && 531 user_cmp->Compare(file_limit, user_end) > 0) { 532 user_end = file_limit; 533 inputs->clear(); 534 i = 0; 535 } 536 } 537 } 538 } 539 } 540 541 std::string Version::DebugString() const { 542 std::string r; 543 for (int level = 0; level < config::kNumLevels; level++) { 544 // E.g., 545 // --- level 1 --- 546 // 17:123['a' .. 'd'] 547 // 20:43['e' .. 
'g'] 548 r.append("--- level "); 549 AppendNumberTo(&r, level); 550 r.append(" ---\n"); 551 const std::vector<FileMetaData*>& files = files_[level]; 552 for (size_t i = 0; i < files.size(); i++) { 553 r.push_back(' '); 554 AppendNumberTo(&r, files[i]->number); 555 r.push_back(':'); 556 AppendNumberTo(&r, files[i]->file_size); 557 r.append("["); 558 r.append(files[i]->smallest.DebugString()); 559 r.append(" .. "); 560 r.append(files[i]->largest.DebugString()); 561 r.append("]\n"); 562 } 563 } 564 return r; 565 } 566 567 // A helper class so we can efficiently apply a whole sequence 568 // of edits to a particular state without creating intermediate 569 // Versions that contain full copies of the intermediate state. 570 class VersionSet::Builder { 571 private: 572 // Helper to sort by v->files_[file_number].smallest 573 struct BySmallestKey { 574 const InternalKeyComparator* internal_comparator; 575 576 bool operator()(FileMetaData* f1, FileMetaData* f2) const { 577 int r = internal_comparator->Compare(f1->smallest, f2->smallest); 578 if (r != 0) { 579 return (r < 0); 580 } else { 581 // Break ties by file number 582 return (f1->number < f2->number); 583 } 584 } 585 }; 586 587 typedef std::set<FileMetaData*, BySmallestKey> FileSet; 588 struct LevelState { 589 std::set<uint64_t> deleted_files; 590 FileSet* added_files; 591 }; 592 593 VersionSet* vset_; 594 Version* base_; 595 LevelState levels_[config::kNumLevels]; 596 597 public: 598 // Initialize a builder with the files from *base and other info from *vset 599 Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { 600 base_->Ref(); 601 BySmallestKey cmp; 602 cmp.internal_comparator = &vset_->icmp_; 603 for (int level = 0; level < config::kNumLevels; level++) { 604 levels_[level].added_files = new FileSet(cmp); 605 } 606 } 607 608 ~Builder() { 609 for (int level = 0; level < config::kNumLevels; level++) { 610 const FileSet* added = levels_[level].added_files; 611 std::vector<FileMetaData*> to_unref; 
612 to_unref.reserve(added->size()); 613 for (FileSet::const_iterator it = added->begin(); it != added->end(); 614 ++it) { 615 to_unref.push_back(*it); 616 } 617 delete added; 618 for (uint32_t i = 0; i < to_unref.size(); i++) { 619 FileMetaData* f = to_unref[i]; 620 f->refs--; 621 if (f->refs <= 0) { 622 delete f; 623 } 624 } 625 } 626 base_->Unref(); 627 } 628 629 // Apply all of the edits in *edit to the current state. 630 void Apply(VersionEdit* edit) { 631 // Update compaction pointers 632 for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { 633 const int level = edit->compact_pointers_[i].first; 634 vset_->compact_pointer_[level] = 635 edit->compact_pointers_[i].second.Encode().ToString(); 636 } 637 638 // Delete files 639 for (const auto& deleted_file_set_kvp : edit->deleted_files_) { 640 const int level = deleted_file_set_kvp.first; 641 const uint64_t number = deleted_file_set_kvp.second; 642 levels_[level].deleted_files.insert(number); 643 } 644 645 // Add new files 646 for (size_t i = 0; i < edit->new_files_.size(); i++) { 647 const int level = edit->new_files_[i].first; 648 FileMetaData* f = new FileMetaData(edit->new_files_[i].second); 649 f->refs = 1; 650 651 // We arrange to automatically compact this file after 652 // a certain number of seeks. Let's assume: 653 // (1) One seek costs 10ms 654 // (2) Writing or reading 1MB costs 10ms (100MB/s) 655 // (3) A compaction of 1MB does 25MB of IO: 656 // 1MB read from this level 657 // 10-12MB read from next level (boundaries may be misaligned) 658 // 10-12MB written to next level 659 // This implies that 25 seeks cost the same as the compaction 660 // of 1MB of data. I.e., one seek costs approximately the 661 // same as the compaction of 40KB of data. We are a little 662 // conservative and allow approximately one seek for every 16KB 663 // of data before triggering a compaction. 
664 f->allowed_seeks = static_cast<int>((f->file_size / 16384U)); 665 if (f->allowed_seeks < 100) f->allowed_seeks = 100; 666 667 levels_[level].deleted_files.erase(f->number); 668 levels_[level].added_files->insert(f); 669 } 670 } 671 672 // Save the current state in *v. 673 void SaveTo(Version* v) { 674 BySmallestKey cmp; 675 cmp.internal_comparator = &vset_->icmp_; 676 for (int level = 0; level < config::kNumLevels; level++) { 677 // Merge the set of added files with the set of pre-existing files. 678 // Drop any deleted files. Store the result in *v. 679 const std::vector<FileMetaData*>& base_files = base_->files_[level]; 680 std::vector<FileMetaData*>::const_iterator base_iter = base_files.begin(); 681 std::vector<FileMetaData*>::const_iterator base_end = base_files.end(); 682 const FileSet* added_files = levels_[level].added_files; 683 v->files_[level].reserve(base_files.size() + added_files->size()); 684 for (const auto& added_file : *added_files) { 685 // Add all smaller files listed in base_ 686 for (std::vector<FileMetaData*>::const_iterator bpos = 687 std::upper_bound(base_iter, base_end, added_file, cmp); 688 base_iter != bpos; ++base_iter) { 689 MaybeAddFile(v, level, *base_iter); 690 } 691 692 MaybeAddFile(v, level, added_file); 693 } 694 695 // Add remaining base files 696 for (; base_iter != base_end; ++base_iter) { 697 MaybeAddFile(v, level, *base_iter); 698 } 699 700 #ifndef NDEBUG 701 // Make sure there is no overlap in levels > 0 702 if (level > 0) { 703 for (uint32_t i = 1; i < v->files_[level].size(); i++) { 704 const InternalKey& prev_end = v->files_[level][i - 1]->largest; 705 const InternalKey& this_begin = v->files_[level][i]->smallest; 706 if (vset_->icmp_.Compare(prev_end, this_begin) >= 0) { 707 fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", 708 prev_end.DebugString().c_str(), 709 this_begin.DebugString().c_str()); 710 abort(); 711 } 712 } 713 } 714 #endif 715 } 716 } 717 718 void MaybeAddFile(Version* v, int level, FileMetaData* f) { 719 if (levels_[level].deleted_files.count(f->number) > 0) { 720 // File is deleted: do nothing 721 } else { 722 std::vector<FileMetaData*>* files = &v->files_[level]; 723 if (level > 0 && !files->empty()) { 724 // Must not overlap 725 assert(vset_->icmp_.Compare((*files)[files->size() - 1]->largest, 726 f->smallest) < 0); 727 } 728 f->refs++; 729 files->push_back(f); 730 } 731 } 732 }; 733 734 VersionSet::VersionSet(const std::string& dbname, const Options* options, 735 TableCache* table_cache, 736 const InternalKeyComparator* cmp) 737 : env_(options->env), 738 dbname_(dbname), 739 options_(options), 740 table_cache_(table_cache), 741 icmp_(*cmp), 742 next_file_number_(2), 743 manifest_file_number_(0), // Filled by Recover() 744 last_sequence_(0), 745 log_number_(0), 746 prev_log_number_(0), 747 descriptor_file_(nullptr), 748 descriptor_log_(nullptr), 749 dummy_versions_(this), 750 current_(nullptr) { 751 AppendVersion(new Version(this)); 752 } 753 754 VersionSet::~VersionSet() { 755 current_->Unref(); 756 assert(dummy_versions_.next_ == &dummy_versions_); // List must be empty 757 delete descriptor_log_; 758 delete descriptor_file_; 759 } 760 761 void VersionSet::AppendVersion(Version* v) { 762 // Make "v" current 763 assert(v->refs_ == 0); 764 assert(v != current_); 765 if (current_ != nullptr) { 766 current_->Unref(); 767 } 768 current_ = v; 769 v->Ref(); 770 771 // Append to linked list 772 v->prev_ = dummy_versions_.prev_; 773 v->next_ = &dummy_versions_; 774 v->prev_->next_ = v; 775 v->next_->prev_ = v; 776 } 777 778 Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu) { 779 if (edit->has_log_number_) { 780 assert(edit->log_number_ >= log_number_); 781 assert(edit->log_number_ < next_file_number_); 782 } else { 783 
edit->SetLogNumber(log_number_); 784 } 785 786 if (!edit->has_prev_log_number_) { 787 edit->SetPrevLogNumber(prev_log_number_); 788 } 789 790 edit->SetNextFile(next_file_number_); 791 edit->SetLastSequence(last_sequence_); 792 793 Version* v = new Version(this); 794 { 795 Builder builder(this, current_); 796 builder.Apply(edit); 797 builder.SaveTo(v); 798 } 799 Finalize(v); 800 801 // Initialize new descriptor log file if necessary by creating 802 // a temporary file that contains a snapshot of the current version. 803 std::string new_manifest_file; 804 Status s; 805 if (descriptor_log_ == nullptr) { 806 // No reason to unlock *mu here since we only hit this path in the 807 // first call to LogAndApply (when opening the database). 808 assert(descriptor_file_ == nullptr); 809 new_manifest_file = DescriptorFileName(dbname_, manifest_file_number_); 810 edit->SetNextFile(next_file_number_); 811 s = env_->NewWritableFile(new_manifest_file, &descriptor_file_); 812 if (s.ok()) { 813 descriptor_log_ = new log::Writer(descriptor_file_); 814 s = WriteSnapshot(descriptor_log_); 815 } 816 } 817 818 // Unlock during expensive MANIFEST log write 819 { 820 mu->Unlock(); 821 822 // Write new record to MANIFEST log 823 if (s.ok()) { 824 std::string record; 825 edit->EncodeTo(&record); 826 s = descriptor_log_->AddRecord(record); 827 if (s.ok()) { 828 s = descriptor_file_->Sync(); 829 } 830 if (!s.ok()) { 831 Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); 832 } 833 } 834 835 // If we just created a new descriptor file, install it by writing a 836 // new CURRENT file that points to it. 
837 if (s.ok() && !new_manifest_file.empty()) { 838 s = SetCurrentFile(env_, dbname_, manifest_file_number_); 839 } 840 841 mu->Lock(); 842 } 843 844 // Install the new version 845 if (s.ok()) { 846 AppendVersion(v); 847 log_number_ = edit->log_number_; 848 prev_log_number_ = edit->prev_log_number_; 849 } else { 850 delete v; 851 if (!new_manifest_file.empty()) { 852 delete descriptor_log_; 853 delete descriptor_file_; 854 descriptor_log_ = nullptr; 855 descriptor_file_ = nullptr; 856 env_->DeleteFile(new_manifest_file); 857 } 858 } 859 860 return s; 861 } 862 863 Status VersionSet::Recover(bool* save_manifest) { 864 struct LogReporter : public log::Reader::Reporter { 865 Status* status; 866 void Corruption(size_t bytes, const Status& s) override { 867 if (this->status->ok()) *this->status = s; 868 } 869 }; 870 871 // Read "CURRENT" file, which contains a pointer to the current manifest file 872 std::string current; 873 Status s = ReadFileToString(env_, CurrentFileName(dbname_), ¤t); 874 if (!s.ok()) { 875 return s; 876 } 877 if (current.empty() || current[current.size() - 1] != '\n') { 878 return Status::Corruption("CURRENT file does not end with newline"); 879 } 880 current.resize(current.size() - 1); 881 882 std::string dscname = dbname_ + "/" + current; 883 SequentialFile* file; 884 s = env_->NewSequentialFile(dscname, &file); 885 if (!s.ok()) { 886 if (s.IsNotFound()) { 887 return Status::Corruption("CURRENT points to a non-existent file", 888 s.ToString()); 889 } 890 return s; 891 } 892 893 bool have_log_number = false; 894 bool have_prev_log_number = false; 895 bool have_next_file = false; 896 bool have_last_sequence = false; 897 uint64_t next_file = 0; 898 uint64_t last_sequence = 0; 899 uint64_t log_number = 0; 900 uint64_t prev_log_number = 0; 901 Builder builder(this, current_); 902 903 { 904 LogReporter reporter; 905 reporter.status = &s; 906 log::Reader reader(file, &reporter, true /*checksum*/, 907 0 /*initial_offset*/); 908 Slice record; 909 
    // Replay every VersionEdit record in the descriptor, accumulating the
    // file-set changes in "builder" and remembering the most recent scalar
    // metadata values (log numbers, next file number, last sequence).
    std::string scratch;
    while (reader.ReadRecord(&record, &scratch) && s.ok()) {
      VersionEdit edit;
      s = edit.DecodeFrom(record);
      if (s.ok()) {
        // The comparator determines the on-disk key ordering; refuse to open
        // a DB that was written with a different comparator.
        if (edit.has_comparator_ &&
            edit.comparator_ != icmp_.user_comparator()->Name()) {
          s = Status::InvalidArgument(
              edit.comparator_ + " does not match existing comparator ",
              icmp_.user_comparator()->Name());
        }
      }

      if (s.ok()) {
        builder.Apply(&edit);
      }

      // Later records override earlier ones for these fields.
      if (edit.has_log_number_) {
        log_number = edit.log_number_;
        have_log_number = true;
      }

      if (edit.has_prev_log_number_) {
        prev_log_number = edit.prev_log_number_;
        have_prev_log_number = true;
      }

      if (edit.has_next_file_number_) {
        next_file = edit.next_file_number_;
        have_next_file = true;
      }

      if (edit.has_last_sequence_) {
        last_sequence = edit.last_sequence_;
        have_last_sequence = true;
      }
    }
  }
  delete file;
  file = nullptr;

  if (s.ok()) {
    // These fields are mandatory in a well-formed MANIFEST.
    if (!have_next_file) {
      s = Status::Corruption("no meta-nextfile entry in descriptor");
    } else if (!have_log_number) {
      s = Status::Corruption("no meta-lognumber entry in descriptor");
    } else if (!have_last_sequence) {
      s = Status::Corruption("no last-sequence-number entry in descriptor");
    }

    // prev_log_number is optional; absent means "no previous log".
    if (!have_prev_log_number) {
      prev_log_number = 0;
    }

    MarkFileNumberUsed(prev_log_number);
    MarkFileNumberUsed(log_number);
  }

  if (s.ok()) {
    Version* v = new Version(this);
    builder.SaveTo(v);
    // Install recovered version
    Finalize(v);
    AppendVersion(v);
    manifest_file_number_ = next_file;
    next_file_number_ = next_file + 1;
    last_sequence_ = last_sequence;
    log_number_ = log_number;
    prev_log_number_ = prev_log_number;

    // See if we can reuse the existing MANIFEST file.
    if (ReuseManifest(dscname, current)) {
      // No need to save new manifest
    } else {
      *save_manifest = true;
    }
  }

  return s;
}

// Decides whether the existing MANIFEST named by "dscname" can simply be
// appended to instead of writing a fresh snapshot.  Reuse requires
// options_->reuse_logs, a parseable descriptor filename, a readable file
// size, and the file being smaller than the target file size.  On success
// installs descriptor_file_/descriptor_log_ and returns true.
bool VersionSet::ReuseManifest(const std::string& dscname,
                               const std::string& dscbase) {
  if (!options_->reuse_logs) {
    return false;
  }
  FileType manifest_type;
  uint64_t manifest_number;
  uint64_t manifest_size;
  if (!ParseFileName(dscbase, &manifest_number, &manifest_type) ||
      manifest_type != kDescriptorFile ||
      !env_->GetFileSize(dscname, &manifest_size).ok() ||
      // Make new compacted MANIFEST if old one is too big
      manifest_size >= TargetFileSize(options_)) {
    return false;
  }

  assert(descriptor_file_ == nullptr);
  assert(descriptor_log_ == nullptr);
  Status r = env_->NewAppendableFile(dscname, &descriptor_file_);
  if (!r.ok()) {
    // Reuse is best-effort: on failure fall back to writing a new MANIFEST.
    Log(options_->info_log, "Reuse MANIFEST: %s\n", r.ToString().c_str());
    assert(descriptor_file_ == nullptr);
    return false;
  }

  Log(options_->info_log, "Reusing MANIFEST %s\n", dscname.c_str());
  descriptor_log_ = new log::Writer(descriptor_file_, manifest_size);
  manifest_file_number_ = manifest_number;
  return true;
}

// Records that file number "number" is in use, bumping next_file_number_
// past it so it is never handed out again.
void VersionSet::MarkFileNumberUsed(uint64_t number) {
  if (next_file_number_ <= number) {
    next_file_number_ = number + 1;
  }
}

// Precomputes v->compaction_level_ / v->compaction_score_: the level most
// in need of compaction and its urgency (score >= 1 means "compact me").
void VersionSet::Finalize(Version* v) {
  // Precomputed best level for next compaction
  int best_level = -1;
  double best_score = -1;

  for (int level = 0; level < config::kNumLevels - 1; level++) {
    double score;
    if (level == 0) {
      // We treat level-0 specially by bounding the number of files
      // instead of number of bytes for two reasons:
      //
      // (1) With larger write-buffer sizes, it is nice not to do too
      // many level-0 compactions.
      //
      // (2) The files in level-0 are merged on every read and
      // therefore we wish to avoid too many files when the individual
      // file size is small (perhaps because of a small write-buffer
      // setting, or very high compression ratios, or lots of
      // overwrites/deletions).
      score = v->files_[level].size() /
              static_cast<double>(config::kL0_CompactionTrigger);
    } else {
      // Compute the ratio of current size to size limit.
      const uint64_t level_bytes = TotalFileSize(v->files_[level]);
      score =
          static_cast<double>(level_bytes) / MaxBytesForLevel(options_, level);
    }

    if (score > best_score) {
      best_level = level;
      best_score = score;
    }
  }

  v->compaction_level_ = best_level;
  v->compaction_score_ = best_score;
}

// Serializes the current version state (comparator name, compaction
// pointers, and every live file) into a single VersionEdit record and
// appends it to "log".
Status VersionSet::WriteSnapshot(log::Writer* log) {
  // TODO: Break up into multiple records to reduce memory usage on recovery?

  // Save metadata
  VersionEdit edit;
  edit.SetComparatorName(icmp_.user_comparator()->Name());

  // Save compaction pointers
  for (int level = 0; level < config::kNumLevels; level++) {
    if (!compact_pointer_[level].empty()) {
      InternalKey key;
      key.DecodeFrom(compact_pointer_[level]);
      edit.SetCompactPointer(level, key);
    }
  }

  // Save files
  for (int level = 0; level < config::kNumLevels; level++) {
    const std::vector<FileMetaData*>& files = current_->files_[level];
    for (size_t i = 0; i < files.size(); i++) {
      const FileMetaData* f = files[i];
      edit.AddFile(level, f->number, f->file_size, f->smallest, f->largest);
    }
  }

  std::string record;
  edit.EncodeTo(&record);
  return log->AddRecord(record);
}

// Returns the number of files at "level" in the current version.
int VersionSet::NumLevelFiles(int level) const {
  assert(level >= 0);
  assert(level < config::kNumLevels);
  return current_->files_[level].size();
}

// Formats a human-readable per-level file-count summary into *scratch and
// returns a pointer to that buffer.
const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const {
  // Update code if kNumLevels changes
  static_assert(config::kNumLevels == 7, "");
  snprintf(scratch->buffer, sizeof(scratch->buffer),
           "files[ %d %d %d %d %d %d %d ]", int(current_->files_[0].size()),
           int(current_->files_[1].size()), int(current_->files_[2].size()),
           int(current_->files_[3].size()), int(current_->files_[4].size()),
           int(current_->files_[5].size()), int(current_->files_[6].size()));
  return scratch->buffer;
}

// Returns an approximate byte offset of "ikey" within version "v": roughly
// how many bytes of table data in the version precede "ikey".
uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
  uint64_t result = 0;
  for (int level = 0; level < config::kNumLevels; level++) {
    const std::vector<FileMetaData*>& files = v->files_[level];
    for (size_t i = 0; i < files.size(); i++) {
      if (icmp_.Compare(files[i]->largest, ikey) <= 0) {
        // Entire file is before "ikey", so just add the file size
        result += files[i]->file_size;
      } else if (icmp_.Compare(files[i]->smallest, ikey) > 0) {
        // Entire file is after "ikey", so ignore
        if (level > 0) {
          // Files other than level 0 are sorted by meta->smallest, so
          // no further files in this level will contain data for
          // "ikey".
          break;
        }
      } else {
        // "ikey" falls in the range for this table. Add the
        // approximate offset of "ikey" within the table.
        Table* tableptr;
        Iterator* iter = table_cache_->NewIterator(
            ReadOptions(), files[i]->number, files[i]->file_size, &tableptr);
        if (tableptr != nullptr) {
          result += tableptr->ApproximateOffsetOf(ikey.Encode());
        }
        delete iter;
      }
    }
  }
  return result;
}

// Inserts the number of every file referenced by any live version (not
// just the current one) into *live.
void VersionSet::AddLiveFiles(std::set<uint64_t>* live) {
  for (Version* v = dummy_versions_.next_; v != &dummy_versions_;
       v = v->next_) {
    for (int level = 0; level < config::kNumLevels; level++) {
      const std::vector<FileMetaData*>& files = v->files_[level];
      for (size_t i = 0; i < files.size(); i++) {
        live->insert(files[i]->number);
      }
    }
  }
}

// Returns the total file size at "level" in the current version.
int64_t VersionSet::NumLevelBytes(int level) const {
  assert(level >= 0);
  assert(level < config::kNumLevels);
  return TotalFileSize(current_->files_[level]);
}

// Returns the maximum, over every file at levels 1..kNumLevels-2, of the
// number of bytes in the next level that overlap that file.
int64_t VersionSet::MaxNextLevelOverlappingBytes() {
  int64_t result = 0;
  std::vector<FileMetaData*> overlaps;
  for (int level = 1; level < config::kNumLevels - 1; level++) {
    for (size_t i = 0; i < current_->files_[level].size(); i++) {
      const FileMetaData* f = current_->files_[level][i];
      current_->GetOverlappingInputs(level + 1, &f->smallest, &f->largest,
                                     &overlaps);
      const int64_t sum = TotalFileSize(overlaps);
      if (sum > result) {
        result = sum;
      }
    }
  }
  return result;
}

// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
1181 // REQUIRES: inputs is not empty 1182 void VersionSet::GetRange(const std::vector<FileMetaData*>& inputs, 1183 InternalKey* smallest, InternalKey* largest) { 1184 assert(!inputs.empty()); 1185 smallest->Clear(); 1186 largest->Clear(); 1187 for (size_t i = 0; i < inputs.size(); i++) { 1188 FileMetaData* f = inputs[i]; 1189 if (i == 0) { 1190 *smallest = f->smallest; 1191 *largest = f->largest; 1192 } else { 1193 if (icmp_.Compare(f->smallest, *smallest) < 0) { 1194 *smallest = f->smallest; 1195 } 1196 if (icmp_.Compare(f->largest, *largest) > 0) { 1197 *largest = f->largest; 1198 } 1199 } 1200 } 1201 } 1202 1203 // Stores the minimal range that covers all entries in inputs1 and inputs2 1204 // in *smallest, *largest. 1205 // REQUIRES: inputs is not empty 1206 void VersionSet::GetRange2(const std::vector<FileMetaData*>& inputs1, 1207 const std::vector<FileMetaData*>& inputs2, 1208 InternalKey* smallest, InternalKey* largest) { 1209 std::vector<FileMetaData*> all = inputs1; 1210 all.insert(all.end(), inputs2.begin(), inputs2.end()); 1211 GetRange(all, smallest, largest); 1212 } 1213 1214 Iterator* VersionSet::MakeInputIterator(Compaction* c) { 1215 ReadOptions options; 1216 options.verify_checksums = options_->paranoid_checks; 1217 options.fill_cache = false; 1218 1219 // Level-0 files have to be merged together. For other levels, 1220 // we will make a concatenating iterator per level. 1221 // TODO(opt): use concatenating iterator for level-0 if there is no overlap 1222 const int space = (c->level() == 0 ? 
c->inputs_[0].size() + 1 : 2); 1223 Iterator** list = new Iterator*[space]; 1224 int num = 0; 1225 for (int which = 0; which < 2; which++) { 1226 if (!c->inputs_[which].empty()) { 1227 if (c->level() + which == 0) { 1228 const std::vector<FileMetaData*>& files = c->inputs_[which]; 1229 for (size_t i = 0; i < files.size(); i++) { 1230 list[num++] = table_cache_->NewIterator(options, files[i]->number, 1231 files[i]->file_size); 1232 } 1233 } else { 1234 // Create concatenating iterator for the files from this level 1235 list[num++] = NewTwoLevelIterator( 1236 new Version::LevelFileNumIterator(icmp_, &c->inputs_[which]), 1237 &GetFileIterator, table_cache_, options); 1238 } 1239 } 1240 } 1241 assert(num <= space); 1242 Iterator* result = NewMergingIterator(&icmp_, list, num); 1243 delete[] list; 1244 return result; 1245 } 1246 1247 Compaction* VersionSet::PickCompaction() { 1248 Compaction* c; 1249 int level; 1250 1251 // We prefer compactions triggered by too much data in a level over 1252 // the compactions triggered by seeks. 
1253 const bool size_compaction = (current_->compaction_score_ >= 1); 1254 const bool seek_compaction = (current_->file_to_compact_ != nullptr); 1255 if (size_compaction) { 1256 level = current_->compaction_level_; 1257 assert(level >= 0); 1258 assert(level + 1 < config::kNumLevels); 1259 c = new Compaction(options_, level); 1260 1261 // Pick the first file that comes after compact_pointer_[level] 1262 for (size_t i = 0; i < current_->files_[level].size(); i++) { 1263 FileMetaData* f = current_->files_[level][i]; 1264 if (compact_pointer_[level].empty() || 1265 icmp_.Compare(f->largest.Encode(), compact_pointer_[level]) > 0) { 1266 c->inputs_[0].push_back(f); 1267 break; 1268 } 1269 } 1270 if (c->inputs_[0].empty()) { 1271 // Wrap-around to the beginning of the key space 1272 c->inputs_[0].push_back(current_->files_[level][0]); 1273 } 1274 } else if (seek_compaction) { 1275 level = current_->file_to_compact_level_; 1276 c = new Compaction(options_, level); 1277 c->inputs_[0].push_back(current_->file_to_compact_); 1278 } else { 1279 return nullptr; 1280 } 1281 1282 c->input_version_ = current_; 1283 c->input_version_->Ref(); 1284 1285 // Files in level 0 may overlap each other, so pick up all overlapping ones 1286 if (level == 0) { 1287 InternalKey smallest, largest; 1288 GetRange(c->inputs_[0], &smallest, &largest); 1289 // Note that the next call will discard the file we placed in 1290 // c->inputs_[0] earlier and replace it with an overlapping set 1291 // which will include the picked file. 1292 current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); 1293 assert(!c->inputs_[0].empty()); 1294 } 1295 1296 SetupOtherInputs(c); 1297 1298 return c; 1299 } 1300 1301 // Finds the largest key in a vector of files. Returns true if files it not 1302 // empty. 
1303 bool FindLargestKey(const InternalKeyComparator& icmp, 1304 const std::vector<FileMetaData*>& files, 1305 InternalKey* largest_key) { 1306 if (files.empty()) { 1307 return false; 1308 } 1309 *largest_key = files[0]->largest; 1310 for (size_t i = 1; i < files.size(); ++i) { 1311 FileMetaData* f = files[i]; 1312 if (icmp.Compare(f->largest, *largest_key) > 0) { 1313 *largest_key = f->largest; 1314 } 1315 } 1316 return true; 1317 } 1318 1319 // Finds minimum file b2=(l2, u2) in level file for which l2 > u1 and 1320 // user_key(l2) = user_key(u1) 1321 FileMetaData* FindSmallestBoundaryFile( 1322 const InternalKeyComparator& icmp, 1323 const std::vector<FileMetaData*>& level_files, 1324 const InternalKey& largest_key) { 1325 const Comparator* user_cmp = icmp.user_comparator(); 1326 FileMetaData* smallest_boundary_file = nullptr; 1327 for (size_t i = 0; i < level_files.size(); ++i) { 1328 FileMetaData* f = level_files[i]; 1329 if (icmp.Compare(f->smallest, largest_key) > 0 && 1330 user_cmp->Compare(f->smallest.user_key(), largest_key.user_key()) == 1331 0) { 1332 if (smallest_boundary_file == nullptr || 1333 icmp.Compare(f->smallest, smallest_boundary_file->smallest) < 0) { 1334 smallest_boundary_file = f; 1335 } 1336 } 1337 } 1338 return smallest_boundary_file; 1339 } 1340 1341 // Extracts the largest file b1 from |compaction_files| and then searches for a 1342 // b2 in |level_files| for which user_key(u1) = user_key(l2). If it finds such a 1343 // file b2 (known as a boundary file) it adds it to |compaction_files| and then 1344 // searches again using this new upper bound. 1345 // 1346 // If there are two blocks, b1=(l1, u1) and b2=(l2, u2) and 1347 // user_key(u1) = user_key(l2), and if we compact b1 but not b2 then a 1348 // subsequent get operation will yield an incorrect result because it will 1349 // return the record from b2 in level i rather than from b1 because it searches 1350 // level by level for records matching the supplied user key. 
1351 // 1352 // parameters: 1353 // in level_files: List of files to search for boundary files. 1354 // in/out compaction_files: List of files to extend by adding boundary files. 1355 void AddBoundaryInputs(const InternalKeyComparator& icmp, 1356 const std::vector<FileMetaData*>& level_files, 1357 std::vector<FileMetaData*>* compaction_files) { 1358 InternalKey largest_key; 1359 1360 // Quick return if compaction_files is empty. 1361 if (!FindLargestKey(icmp, *compaction_files, &largest_key)) { 1362 return; 1363 } 1364 1365 bool continue_searching = true; 1366 while (continue_searching) { 1367 FileMetaData* smallest_boundary_file = 1368 FindSmallestBoundaryFile(icmp, level_files, largest_key); 1369 1370 // If a boundary file was found advance largest_key, otherwise we're done. 1371 if (smallest_boundary_file != NULL) { 1372 compaction_files->push_back(smallest_boundary_file); 1373 largest_key = smallest_boundary_file->largest; 1374 } else { 1375 continue_searching = false; 1376 } 1377 } 1378 } 1379 1380 void VersionSet::SetupOtherInputs(Compaction* c) { 1381 const int level = c->level(); 1382 InternalKey smallest, largest; 1383 1384 AddBoundaryInputs(icmp_, current_->files_[level], &c->inputs_[0]); 1385 GetRange(c->inputs_[0], &smallest, &largest); 1386 1387 current_->GetOverlappingInputs(level + 1, &smallest, &largest, 1388 &c->inputs_[1]); 1389 1390 // Get entire range covered by compaction 1391 InternalKey all_start, all_limit; 1392 GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); 1393 1394 // See if we can grow the number of inputs in "level" without 1395 // changing the number of "level+1" files we pick up. 
1396 if (!c->inputs_[1].empty()) { 1397 std::vector<FileMetaData*> expanded0; 1398 current_->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0); 1399 AddBoundaryInputs(icmp_, current_->files_[level], &expanded0); 1400 const int64_t inputs0_size = TotalFileSize(c->inputs_[0]); 1401 const int64_t inputs1_size = TotalFileSize(c->inputs_[1]); 1402 const int64_t expanded0_size = TotalFileSize(expanded0); 1403 if (expanded0.size() > c->inputs_[0].size() && 1404 inputs1_size + expanded0_size < 1405 ExpandedCompactionByteSizeLimit(options_)) { 1406 InternalKey new_start, new_limit; 1407 GetRange(expanded0, &new_start, &new_limit); 1408 std::vector<FileMetaData*> expanded1; 1409 current_->GetOverlappingInputs(level + 1, &new_start, &new_limit, 1410 &expanded1); 1411 if (expanded1.size() == c->inputs_[1].size()) { 1412 Log(options_->info_log, 1413 "Expanding@%d %d+%d (%ld+%ld bytes) to %d+%d (%ld+%ld bytes)\n", 1414 level, int(c->inputs_[0].size()), int(c->inputs_[1].size()), 1415 long(inputs0_size), long(inputs1_size), int(expanded0.size()), 1416 int(expanded1.size()), long(expanded0_size), long(inputs1_size)); 1417 smallest = new_start; 1418 largest = new_limit; 1419 c->inputs_[0] = expanded0; 1420 c->inputs_[1] = expanded1; 1421 GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); 1422 } 1423 } 1424 } 1425 1426 // Compute the set of grandparent files that overlap this compaction 1427 // (parent == level+1; grandparent == level+2) 1428 if (level + 2 < config::kNumLevels) { 1429 current_->GetOverlappingInputs(level + 2, &all_start, &all_limit, 1430 &c->grandparents_); 1431 } 1432 1433 // Update the place where we will do the next compaction for this level. 1434 // We update this immediately instead of waiting for the VersionEdit 1435 // to be applied so that if the compaction fails, we will try a different 1436 // key range next time. 
1437 compact_pointer_[level] = largest.Encode().ToString(); 1438 c->edit_.SetCompactPointer(level, largest); 1439 } 1440 1441 Compaction* VersionSet::CompactRange(int level, const InternalKey* begin, 1442 const InternalKey* end) { 1443 std::vector<FileMetaData*> inputs; 1444 current_->GetOverlappingInputs(level, begin, end, &inputs); 1445 if (inputs.empty()) { 1446 return nullptr; 1447 } 1448 1449 // Avoid compacting too much in one shot in case the range is large. 1450 // But we cannot do this for level-0 since level-0 files can overlap 1451 // and we must not pick one file and drop another older file if the 1452 // two files overlap. 1453 if (level > 0) { 1454 const uint64_t limit = MaxFileSizeForLevel(options_, level); 1455 uint64_t total = 0; 1456 for (size_t i = 0; i < inputs.size(); i++) { 1457 uint64_t s = inputs[i]->file_size; 1458 total += s; 1459 if (total >= limit) { 1460 inputs.resize(i + 1); 1461 break; 1462 } 1463 } 1464 } 1465 1466 Compaction* c = new Compaction(options_, level); 1467 c->input_version_ = current_; 1468 c->input_version_->Ref(); 1469 c->inputs_[0] = inputs; 1470 SetupOtherInputs(c); 1471 return c; 1472 } 1473 1474 Compaction::Compaction(const Options* options, int level) 1475 : level_(level), 1476 max_output_file_size_(MaxFileSizeForLevel(options, level)), 1477 input_version_(nullptr), 1478 grandparent_index_(0), 1479 seen_key_(false), 1480 overlapped_bytes_(0) { 1481 for (int i = 0; i < config::kNumLevels; i++) { 1482 level_ptrs_[i] = 0; 1483 } 1484 } 1485 1486 Compaction::~Compaction() { 1487 if (input_version_ != nullptr) { 1488 input_version_->Unref(); 1489 } 1490 } 1491 1492 bool Compaction::IsTrivialMove() const { 1493 const VersionSet* vset = input_version_->vset_; 1494 // Avoid a move if there is lots of overlapping grandparent data. 1495 // Otherwise, the move could create a parent file that will require 1496 // a very expensive merge later on. 
1497 return (num_input_files(0) == 1 && num_input_files(1) == 0 && 1498 TotalFileSize(grandparents_) <= 1499 MaxGrandParentOverlapBytes(vset->options_)); 1500 } 1501 1502 void Compaction::AddInputDeletions(VersionEdit* edit) { 1503 for (int which = 0; which < 2; which++) { 1504 for (size_t i = 0; i < inputs_[which].size(); i++) { 1505 edit->DeleteFile(level_ + which, inputs_[which][i]->number); 1506 } 1507 } 1508 } 1509 1510 bool Compaction::IsBaseLevelForKey(const Slice& user_key) { 1511 // Maybe use binary search to find right entry instead of linear search? 1512 const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); 1513 for (int lvl = level_ + 2; lvl < config::kNumLevels; lvl++) { 1514 const std::vector<FileMetaData*>& files = input_version_->files_[lvl]; 1515 while (level_ptrs_[lvl] < files.size()) { 1516 FileMetaData* f = files[level_ptrs_[lvl]]; 1517 if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { 1518 // We've advanced far enough 1519 if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { 1520 // Key falls in this file's range, so definitely not base level 1521 return false; 1522 } 1523 break; 1524 } 1525 level_ptrs_[lvl]++; 1526 } 1527 } 1528 return true; 1529 } 1530 1531 bool Compaction::ShouldStopBefore(const Slice& internal_key) { 1532 const VersionSet* vset = input_version_->vset_; 1533 // Scan to find earliest grandparent file that contains key. 
1534 const InternalKeyComparator* icmp = &vset->icmp_; 1535 while (grandparent_index_ < grandparents_.size() && 1536 icmp->Compare(internal_key, 1537 grandparents_[grandparent_index_]->largest.Encode()) > 1538 0) { 1539 if (seen_key_) { 1540 overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; 1541 } 1542 grandparent_index_++; 1543 } 1544 seen_key_ = true; 1545 1546 if (overlapped_bytes_ > MaxGrandParentOverlapBytes(vset->options_)) { 1547 // Too much overlap for current output; start new output 1548 overlapped_bytes_ = 0; 1549 return true; 1550 } else { 1551 return false; 1552 } 1553 } 1554 1555 void Compaction::ReleaseInputs() { 1556 if (input_version_ != nullptr) { 1557 input_version_->Unref(); 1558 input_version_ = nullptr; 1559 } 1560 } 1561 1562 } // namespace leveldb