index.pyx
cimport cython

import numpy as np

cimport numpy as cnp
from numpy cimport (
    float32_t,
    float64_t,
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)

cnp.import_array()


from pandas._libs cimport util
from pandas._libs.hashtable cimport HashTable
from pandas._libs.tslibs.nattype cimport c_NaT as NaT
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport _Timedelta
from pandas._libs.tslibs.timestamps cimport _Timestamp

from pandas._libs import (
    algos,
    hashtable as _hash,
)

from pandas._libs.lib cimport eq_NA_compat
from pandas._libs.missing cimport (
    C_NA as NA,
    checknull,
    is_matching_na,
)


cdef inline bint is_definitely_invalid_key(object val):
    # An unhashable object can never be a valid Index key; hashable objects
    # may still fail lookup later, so False here is not a guarantee of validity.
    try:
        hash(val)
    except TypeError:
        return True
    return False


cdef ndarray _get_bool_indexer(ndarray values, object val):
    """
    Return a ndarray[bool] of locations where val matches self.values.

    If val is not NA, this is equivalent to `self.values == val`
    """
    # Caller is responsible for ensuring _check_type has already been called
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] indexer
        Py_ssize_t i
        object item

    if values.descr.type_num == cnp.NPY_OBJECT:
        # i.e. values.dtype == object
        if not checknull(val):
            indexer = eq_NA_compat(values, val)

        else:
            # We need to check for _matching_ NA values
            # (plain == would not distinguish np.nan from NaT, etc.)
            indexer = np.empty(len(values), dtype=np.uint8)

            for i in range(len(values)):
                item = values[i]
                indexer[i] = is_matching_na(item, val)

    else:
        if util.is_nan(val):
            # NaN != NaN, so equality would miss all NaN positions
            indexer = np.isnan(values)
        else:
            indexer = values == val

    return indexer.view(bool)


# Don't populate hash tables in monotonic indexes larger than this
_SIZE_CUTOFF = 1_000_000


cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
    """
    Possibly unpack a boolean mask to a single indexer.

    Raises KeyError(val) when the mask has no True entries.
    """
    # Returns ndarray[bool] or int
    cdef:
        ndarray[intp_t, ndim=1] found
        int count

    found = np.where(indexer)[0]
    count = len(found)

    if count > 1:
        return indexer
    if count == 1:
        return int(found[0])

    raise KeyError(val)


@cython.freelist(32)
cdef class IndexEngine:
    """
    Base lookup engine backing a pandas Index: maps scalar keys to positions
    via either a lazily-built hash table or (for very large monotonic
    indexes) binary search.
    """

    cdef readonly:
        ndarray values
        HashTable mapping
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check
        object _np_type

    def __init__(self, ndarray values):
        self.values = values

        # Above this size we prefer searchsorted over building a hash table
        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.clear_mapping()
        self._np_type = values.dtype.type

    def __contains__(self, val: object) -> bool:
        # hash() first so unhashable keys raise TypeError rather than
        # being reported as merely absent
        hash(val)
        try:
            self.get_loc(val)
        except KeyError:
            return False
        return True

    cpdef get_loc(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        val = self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            # Large monotonic index: binary search instead of hash table
            if not self.is_unique:
                return self._get_loc_duplicates(val)
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except OverflowError as err:
            # GH#41775 OverflowError e.g. if we are uint64 and val is -1
            # or if we are int64 and value is np.iinfo(np.int64).max+1
            # (the uint64 with -1 case should actually be excluded by _check_type)
            raise KeyError(val) from err

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        # Caller is responsible for ensuring _check_type has already been called
        loc = self.values.searchsorted(self._np_type(val), side="left")
        return loc

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff, left, right

        if self.is_monotonic_increasing:
            # Duplicates in a monotonic index are contiguous, so a
            # [left, right) searchsorted window captures them all
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                # 2021-09-29 Now only reached for object-dtype
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        return self._maybe_get_bool_indexer(val)

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = _get_bool_indexer(self.values, val)
        return _unpack_bool_indexer(indexer, val)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        if not self.is_mapping_populated:
            return 0
        return self.mapping.sizeof(deep=deep)

    def __sizeof__(self) -> int:
        return self.sizeof()

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            self._do_unique_check()

        return self.unique == 1

    cdef inline _do_unique_check(self):

        # this de-facto the same
        self._ensure_mapping_populated()

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef inline _do_monotonic_check(self):
        # Computes monotonic_inc/monotonic_dec once and caches them;
        # non-comparable values (TypeError) are treated as non-monotonic.
        cdef:
            bint is_unique
        try:
            values = self.values
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(values)
        except TypeError:
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    cdef _make_hash_table(self, Py_ssize_t n):
        # Subclasses return the dtype-appropriate hashtable class
        raise NotImplementedError  # pragma: no cover

    cdef _check_type(self, object val):
        hash(val)
        return val

    @property
    def is_mapping_populated(self) -> bool:
        return self.mapping is not None

    cdef inline _ensure_mapping_populated(self):
        # this populates the mapping
        # if its not already populated
        # also satisfies the need_unique_check

        if not self.is_mapping_populated:

            values = self.values
            self.mapping = self._make_hash_table(len(values))
            self.mapping.map_locations(values)

            # A hash table has one entry per distinct value, so equal
            # lengths prove uniqueness
            if len(self.mapping) == len(values):
                self.unique = 1

        self.need_unique_check = 0

    def clear_mapping(self):
        # Drop the hash table and invalidate all cached properties
        self.mapping = None
        self.need_monotonic_check = 1
        self.need_unique_check = 1

        self.unique = 0
        self.monotonic_inc = 0
        self.monotonic_dec = 0

    def get_indexer(self, ndarray values) -> np.ndarray:
        self._ensure_mapping_populated()
        return self.mapping.lookup(values)

    def get_indexer_non_unique(self, ndarray targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            ndarray values
            ndarray[intp_t] result, missing
            set stargets, remaining_stargets, found_nas
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end
            bint check_na_values = False

        values = self.values
        stargets = set(targets)

        n = len(values)
        n_t = len(targets)
        # result grows in 10_000-element chunks as matches are appended
        if n > 10_000:
            n_alloc = 10_000
        else:
            n_alloc = n

        result = np.empty(n_alloc, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # map each starget to its position in the index
        if (
                stargets and
                len(stargets) < 5 and
                not any([checknull(t) for t in stargets]) and
                self.is_monotonic_increasing
        ):
            # if there are few enough stargets and the index is monotonically
            # increasing, then use binary search for each starget
            remaining_stargets = set()
            for starget in stargets:
                try:
                    start = values.searchsorted(starget, side='left')
                    end = values.searchsorted(starget, side='right')
                except TypeError:  # e.g. if we tried to search for string in int array
                    remaining_stargets.add(starget)
                else:
                    if start != end:
                        d[starget] = list(range(start, end))

            stargets = remaining_stargets

        if stargets:
            # otherwise, map by iterating through all items in the index

            # short-circuit na check
            if values.dtype == object:
                check_na_values = True
                # keep track of nas in values
                found_nas = set()

            for i in range(n):
                val = values[i]

                # GH#43870
                # handle lookup for nas
                # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
                if check_na_values and checknull(val):
                    match = [na for na in found_nas if is_matching_na(val, na)]

                    # matching na not found
                    if not len(match):
                        found_nas.add(val)

                        # add na to stargets to utilize `in` for stargets/d lookup
                        match_stargets = [
                            x for x in stargets if is_matching_na(val, x)
                        ]

                        if len(match_stargets):
                            # add our 'standardized' na
                            stargets.add(val)

                    # matching na found
                    else:
                        assert len(match) == 1
                        val = match[0]

                if val in stargets:
                    if val not in d:
                        d[val] = []
                    d[val].append(i)

        for i in range(n_t):
            val = targets[i]

            # ensure there are nas in values before looking for a matching na
            if check_na_values and checknull(val):
                match = [na for na in found_nas if is_matching_na(val, na)]
                if len(match):
                    assert len(match) == 1
                    val = match[0]

            # found
            if val in d:
                key = val

                for j in d[key]:

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)

                    result[count] = j
                    count += 1

            # value not found
            else:

                if count >= n_alloc:
                    n_alloc += 10_000
                    result = np.resize(result, n_alloc)
                result[count] = -1
                count += 1
                missing[count_missing] = i
                count_missing += 1

        return result[0:count], missing[0:count_missing]


cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    # (treats a tuple `val` as a sequence of keys instead of a single key),
    # so we implement something similar.
    # This is equivalent to the stdlib's bisect.bisect_left
    # NOTE(review): when len(values) == 1 (hi == 0) this returns len(values)
    # even if val == values[0], and an empty array would index out of bounds;
    # presumably only reached via the over-size-threshold path where
    # len(values) >= _SIZE_CUTOFF — confirm before reusing elsewhere.

    cdef:
        Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
        object pval

    if hi == 0 or (hi > 0 and val > values[hi]):
        return len(values)

    while lo < hi:
        mid = (lo + hi) // 2
        pval = values[mid]
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            # exact hit: walk left to the first equal element
            while mid > 0 and val == values[mid - 1]:
                mid -= 1
            return mid

    if val <= values[mid]:
        return mid
    else:
        return mid + 1


cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """
    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        # using values.searchsorted here would treat a tuple `val` as a sequence
        # instead of a single key, so we use a different implementation
        try:
            loc = _bin_search(self.values, val)
        except TypeError as err:
            raise KeyError(val) from err
        return loc


cdef class DatetimeEngine(Int64Engine):
    """
    Engine for datetime64 indexes: keys are Timestamps/NaT, stored as
    their int64 epoch representation in the underlying Int64Engine.
    """

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NB: caller is responsible for ensuring tzawareness compat
        # before we get here
        if not (isinstance(scalar, _Timestamp) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        # - val is hashable
        self._unbox_scalar(val)
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)

        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        # Welcome to the spaghetti factory
        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(conv)
            values = self.values

            loc = values.searchsorted(conv, side='left')

            if loc == len(values) or values[loc] != conv:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(conv)

        try:
            return self.mapping.get_item(conv)
        except KeyError:
            # re-raise with the original (un-unboxed) key for a clearer message
            raise KeyError(val)


cdef class TimedeltaEngine(DatetimeEngine):
    """
    Engine for timedelta64 indexes; only the unboxing differs from
    DatetimeEngine.
    """

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if not (isinstance(scalar, _Timedelta) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value


cdef class PeriodEngine(Int64Engine):
    """
    Engine for Period indexes: keys are Period/NaT, stored as their int64
    ordinals in the underlying Int64Engine.
    """

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if scalar is NaT:
            return scalar.value
        if is_period_object(scalar):
            # NB: we assume that we have the correct freq here.
            return scalar.ordinal
        raise TypeError(scalar)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        # with either a Period or NaT
        cdef:
            int64_t conv

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)


cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        # +1 matches the shift applied in __init__ (-1/NaN becomes 0)
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
                       in zip(self.levels, zt)]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target: ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length. Should be the set of
            the MultiIndex's values.
        method: string
            "backfill" or "pad"
        limit: int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        assert method in ("backfill", "pad")
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        target_order = np.argsort(target).astype('int64')
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)
        new_codes, new_target_codes = (
            np.empty((num_values,)).astype('int64'),
            np.empty((num_target_values,)).astype('int64'),
        )

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            # per-level positions, shifted by 1 so NA maps to 0
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        indexer = self._base.get_indexer_non_unique(self, target)

        return indexer

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        # - val is hashable
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False


# Generated from template.
include "index_class_helper.pxi"


cdef class BoolEngine(UInt8Engine):
    """
    Engine for bool-dtype indexes; stores values as uint8 but rejects
    non-bool keys.
    """
    cdef _check_type(self, object val):
        if not util.is_bool_object(val):
            raise KeyError(val)
        return <uint8_t>val


@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
    """
    Lookup engine over an ExtensionArray (no hash table); shared base for
    ExtensionEngine and similar engines.
    """
    cdef readonly:
        object values  # ExtensionArray
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        # - val is hashable
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    def clear_mapping(self):
        # for compat with IndexEngine
        pass

    @property
    def is_unique(self) -> bool:
        if self.need_unique_check:
            arr = self.values.unique()
            self.unique = len(arr) == len(self.values)

            self.need_unique_check = False
        return self.unique

    cdef _do_monotonic_check(self):
        raise NotImplementedError

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=False)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        # no hash table is kept, so nothing to account for
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    cdef _check_type(self, object obj):
        raise NotImplementedError

    cpdef get_loc(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(val)

            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        # NOTE(review): both of the following branches call
        # _get_loc_duplicates; the unique case relies on it resolving to a
        # single position — confirm the duplicated call is intentional.
        if not self.unique:
            return self._get_loc_duplicates(val)

        return self._get_loc_duplicates(val)

    cdef inline _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff

        if self.is_monotonic_increasing:
            # duplicates are contiguous in a monotonic index
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        return self._maybe_get_bool_indexer(val)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        try:
            loc = self.values.searchsorted(val, side="left")
        except TypeError as err:
            # GH#35788 e.g. val=None with float64 values
            raise KeyError(val)
        return loc

    cdef ndarray _get_bool_indexer(self, val):
        raise NotImplementedError

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = self._get_bool_indexer(val)
        return _unpack_bool_indexer(indexer, val)

    def get_indexer(self, values) -> np.ndarray:
        # values : type(self.values)
        # Note: we only get here with self.is_unique
        cdef:
            Py_ssize_t i, N = len(values)

        res = np.empty(N, dtype=np.intp)

        for i in range(N):
            val = values[i]
            try:
                loc = self.get_loc(val)
                # Because we are unique, loc should always be an integer
            except KeyError:
                loc = -1
            else:
                assert util.is_integer_object(loc), (loc, val)
            res[i] = loc

        return res

    def get_indexer_non_unique(self, targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results)
        Parameters
        ----------
        targets : type(self.values)
        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            Py_ssize_t i, N = len(targets)

        indexer = []
        missing = []

        # See also IntervalIndex.get_indexer_pointwise
        for i in range(N):
            val = targets[i]

            try:
                locs = self.get_loc(val)
            except KeyError:
                locs = np.array([-1], dtype=np.intp)
                missing.append(i)
            else:
                # normalize whatever get_loc returned to an intp array
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
                elif util.is_integer_object(locs):
                    locs = np.array([locs], dtype=np.intp)
                else:
                    assert locs.dtype.kind == "b"
                    locs = locs.nonzero()[0]

            indexer.append(locs)

        try:
            indexer = np.concatenate(indexer, dtype=np.intp)
        except TypeError:
            # numpy<1.20 doesn't accept dtype keyword
            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
        missing = np.array(missing, dtype=np.intp)

        return indexer, missing


cdef class ExtensionEngine(SharedEngine):
    """
    SharedEngine specialization for a concrete ExtensionArray, using the
    array's own isna/unique/_rank for NA handling and monotonicity.
    """
    def __init__(self, values: "ExtensionArray"):
        self.values = values

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.need_unique_check = True
        self.need_monotonic_check = True
        self.need_unique_check = True

    cdef _do_monotonic_check(self):
        cdef:
            bint is_unique

        values = self.values
        if values._hasna:
            # NAs make the array non-monotonic by definition; uniqueness
            # can still be determined directly
            self.monotonic_inc = 0
            self.monotonic_dec = 0

            nunique = len(values.unique())
            self.unique = nunique == len(values)
            self.need_unique_check = 0
            return

        try:
            # monotonicity of the values follows monotonicity of their ranks
            ranks = values._rank()

        except TypeError:
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0
        else:
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(ranks)

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef ndarray _get_bool_indexer(self, val):
        if checknull(val):
            return self.values.isna()

        try:
            return self.values == val
        except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
            try:
                return (self.values == val).to_numpy(dtype=bool, na_value=False)
            except (TypeError, AttributeError) as err:
                # e.g. (self.values == val) returned a bool
                # see test_get_loc_generator[string[pyarrow]]
                # e.g. self.value == val raises TypeError bc generator has no len
                # see test_get_loc_generator[string[python]]
                raise KeyError from err

    cdef _check_type(self, object val):
        # only requirement is hashability; raises TypeError otherwise
        hash(val)