# pandas/_libs/index.pyx
   1  cimport cython
   2  
   3  import numpy as np
   4  
   5  cimport numpy as cnp
   6  from numpy cimport (
   7      float32_t,
   8      float64_t,
   9      int8_t,
  10      int16_t,
  11      int32_t,
  12      int64_t,
  13      intp_t,
  14      ndarray,
  15      uint8_t,
  16      uint16_t,
  17      uint32_t,
  18      uint64_t,
  19  )
  20  
  21  cnp.import_array()
  22  
  23  
  24  from pandas._libs cimport util
  25  from pandas._libs.hashtable cimport HashTable
  26  from pandas._libs.tslibs.nattype cimport c_NaT as NaT
  27  from pandas._libs.tslibs.period cimport is_period_object
  28  from pandas._libs.tslibs.timedeltas cimport _Timedelta
  29  from pandas._libs.tslibs.timestamps cimport _Timestamp
  30  
  31  from pandas._libs import (
  32      algos,
  33      hashtable as _hash,
  34  )
  35  
  36  from pandas._libs.lib cimport eq_NA_compat
  37  from pandas._libs.missing cimport (
  38      C_NA as NA,
  39      checknull,
  40      is_matching_na,
  41  )
  42  
  43  
  44  cdef inline bint is_definitely_invalid_key(object val):
  45      try:
  46          hash(val)
  47      except TypeError:
  48          return True
  49      return False
  50  
  51  
cdef ndarray _get_bool_indexer(ndarray values, object val):
    """
    Return a ndarray[bool] of locations where val matches self.values.

    If val is not NA, this is equivalent to `self.values == val`
    """
    # Caller is responsible for ensuring _check_type has already been called
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] indexer
        Py_ssize_t i
        object item

    if values.descr.type_num == cnp.NPY_OBJECT:
        # i.e. values.dtype == object
        if not checknull(val):
            # plain equality, but NA-aware on the values side
            indexer = eq_NA_compat(values, val)

        else:
            # We need to check for _matching_ NA values
            # (different NA sentinels are not interchangeable here)
            indexer = np.empty(len(values), dtype=np.uint8)

            for i in range(len(values)):
                item = values[i]
                indexer[i] = is_matching_na(item, val)

    else:
        # Non-object dtype: equality cannot match NaN (NaN != NaN),
        # so NA lookups go through isnan instead.
        if util.is_nan(val):
            indexer = np.isnan(values)
        else:
            indexer = values == val

    return indexer.view(bool)
  84  
  85  
# Don't populate hash tables in monotonic indexes larger than this
#  (above this size, IndexEngine.get_loc uses searchsorted-based lookup
#  instead of paying the cost of building the mapping).
_SIZE_CUTOFF = 1_000_000
  88  
  89  
  90  cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
  91      """
  92      Possibly unpack a boolean mask to a single indexer.
  93      """
  94      # Returns ndarray[bool] or int
  95      cdef:
  96          ndarray[intp_t, ndim=1] found
  97          int count
  98  
  99      found = np.where(indexer)[0]
 100      count = len(found)
 101  
 102      if count > 1:
 103          return indexer
 104      if count == 1:
 105          return int(found[0])
 106  
 107      raise KeyError(val)
 108  
 109  
@cython.freelist(32)
cdef class IndexEngine:
    # Engine backing a pandas Index: O(1) hashtable lookup for unique
    # indexes, with searchsorted / boolean-mask fallbacks for large
    # monotonic or non-unique values.  Monotonicity and uniqueness are
    # computed lazily and cached.

    cdef readonly:
        ndarray values
        HashTable mapping
        bint over_size_threshold

    cdef:
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check
        object _np_type

    def __init__(self, ndarray values):
        self.values = values

        # Large monotonic indexes skip the hashtable (see _SIZE_CUTOFF)
        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.clear_mapping()
        self._np_type = values.dtype.type

    def __contains__(self, val: object) -> bool:
        # hash() first so unhashable keys raise TypeError rather than
        # being reported as merely absent
        hash(val)
        try:
            self.get_loc(val)
        except KeyError:
            return False
        return True

    cpdef get_loc(self, object val):
        """
        Return the location(s) of `val` in self.values.

        Raises TypeError for unhashable keys and KeyError when absent.
        """
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        val = self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            # Too big to be worth a hashtable: binary search instead
            if not self.is_unique:
                return self._get_loc_duplicates(val)
            values = self.values

            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(val)

        try:
            return self.mapping.get_item(val)
        except OverflowError as err:
            # GH#41775 OverflowError e.g. if we are uint64 and val is -1
            #  or if we are int64 and value is np.iinfo(np.int64).max+1
            #  (the uint64 with -1 case should actually be excluded by _check_type)
            raise KeyError(val) from err

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        # Caller is responsible for ensuring _check_type has already been called
        loc = self.values.searchsorted(self._np_type(val), side="left")
        return loc

    cdef inline _get_loc_duplicates(self, object val):
        """
        Lookup for a possibly-repeated key: contiguous run -> slice,
        single hit -> int, scattered hits -> boolean mask.
        """
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff, left, right

        if self.is_monotonic_increasing:
            # duplicates of a key are contiguous in a monotonic index
            values = self.values
            try:
                left = values.searchsorted(val, side='left')
                right = values.searchsorted(val, side='right')
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                #  2021-09-29 Now only reached for object-dtype
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        return self._maybe_get_bool_indexer(val)

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = _get_bool_indexer(self.values, val)
        return _unpack_bool_indexer(indexer, val)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        if not self.is_mapping_populated:
            return 0
        return self.mapping.sizeof(deep=deep)

    def __sizeof__(self) -> int:
        return self.sizeof()

    @property
    def is_unique(self) -> bool:
        # lazily computed; populating the mapping answers uniqueness too
        if self.need_unique_check:
            self._do_unique_check()

        return self.unique == 1

    cdef inline _do_unique_check(self):

        # this de-facto the same
        self._ensure_mapping_populated()

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()

        return self.monotonic_dec == 1

    cdef inline _do_monotonic_check(self):
        cdef:
            bint is_unique
        try:
            values = self.values
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(values)
        except TypeError:
            # e.g. unorderable object values: treated as non-monotonic
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef _call_monotonic(self, values):
        # returns (is_monotonic_inc, is_monotonic_dec, is_unique)
        return algos.is_monotonic(values, timelike=False)

    cdef _make_hash_table(self, Py_ssize_t n):
        # subclasses supply the dtype-appropriate hashtable
        raise NotImplementedError  # pragma: no cover

    cdef _check_type(self, object val):
        # base implementation only requires hashability (TypeError if not);
        # subclasses may validate/convert the key
        hash(val)
        return val

    @property
    def is_mapping_populated(self) -> bool:
        return self.mapping is not None

    cdef inline _ensure_mapping_populated(self):
        # this populates the mapping
        # if its not already populated
        # also satisfies the need_unique_check

        if not self.is_mapping_populated:

            values = self.values
            self.mapping = self._make_hash_table(len(values))
            self.mapping.map_locations(values)

            # fewer hashtable entries than values means duplicates exist
            if len(self.mapping) == len(values):
                self.unique = 1

        self.need_unique_check = 0

    def clear_mapping(self):
        """Drop the hashtable and invalidate all cached properties."""
        self.mapping = None
        self.need_monotonic_check = 1
        self.need_unique_check = 1

        self.unique = 0
        self.monotonic_inc = 0
        self.monotonic_dec = 0

    def get_indexer(self, ndarray values) -> np.ndarray:
        """Return positions of each of `values` in self.values (-1 if absent)."""
        self._ensure_mapping_populated()
        return self.mapping.lookup(values)

    def get_indexer_non_unique(self, ndarray targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            ndarray values
            ndarray[intp_t] result, missing
            set stargets, remaining_stargets, found_nas
            dict d = {}
            object val
            Py_ssize_t count = 0, count_missing = 0
            Py_ssize_t i, j, n, n_t, n_alloc, start, end
            bint check_na_values = False

        values = self.values
        stargets = set(targets)

        n = len(values)
        n_t = len(targets)
        # result grows in 10_000-element chunks below; cap the initial
        # allocation at one chunk.  missing holds at most one entry per target.
        if n > 10_000:
            n_alloc = 10_000
        else:
            n_alloc = n

        result = np.empty(n_alloc, dtype=np.intp)
        missing = np.empty(n_t, dtype=np.intp)

        # map each starget to its position in the index
        if (
                stargets and
                len(stargets) < 5 and
                not any([checknull(t) for t in stargets]) and
                self.is_monotonic_increasing
        ):
            # if there are few enough stargets and the index is monotonically
            # increasing, then use binary search for each starget
            remaining_stargets = set()
            for starget in stargets:
                try:
                    start = values.searchsorted(starget, side='left')
                    end = values.searchsorted(starget, side='right')
                except TypeError:  # e.g. if we tried to search for string in int array
                    remaining_stargets.add(starget)
                else:
                    if start != end:
                        d[starget] = list(range(start, end))

            stargets = remaining_stargets

        if stargets:
            # otherwise, map by iterating through all items in the index

            # short-circuit na check
            if values.dtype == object:
                check_na_values = True
                # keep track of nas in values
                # (found_nas is always initialized whenever check_na_values
                #  is set, so later reads of it are safe)
                found_nas = set()

            for i in range(n):
                val = values[i]

                # GH#43870
                # handle lookup for nas
                # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
                if check_na_values and checknull(val):
                    match = [na for na in found_nas if is_matching_na(val, na)]

                    # matching na not found
                    if not len(match):
                        found_nas.add(val)

                        # add na to stargets to utilize `in` for stargets/d lookup
                        match_stargets = [
                            x for x in stargets if is_matching_na(val, x)
                        ]

                        if len(match_stargets):
                            # add our 'standardized' na
                            stargets.add(val)

                    # matching na found
                    else:
                        assert len(match) == 1
                        val = match[0]

                if val in stargets:
                    if val not in d:
                        d[val] = []
                    d[val].append(i)

        for i in range(n_t):
            val = targets[i]

            # ensure there are nas in values before looking for a matching na
            if check_na_values and checknull(val):
                match = [na for na in found_nas if is_matching_na(val, na)]
                if len(match):
                    assert len(match) == 1
                    val = match[0]

            # found
            if val in d:
                key = val

                for j in d[key]:

                    # realloc if needed
                    if count >= n_alloc:
                        n_alloc += 10_000
                        result = np.resize(result, n_alloc)

                    result[count] = j
                    count += 1

            # value not found
            else:

                if count >= n_alloc:
                    n_alloc += 10_000
                    result = np.resize(result, n_alloc)
                result[count] = -1
                count += 1
                missing[count_missing] = i
                count_missing += 1

        # trim the over-allocated buffers to their used length
        return result[0:count], missing[0:count_missing]
 444  
 445  
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    #  (treats a tuple `val` as a sequence of keys instead of a single key),
    #  so we implement something similar.
    # This is equivalent to the stdlib's bisect.bisect_left
    # NOTE(review): appears to assume len(values) >= 2 — for a length-1
    #  array the early return below always yields len(values), and an
    #  empty array would read out of bounds at the final comparison.
    #  Presumably callers only reach this for large monotonic indexes;
    #  confirm before reusing elsewhere.

    cdef:
        Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
        object pval

    # fast path: val is beyond the last element
    if hi == 0 or (hi > 0 and val > values[hi]):
        return len(values)

    while lo < hi:
        mid = (lo + hi) // 2
        pval = values[mid]
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            # exact hit: walk left to the first equal element (bisect_left)
            while mid > 0 and val == values[mid - 1]:
                mid -= 1
            return mid

    # no exact hit: place val relative to the last midpoint probed
    if val <= values[mid]:
        return mid
    else:
        return mid + 1
 475  
 476  
cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """
    cdef _make_hash_table(self, Py_ssize_t n):
        # PyObject-keyed hashtable, pre-sized for n entries
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        # using values.searchsorted here would treat a tuple `val` as a sequence
        #  instead of a single key, so we use a different implementation
        try:
            loc = _bin_search(self.values, val)
        except TypeError as err:
            # e.g. key not orderable against the values: treat as not found
            raise KeyError(val) from err
        return loc
 492  
 493  
cdef class DatetimeEngine(Int64Engine):
    # Engine whose keys are Timestamp (or NaT) objects, looked up via
    # their int64 representation from _unbox_scalar.

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NB: caller is responsible for ensuring tzawareness compat
        #  before we get here
        if not (isinstance(scalar, _Timestamp) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Note: non-Timestamp/NaT keys raise TypeError here rather than
        #  returning False; callers are expected to have validated the type.
        self._unbox_scalar(val)
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    cdef _call_monotonic(self, values):
        # unlike the base class, pass timelike=True for the i8 values
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)

        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        # convert to the int64 representation used by the underlying engine
        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        # Welcome to the spaghetti factory
        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(conv)
            values = self.values

            # int64 values, so plain searchsorted is safe here
            loc = values.searchsorted(conv, side='left')

            if loc == len(values) or values[loc] != conv:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(conv)

        try:
            return self.mapping.get_item(conv)
        except KeyError:
            # re-raise with the original key (not the unboxed int64)
            raise KeyError(val)
 551  
 552  
cdef class TimedeltaEngine(DatetimeEngine):
    # Same machinery as DatetimeEngine, but keys must be Timedelta (or NaT).

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # TypeError for anything else; DatetimeEngine.get_loc maps it to KeyError
        if not (isinstance(scalar, _Timedelta) or scalar is NaT):
            raise TypeError(scalar)
        return scalar.value
 559  
 560  
cdef class PeriodEngine(Int64Engine):
    # Engine whose keys are Period objects (or NaT), looked up by ordinal.

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if scalar is NaT:
            # NaT carries its own int64 sentinel in .value
            return scalar.value
        if is_period_object(scalar):
            # NB: we assume that we have the correct freq here.
            return scalar.ordinal
        raise TypeError(scalar)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Period or NaT
        cdef:
            int64_t conv

        # ill-typed keys surface as KeyError, matching the other engines
        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        # delegate the lookup to the int64 engine on the ordinal
        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        # timelike=True, mirroring DatetimeEngine._call_monotonic
        return algos.is_monotonic(values, timelike=True)
 586  
 587  
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        # the +1 mirrors the shift applied to labels in __init__
        # (so that -1 / not-found maps to the 0 sentinel)
        level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes
                       in zip(self.levels, zt)]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target: ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length.  Should be the set of
            the MultiIndex's values.
        method: string
            "backfill" or "pad"
        limit: int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        assert method in ("backfill", "pad")
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        target_order = np.argsort(target).astype('int64')
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)
        new_codes, new_target_codes = (
            np.empty((num_values,)).astype('int64'),
            np.empty((num_target_values,)).astype('int64'),
        )

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            # 0 is the NA sentinel; real locations are shifted by +1,
            # matching the label transform in __init__
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        # NOTE(review): despite the annotation, this forwards whatever the
        # base engine returns — for IndexEngine that is an (indexer, missing)
        # pair; confirm against callers before tightening the annotation.
        indexer = self._base.get_indexer_non_unique(self, target)

        return indexer

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
 799  
 800  
 801  # Generated from template.
 802  include "index_class_helper.pxi"
 803  
 804  
cdef class BoolEngine(UInt8Engine):
    # Engine for bool-dtype indexes; keys are stored as uint8.

    cdef _check_type(self, object val):
        # Non-bool keys raise KeyError (not TypeError) so lookups simply miss;
        # valid keys are normalized to the uint8 storage type.
        if not util.is_bool_object(val):
            raise KeyError(val)
        return <uint8_t>val
 810  
 811  
 812  @cython.internal
 813  @cython.freelist(32)
 814  cdef class SharedEngine:
 815      cdef readonly:
 816          object values  # ExtensionArray
 817          bint over_size_threshold
 818  
 819      cdef:
 820          bint unique, monotonic_inc, monotonic_dec
 821          bint need_monotonic_check, need_unique_check
 822  
 823      def __contains__(self, val: object) -> bool:
 824          # We assume before we get here:
 825          #  - val is hashable
 826          try:
 827              self.get_loc(val)
 828              return True
 829          except KeyError:
 830              return False
 831  
    def clear_mapping(self):
        # for compat with IndexEngine
        # (there is no hashtable mapping here, so nothing to clear)
        pass
 835  
 836      @property
 837      def is_unique(self) -> bool:
 838          if self.need_unique_check:
 839              arr = self.values.unique()
 840              self.unique = len(arr) == len(self.values)
 841  
 842              self.need_unique_check = False
 843          return self.unique
 844  
    cdef _do_monotonic_check(self):
        # Subclasses must set monotonic_inc/monotonic_dec and clear
        # need_monotonic_check (see the properties below).
        raise NotImplementedError
 847  
 848      @property
 849      def is_monotonic_increasing(self) -> bool:
 850          if self.need_monotonic_check:
 851              self._do_monotonic_check()
 852  
 853          return self.monotonic_inc == 1
 854  
 855      @property
 856      def is_monotonic_decreasing(self) -> bool:
 857          if self.need_monotonic_check:
 858              self._do_monotonic_check()
 859  
 860          return self.monotonic_dec == 1
 861  
 862      cdef _call_monotonic(self, values):
 863          return algos.is_monotonic(values, timelike=False)
 864  
 865      def sizeof(self, deep: bool = False) -> int:
 866          """ return the sizeof our mapping """
 867          return 0
 868  
 869      def __sizeof__(self) -> int:
 870          return self.sizeof()
 871  
 872      cdef _check_type(self, object obj):
 873          raise NotImplementedError
 874  
 875      cpdef get_loc(self, object val):
 876          # -> Py_ssize_t | slice | ndarray[bool]
 877          cdef:
 878              Py_ssize_t loc
 879  
 880          if is_definitely_invalid_key(val):
 881              raise TypeError(f"'{val}' is an invalid key")
 882  
 883          self._check_type(val)
 884  
 885          if self.over_size_threshold and self.is_monotonic_increasing:
 886              if not self.is_unique:
 887                  return self._get_loc_duplicates(val)
 888  
 889              values = self.values
 890  
 891              loc = self._searchsorted_left(val)
 892              if loc >= len(values):
 893                  raise KeyError(val)
 894              if values[loc] != val:
 895                  raise KeyError(val)
 896              return loc
 897  
 898          if not self.unique:
 899              return self._get_loc_duplicates(val)
 900  
 901          return self._get_loc_duplicates(val)
 902  
 903      cdef inline _get_loc_duplicates(self, object val):
 904          # -> Py_ssize_t | slice | ndarray[bool]
 905          cdef:
 906              Py_ssize_t diff
 907  
 908          if self.is_monotonic_increasing:
 909              values = self.values
 910              try:
 911                  left = values.searchsorted(val, side='left')
 912                  right = values.searchsorted(val, side='right')
 913              except TypeError:
 914                  # e.g. GH#29189 get_loc(None) with a Float64Index
 915                  raise KeyError(val)
 916  
 917              diff = right - left
 918              if diff == 0:
 919                  raise KeyError(val)
 920              elif diff == 1:
 921                  return left
 922              else:
 923                  return slice(left, right)
 924  
 925          return self._maybe_get_bool_indexer(val)
 926  
 927      cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
 928          """
 929          See ObjectEngine._searchsorted_left.__doc__.
 930          """
 931          try:
 932              loc = self.values.searchsorted(val, side="left")
 933          except TypeError as err:
 934              # GH#35788 e.g. val=None with float64 values
 935              raise KeyError(val)
 936          return loc
 937  
 938      cdef ndarray _get_bool_indexer(self, val):
 939          raise NotImplementedError
 940  
 941      cdef _maybe_get_bool_indexer(self, object val):
 942          # Returns ndarray[bool] or int
 943          cdef:
 944              ndarray[uint8_t, ndim=1, cast=True] indexer
 945  
 946          indexer = self._get_bool_indexer(val)
 947          return _unpack_bool_indexer(indexer, val)
 948  
 949      def get_indexer(self, values) -> np.ndarray:
 950          # values : type(self.values)
 951          # Note: we only get here with self.is_unique
 952          cdef:
 953              Py_ssize_t i, N = len(values)
 954  
 955          res = np.empty(N, dtype=np.intp)
 956  
 957          for i in range(N):
 958              val = values[i]
 959              try:
 960                  loc = self.get_loc(val)
 961                  # Because we are unique, loc should always be an integer
 962              except KeyError:
 963                  loc = -1
 964              else:
 965                  assert util.is_integer_object(loc), (loc, val)
 966              res[i] = loc
 967  
 968          return res
 969  
 970      def get_indexer_non_unique(self, targets):
 971          """
 972          Return an indexer suitable for taking from a non unique index
 973          return the labels in the same order as the target
 974          and a missing indexer into the targets (which correspond
 975          to the -1 indices in the results
 976          Parameters
 977          ----------
 978          targets : type(self.values)
 979          Returns
 980          -------
 981          indexer : np.ndarray[np.intp]
 982          missing : np.ndarray[np.intp]
 983          """
 984          cdef:
 985              Py_ssize_t i, N = len(targets)
 986  
 987          indexer = []
 988          missing = []
 989  
 990          # See also IntervalIndex.get_indexer_pointwise
 991          for i in range(N):
 992              val = targets[i]
 993  
 994              try:
 995                  locs = self.get_loc(val)
 996              except KeyError:
 997                  locs = np.array([-1], dtype=np.intp)
 998                  missing.append(i)
 999              else:
1000                  if isinstance(locs, slice):
1001                      # Only needed for get_indexer_non_unique
1002                      locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
1003                  elif util.is_integer_object(locs):
1004                      locs = np.array([locs], dtype=np.intp)
1005                  else:
1006                      assert locs.dtype.kind == "b"
1007                      locs = locs.nonzero()[0]
1008  
1009              indexer.append(locs)
1010  
1011          try:
1012              indexer = np.concatenate(indexer, dtype=np.intp)
1013          except TypeError:
1014              # numpy<1.20 doesn't accept dtype keyword
1015              indexer = np.concatenate(indexer).astype(np.intp, copy=False)
1016          missing = np.array(missing, dtype=np.intp)
1017  
1018          return indexer, missing
1019  
1020  
cdef class ExtensionEngine(SharedEngine):
    """
    SharedEngine specialization backed by an ExtensionArray.
    """

    def __init__(self, values: "ExtensionArray"):
        self.values = values

        self.over_size_threshold = len(values) >= _SIZE_CUTOFF
        self.need_monotonic_check = True
        # NOTE: the original assigned need_unique_check twice; once suffices.
        self.need_unique_check = True

    cdef _do_monotonic_check(self):
        # Compute and cache monotonicity flags; settle uniqueness too when
        # it comes for free from the monotonicity computation.
        cdef:
            bint is_unique

        values = self.values
        if values._hasna:
            # With NAs present we report non-monotonic and compute
            # uniqueness explicitly.
            self.monotonic_inc = 0
            self.monotonic_dec = 0

            nunique = len(values.unique())
            self.unique = nunique == len(values)
            self.need_unique_check = 0
            return

        try:
            ranks = values._rank()

        except TypeError:
            # Unorderable values: not monotonic, uniqueness unknown.
            self.monotonic_inc = 0
            self.monotonic_dec = 0
            is_unique = 0
        else:
            self.monotonic_inc, self.monotonic_dec, is_unique = \
                self._call_monotonic(ranks)

        self.need_monotonic_check = 0

        # we can only be sure of uniqueness if is_unique=1
        if is_unique:
            self.unique = 1
            self.need_unique_check = 0

    cdef ndarray _get_bool_indexer(self, val):
        # Boolean mask of positions where values matches val; NA values
        # match via isna() rather than ==.
        if checknull(val):
            return self.values.isna()

        try:
            return self.values == val
        except TypeError:
            # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
            try:
                return (self.values == val).to_numpy(dtype=bool, na_value=False)
            except (TypeError, AttributeError) as err:
                # e.g. (self.values == val) returned a bool
                #  see test_get_loc_generator[string[pyarrow]]
                # e.g. self.value == val raises TypeError bc generator has no len
                #  see test_get_loc_generator[string[python]]
                # Include val in the KeyError for consistency with the other
                # raises in this file (original raised a bare KeyError).
                raise KeyError(val) from err

    cdef _check_type(self, object val):
        # Any hashable value is an acceptable key; hash() raises TypeError
        # for unhashable ones, which get_loc lets propagate.
        hash(val)