/ lib / pandas / _libs / hashtable.pyx
hashtable.pyx
  1  cimport cython
  2  from cpython.mem cimport (
  3      PyMem_Free,
  4      PyMem_Malloc,
  5  )
  6  from cpython.ref cimport (
  7      Py_INCREF,
  8      PyObject,
  9  )
 10  from libc.stdlib cimport (
 11      free,
 12      malloc,
 13  )
 14  
 15  import numpy as np
 16  
 17  cimport numpy as cnp
 18  from numpy cimport (
 19      float64_t,
 20      ndarray,
 21      uint8_t,
 22      uint32_t,
 23  )
 24  from numpy.math cimport NAN
 25  
 26  cnp.import_array()
 27  
 28  
 29  from pandas._libs cimport util
 30  from pandas._libs.dtypes cimport numeric_object_t
 31  from pandas._libs.khash cimport (
 32      KHASH_TRACE_DOMAIN,
 33      are_equivalent_float32_t,
 34      are_equivalent_float64_t,
 35      are_equivalent_khcomplex64_t,
 36      are_equivalent_khcomplex128_t,
 37      kh_needed_n_buckets,
 38      kh_python_hash_equal,
 39      kh_python_hash_func,
 40      kh_str_t,
 41      khcomplex64_t,
 42      khcomplex128_t,
 43      khiter_t,
 44  )
 45  from pandas._libs.missing cimport checknull
 46  
 47  
 48  def get_hashtable_trace_domain():
 49      return KHASH_TRACE_DOMAIN
 50  
 51  
 52  def object_hash(obj):
 53      return kh_python_hash_func(obj)
 54  
 55  
 56  def objects_are_equal(a, b):
 57      return kh_python_hash_equal(a, b)
 58  
 59  
 60  cdef int64_t NPY_NAT = util.get_nat()
 61  SIZE_HINT_LIMIT = (1 << 20) + 7
 62  
 63  
 64  cdef Py_ssize_t _INIT_VEC_CAP = 128
 65  
 66  include "hashtable_class_helper.pxi"
 67  include "hashtable_func_helper.pxi"
 68  
 69  
 70  # map derived hash-map types onto basic hash-map types:
 71  if np.dtype(np.intp) == np.dtype(np.int64):
 72      IntpHashTable = Int64HashTable
 73      unique_label_indices = _unique_label_indices_int64
 74  elif np.dtype(np.intp) == np.dtype(np.int32):
 75      IntpHashTable = Int32HashTable
 76      unique_label_indices = _unique_label_indices_int32
 77  else:
 78      raise ValueError(np.dtype(np.intp))
 79  
 80  
 81  cdef class Factorizer:
 82      cdef readonly:
 83          Py_ssize_t count
 84  
 85      def __cinit__(self, size_hint: int):
 86          self.count = 0
 87  
 88      def get_count(self) -> int:
 89          return self.count
 90  
 91  
 92  cdef class ObjectFactorizer(Factorizer):
 93      cdef public:
 94          PyObjectHashTable table
 95          ObjectVector uniques
 96  
 97      def __cinit__(self, size_hint: int):
 98          self.table = PyObjectHashTable(size_hint)
 99          self.uniques = ObjectVector()
100  
101      def factorize(
102          self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
103      ) -> np.ndarray:
104          """
105  
106          Returns
107          -------
108          np.ndarray[np.intp]
109  
110          Examples
111          --------
112          Factorize values with nans replaced by na_sentinel
113  
114          >>> fac = ObjectFactorizer(3)
115          >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
116          array([ 0,  1, 20])
117          """
118          cdef:
119              ndarray[intp_t] labels
120  
121          if self.uniques.external_view_exists:
122              uniques = ObjectVector()
123              uniques.extend(self.uniques.to_array())
124              self.uniques = uniques
125          labels = self.table.get_labels(values, self.uniques,
126                                         self.count, na_sentinel, na_value)
127          mask = (labels == na_sentinel)
128          # sort on
129          if sort:
130              sorter = self.uniques.to_array().argsort()
131              reverse_indexer = np.empty(len(sorter), dtype=np.intp)
132              reverse_indexer.put(sorter, np.arange(len(sorter)))
133              labels = reverse_indexer.take(labels, mode='clip')
134              labels[mask] = na_sentinel
135          self.count = len(self.uniques)
136          return labels
137  
138  
cdef class Int64Factorizer(Factorizer):
    """
    Factorizer for int64 values, backed by an ``Int64HashTable`` and an
    ``Int64Vector`` that accumulates the uniques across calls.
    """
    cdef public:
        Int64HashTable table
        Int64Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = Int64HashTable(size_hint)
        self.uniques = Int64Vector()

    def factorize(self, const int64_t[:] values, sort=False,
                  na_sentinel=-1, na_value=None) -> np.ndarray:
        """
        Compute integer labels for ``values``, extending ``self.uniques``.

        Parameters
        ----------
        values : const int64_t[:]
            Values to label.
        sort : bool, default False
            If True, the returned labels are remapped so that they refer to
            positions in the sorted uniques.
        na_sentinel : int, default -1
            Label that marks missing values; such labels are never remapped
            when ``sort`` is True.
        na_value : object, default None
            Forwarded unchanged to ``self.table.get_labels``.

        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = Int64Factorizer(3)
        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # An array view of the current uniques has been handed out; copy
            # the uniques into a fresh vector before appending to them.
            uniques = Int64Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value)

        if sort:
            sorter = self.uniques.to_array().argsort()
            # With no uniques every label is the sentinel, and ``take`` on an
            # empty array would raise, so there is nothing to remap.
            if len(sorter) > 0:
                # Labels equal to ``na_sentinel`` are not valid codes; without
                # this mask they would be used as indices (-1 wraps around to
                # the last unique, other values can raise IndexError).
                mask = (labels == na_sentinel)
                # reverse_indexer[old_code] -> position of that unique once
                # the uniques are sorted.
                reverse_indexer = np.empty(len(sorter), dtype=np.intp)
                reverse_indexer.put(sorter, np.arange(len(sorter)))

                labels = reverse_indexer.take(labels, mode='clip')
                labels[mask] = na_sentinel

        self.count = len(self.uniques)
        return labels