hashtable.pyx
cimport cython
from cpython.mem cimport (
    PyMem_Free,
    PyMem_Malloc,
)
from cpython.ref cimport (
    Py_INCREF,
    PyObject,
)
from libc.stdlib cimport (
    free,
    malloc,
)

import numpy as np

cimport numpy as cnp
from numpy cimport (
    float64_t,
    ndarray,
    uint8_t,
    uint32_t,
)
from numpy.math cimport NAN

cnp.import_array()


from pandas._libs cimport util
from pandas._libs.dtypes cimport numeric_object_t
from pandas._libs.khash cimport (
    KHASH_TRACE_DOMAIN,
    are_equivalent_float32_t,
    are_equivalent_float64_t,
    are_equivalent_khcomplex64_t,
    are_equivalent_khcomplex128_t,
    kh_needed_n_buckets,
    kh_python_hash_equal,
    kh_python_hash_func,
    kh_str_t,
    khcomplex64_t,
    khcomplex128_t,
    khiter_t,
)
from pandas._libs.missing cimport checknull


def get_hashtable_trace_domain():
    """Return the ``KHASH_TRACE_DOMAIN`` constant cimported from khash."""
    return KHASH_TRACE_DOMAIN


def object_hash(obj):
    """Return ``kh_python_hash_func(obj)``, exposing the khash helper to Python."""
    return kh_python_hash_func(obj)


def objects_are_equal(a, b):
    """Return ``kh_python_hash_equal(a, b)``, exposing the khash helper to Python."""
    return kh_python_hash_equal(a, b)


cdef int64_t NPY_NAT = util.get_nat()
SIZE_HINT_LIMIT = (1 << 20) + 7


cdef Py_ssize_t _INIT_VEC_CAP = 128

# The hash-table / vector classes and helper functions used below
# (e.g. Int64HashTable, ObjectVector) come from these textual includes.
include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"


# map derived hash-map types onto basic hash-map types:
if np.dtype(np.intp) == np.dtype(np.int64):
    IntpHashTable = Int64HashTable
    unique_label_indices = _unique_label_indices_int64
elif np.dtype(np.intp) == np.dtype(np.int32):
    IntpHashTable = Int32HashTable
    unique_label_indices = _unique_label_indices_int32
else:
    raise ValueError(np.dtype(np.intp))


cdef class Factorizer:
    """
    Base class for the factorizers; holds the running ``count``.

    Subclasses set ``count`` to ``len(self.uniques)`` after every
    ``factorize`` call.
    """
    cdef readonly:
        Py_ssize_t count

    def __cinit__(self, size_hint: int):
        # size_hint is not used here; subclasses pass it to their hash table.
        self.count = 0

    def get_count(self) -> int:
        """Return the current value of ``count``."""
        return self.count


cdef class ObjectFactorizer(Factorizer):
    """Factorizer for object-dtype values backed by a PyObjectHashTable."""
    cdef public:
        PyObjectHashTable table
        ObjectVector uniques

    def __cinit__(self, size_hint: int):
        self.table = PyObjectHashTable(size_hint)
        self.uniques = ObjectVector()

    def factorize(
        self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None
    ) -> np.ndarray:
        """
        Compute integer labels for ``values``.

        Parameters
        ----------
        values : ndarray[object]
        sort : bool, default False
            If True, remap the labels to the positions the uniques would
            have after ``argsort``; ``self.uniques`` itself is not reordered.
        na_sentinel : int, default -1
            Label marking missing values; forwarded to ``table.get_labels``.
        na_value : object, default None
            Forwarded to ``table.get_labels``.

        Returns
        -------
        np.ndarray[np.intp]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = ObjectFactorizer(3)
        >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
        array([ 0,  1, 20])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # Continue in a fresh vector holding the same values instead of
            # appending to one whose data has already been handed out.
            uniques = ObjectVector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel, na_value)

        if sort:
            # Record sentinel positions first: na_sentinel is not a valid
            # index into reverse_indexer and must survive the remapping.
            mask = (labels == na_sentinel)
            sorter = self.uniques.to_array().argsort()
            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
            reverse_indexer.put(sorter, np.arange(len(sorter)))
            # 'clip' keeps the sentinel from raising / wrapping around;
            # the clipped entries are overwritten on the next line.
            labels = reverse_indexer.take(labels, mode='clip')
            labels[mask] = na_sentinel
        self.count = len(self.uniques)
        return labels


cdef class Int64Factorizer(Factorizer):
    """Factorizer for int64 values backed by an Int64HashTable."""
    cdef public:
        Int64HashTable table
        Int64Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = Int64HashTable(size_hint)
        self.uniques = Int64Vector()

    def factorize(self, const int64_t[:] values, sort=False,
                  na_sentinel=-1, na_value=None) -> np.ndarray:
        """
        Compute integer labels for ``values``.

        Parameters
        ----------
        values : const int64_t[:]
        sort : bool, default False
            If True, remap the labels to the positions the uniques would
            have after ``argsort``; ``self.uniques`` itself is not reordered.
        na_sentinel : int, default -1
            Label marking missing values; forwarded to ``table.get_labels``.
        na_value : object, default None
            Forwarded to ``table.get_labels``.

        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = Int64Factorizer(3)
        >>> fac.factorize(np.array([1,2,3]), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels

        if self.uniques.external_view_exists:
            # Continue in a fresh vector holding the same values instead of
            # appending to one whose data has already been handed out.
            uniques = Int64Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value)

        if sort:
            # Same handling as ObjectFactorizer.factorize: without the mask a
            # sentinel of -1 would wrap to the last unique's code, and a
            # positive out-of-range sentinel would raise IndexError in take().
            mask = (labels == na_sentinel)
            sorter = self.uniques.to_array().argsort()
            reverse_indexer = np.empty(len(sorter), dtype=np.intp)
            reverse_indexer.put(sorter, np.arange(len(sorter)))

            labels = reverse_indexer.take(labels, mode='clip')
            labels[mask] = na_sentinel

        self.count = len(self.uniques)
        return labels