# lib.pyx
from collections import abc
from decimal import Decimal
from enum import Enum
from typing import Literal
import warnings

cimport cython
from cpython.datetime cimport (
    PyDate_Check,
    PyDateTime_Check,
    PyDelta_Check,
    PyTime_Check,
    import_datetime,
)
from cpython.iterator cimport PyIter_Check
from cpython.number cimport PyNumber_Check
from cpython.object cimport (
    Py_EQ,
    PyObject_RichCompareBool,
    PyTypeObject,
)
from cpython.ref cimport Py_INCREF
from cpython.sequence cimport PySequence_Check
from cpython.tuple cimport (
    PyTuple_New,
    PyTuple_SET_ITEM,
)
from cython cimport (
    Py_ssize_t,
    floating,
)

from pandas.util._exceptions import find_stack_level

import_datetime()

import numpy as np

cimport numpy as cnp
from numpy cimport (
    NPY_OBJECT,
    PyArray_Check,
    PyArray_GETITEM,
    PyArray_ITER_DATA,
    PyArray_ITER_NEXT,
    PyArray_IterNew,
    complex128_t,
    flatiter,
    float32_t,
    float64_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint64_t,
)

cnp.import_array()

cdef extern from "Python.h":
    # Note: importing extern-style allows us to declare these as nogil
    # functions, whereas `from cpython cimport` does not.
    bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil

cdef extern from "numpy/arrayobject.h":
    # cython's numpy.dtype specification is incorrect, which leads to
    # errors in issubclass(self.dtype.type, np.bool_), so we directly
    # include the correct version
    # https://github.com/cython/cython/issues/2022

    ctypedef class numpy.dtype [object PyArray_Descr]:
        # Use PyDataType_* macros when possible, however there are no macros
        # for accessing some of the fields, so some are defined. Please
        # ask on cython-dev if you need more.
        cdef:
            int type_num
            int itemsize "elsize"
            char byteorder
            object fields
            tuple names

    PyTypeObject PySignedIntegerArrType_Type
    PyTypeObject PyUnsignedIntegerArrType_Type

cdef extern from "numpy/ndarrayobject.h":
    bint PyArray_CheckScalar(obj) nogil


cdef extern from "src/parse_helper.h":
    int floatify(object, float64_t *result, int *maybe_int) except -1

from pandas._libs cimport util
from pandas._libs.util cimport (
    INT64_MAX,
    INT64_MIN,
    UINT64_MAX,
    is_nan,
)

from pandas._libs.tslib import array_to_datetime
from pandas._libs.tslibs import (
    OutOfBoundsDatetime,
    OutOfBoundsTimedelta,
)
from pandas._libs.tslibs.period import Period

from pandas._libs.missing cimport (
    C_NA,
    checknull,
    is_matching_na,
    is_null_datetime64,
    is_null_timedelta64,
)
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
from pandas._libs.tslibs.nattype cimport (
    NPY_NAT,
    c_NaT as NaT,
    checknull_with_nat,
)
from pandas._libs.tslibs.offsets cimport is_offset_object
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
from pandas._libs.tslibs.timezones cimport tz_compare

# constants that will be compared to potentially arbitrarily large
# python int
cdef:
    object oINT64_MAX = <int64_t>INT64_MAX
    object oINT64_MIN = <int64_t>INT64_MIN
    object oUINT64_MAX = <uint64_t>UINT64_MAX

    float64_t NaN = <float64_t>np.NaN

# python-visible
i8max = <int64_t>INT64_MAX
u8max = <uint64_t>UINT64_MAX


@cython.wraparound(False)
@cython.boundscheck(False)
def memory_usage_of_objects(arr: object[:]) -> int64_t:
    """
    Return the memory usage of an object array in bytes.
144 145 Does not include the actual bytes of the pointers 146 """ 147 i: Py_ssize_t 148 n: Py_ssize_t 149 size: int64_t 150 151 size = 0 152 n = len(arr) 153 for i in range(n): 154 size += arr[i].__sizeof__() 155 return size 156 157 158 # ---------------------------------------------------------------------- 159 160 161 def is_scalar(val: object) -> bool: 162 """ 163 Return True if given object is scalar. 164 165 Parameters 166 ---------- 167 val : object 168 This includes: 169 170 - numpy array scalar (e.g. np.int64) 171 - Python builtin numerics 172 - Python builtin byte arrays and strings 173 - None 174 - datetime.datetime 175 - datetime.timedelta 176 - Period 177 - decimal.Decimal 178 - Interval 179 - DateOffset 180 - Fraction 181 - Number. 182 183 Returns 184 ------- 185 bool 186 Return True if given object is scalar. 187 188 Examples 189 -------- 190 >>> import datetime 191 >>> dt = datetime.datetime(2018, 10, 3) 192 >>> pd.api.types.is_scalar(dt) 193 True 194 195 >>> pd.api.types.is_scalar([2, 3]) 196 False 197 198 >>> pd.api.types.is_scalar({0: 1, 2: 3}) 199 False 200 201 >>> pd.api.types.is_scalar((0, 2)) 202 False 203 204 pandas supports PEP 3141 numbers: 205 206 >>> from fractions import Fraction 207 >>> pd.api.types.is_scalar(Fraction(3, 5)) 208 True 209 """ 210 211 # Start with C-optimized checks 212 if (cnp.PyArray_IsAnyScalar(val) 213 # PyArray_IsAnyScalar is always False for bytearrays on Py3 214 or PyDate_Check(val) 215 or PyDelta_Check(val) 216 or PyTime_Check(val) 217 # We differ from numpy, which claims that None is not scalar; 218 # see np.isscalar 219 or val is C_NA 220 or val is None): 221 return True 222 223 # Next use C-optimized checks to exclude common non-scalars before falling 224 # back to non-optimized checks. 225 if PySequence_Check(val): 226 # e.g. 
list, tuple 227 # includes np.ndarray, Series which PyNumber_Check can return True for 228 return False 229 230 # Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number 231 return (PyNumber_Check(val) 232 or is_period_object(val) 233 or is_interval(val) 234 or is_offset_object(val)) 235 236 237 cdef inline int64_t get_itemsize(object val): 238 """ 239 Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar. 240 241 Parameters 242 ---------- 243 val : object 244 245 Returns 246 ------- 247 is_ndarray : bool 248 """ 249 if PyArray_CheckScalar(val): 250 return cnp.PyArray_DescrFromScalar(val).itemsize 251 else: 252 return -1 253 254 255 def is_iterator(obj: object) -> bool: 256 """ 257 Check if the object is an iterator. 258 259 This is intended for generators, not list-like objects. 260 261 Parameters 262 ---------- 263 obj : The object to check 264 265 Returns 266 ------- 267 is_iter : bool 268 Whether `obj` is an iterator. 269 270 Examples 271 -------- 272 >>> import datetime 273 >>> is_iterator((x for x in [])) 274 True 275 >>> is_iterator([1, 2, 3]) 276 False 277 >>> is_iterator(datetime.datetime(2017, 1, 1)) 278 False 279 >>> is_iterator("foo") 280 False 281 >>> is_iterator(1) 282 False 283 """ 284 return PyIter_Check(obj) 285 286 287 def item_from_zerodim(val: object) -> object: 288 """ 289 If the value is a zerodim array, return the item it contains. 
290 291 Parameters 292 ---------- 293 val : object 294 295 Returns 296 ------- 297 object 298 299 Examples 300 -------- 301 >>> item_from_zerodim(1) 302 1 303 >>> item_from_zerodim('foobar') 304 'foobar' 305 >>> item_from_zerodim(np.array(1)) 306 1 307 >>> item_from_zerodim(np.array([1])) 308 array([1]) 309 """ 310 if cnp.PyArray_IsZeroDim(val): 311 return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val) 312 return val 313 314 315 @cython.wraparound(False) 316 @cython.boundscheck(False) 317 def fast_unique_multiple(list arrays, sort: bool = True): 318 """ 319 Generate a list of unique values from a list of arrays. 320 321 Parameters 322 ---------- 323 list : array-like 324 List of array-like objects. 325 sort : bool 326 Whether or not to sort the resulting unique list. 327 328 Returns 329 ------- 330 list of unique values 331 """ 332 cdef: 333 ndarray[object] buf 334 Py_ssize_t k = len(arrays) 335 Py_ssize_t i, j, n 336 list uniques = [] 337 dict table = {} 338 object val, stub = 0 339 340 for i in range(k): 341 buf = arrays[i] 342 n = len(buf) 343 for j in range(n): 344 val = buf[j] 345 if val not in table: 346 table[val] = stub 347 uniques.append(val) 348 349 if sort is None: 350 try: 351 uniques.sort() 352 except TypeError: 353 warnings.warn( 354 "The values in the array are unorderable. 
" 355 "Pass `sort=False` to suppress this warning.", 356 RuntimeWarning, 357 stacklevel=find_stack_level(), 358 ) 359 pass 360 361 return uniques 362 363 364 @cython.wraparound(False) 365 @cython.boundscheck(False) 366 def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list: 367 cdef: 368 list buf 369 Py_ssize_t k = len(lists) 370 Py_ssize_t i, j, n 371 list uniques = [] 372 dict table = {} 373 object val, stub = 0 374 375 for i in range(k): 376 buf = lists[i] 377 n = len(buf) 378 for j in range(n): 379 val = buf[j] 380 if val not in table: 381 table[val] = stub 382 uniques.append(val) 383 if sort: 384 try: 385 uniques.sort() 386 except TypeError: 387 pass 388 389 return uniques 390 391 392 @cython.wraparound(False) 393 @cython.boundscheck(False) 394 def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list: 395 """ 396 Generate a list of unique values from a generator of lists. 397 398 Parameters 399 ---------- 400 gen : generator object 401 Generator of lists from which the unique list is created. 402 sort : bool 403 Whether or not to sort the resulting unique list. 
404 405 Returns 406 ------- 407 list of unique values 408 """ 409 cdef: 410 list buf 411 Py_ssize_t j, n 412 list uniques = [] 413 dict table = {} 414 object val, stub = 0 415 416 for buf in gen: 417 n = len(buf) 418 for j in range(n): 419 val = buf[j] 420 if val not in table: 421 table[val] = stub 422 uniques.append(val) 423 if sort: 424 try: 425 uniques.sort() 426 except TypeError: 427 pass 428 429 return uniques 430 431 432 @cython.wraparound(False) 433 @cython.boundscheck(False) 434 def dicts_to_array(dicts: list, columns: list): 435 cdef: 436 Py_ssize_t i, j, k, n 437 ndarray[object, ndim=2] result 438 dict row 439 object col, onan = np.nan 440 441 k = len(columns) 442 n = len(dicts) 443 444 result = np.empty((n, k), dtype='O') 445 446 for i in range(n): 447 row = dicts[i] 448 for j in range(k): 449 col = columns[j] 450 if col in row: 451 result[i, j] = row[col] 452 else: 453 result[i, j] = onan 454 455 return result 456 457 458 def fast_zip(list ndarrays) -> ndarray[object]: 459 """ 460 For zipping multiple ndarrays into an ndarray of tuples. 
461 """ 462 cdef: 463 Py_ssize_t i, j, k, n 464 ndarray[object, ndim=1] result 465 flatiter it 466 object val, tup 467 468 k = len(ndarrays) 469 n = len(ndarrays[0]) 470 471 result = np.empty(n, dtype=object) 472 473 # initialize tuples on first pass 474 arr = ndarrays[0] 475 it = <flatiter>PyArray_IterNew(arr) 476 for i in range(n): 477 val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) 478 tup = PyTuple_New(k) 479 480 PyTuple_SET_ITEM(tup, 0, val) 481 Py_INCREF(val) 482 result[i] = tup 483 PyArray_ITER_NEXT(it) 484 485 for j in range(1, k): 486 arr = ndarrays[j] 487 it = <flatiter>PyArray_IterNew(arr) 488 if len(arr) != n: 489 raise ValueError("all arrays must be same length") 490 491 for i in range(n): 492 val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) 493 PyTuple_SET_ITEM(result[i], j, val) 494 Py_INCREF(val) 495 PyArray_ITER_NEXT(it) 496 497 return result 498 499 500 def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: 501 """ 502 Reverse indexing operation. 503 504 Given `indexer`, make `indexer_inv` of it, such that:: 505 506 indexer_inv[indexer[x]] = x 507 508 Parameters 509 ---------- 510 indexer : np.ndarray[np.intp] 511 length : int 512 513 Returns 514 ------- 515 np.ndarray[np.intp] 516 517 Notes 518 ----- 519 If indexer is not unique, only first occurrence is accounted. 
520 """ 521 cdef: 522 Py_ssize_t i, n = len(indexer) 523 ndarray[intp_t, ndim=1] rev_indexer 524 intp_t idx 525 526 rev_indexer = np.empty(length, dtype=np.intp) 527 rev_indexer[:] = -1 528 for i in range(n): 529 idx = indexer[i] 530 if idx != -1: 531 rev_indexer[idx] = i 532 533 return rev_indexer 534 535 536 @cython.wraparound(False) 537 @cython.boundscheck(False) 538 # Can add const once https://github.com/cython/cython/issues/1772 resolved 539 def has_infs(floating[:] arr) -> bool: 540 cdef: 541 Py_ssize_t i, n = len(arr) 542 floating inf, neginf, val 543 bint ret = False 544 545 inf = np.inf 546 neginf = -inf 547 with nogil: 548 for i in range(n): 549 val = arr[i] 550 if val == inf or val == neginf: 551 ret = True 552 break 553 return ret 554 555 556 def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len): 557 cdef: 558 Py_ssize_t i, n = len(indices) 559 intp_t k, vstart, vlast, v 560 561 if n == 0: 562 return slice(0, 0) 563 564 vstart = indices[0] 565 if vstart < 0 or max_len <= vstart: 566 return indices 567 568 if n == 1: 569 return slice(vstart, <intp_t>(vstart + 1)) 570 571 vlast = indices[n - 1] 572 if vlast < 0 or max_len <= vlast: 573 return indices 574 575 k = indices[1] - indices[0] 576 if k == 0: 577 return indices 578 else: 579 for i in range(2, n): 580 v = indices[i] 581 if v - indices[i - 1] != k: 582 return indices 583 584 if k > 0: 585 return slice(vstart, <intp_t>(vlast + 1), k) 586 else: 587 if vlast == 0: 588 return slice(vstart, None, k) 589 else: 590 return slice(vstart, <intp_t>(vlast - 1), k) 591 592 593 @cython.wraparound(False) 594 @cython.boundscheck(False) 595 def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask): 596 cdef: 597 Py_ssize_t i, n = len(mask) 598 Py_ssize_t start = 0, end = 0 599 bint started = False, finished = False 600 601 for i in range(n): 602 if mask[i]: 603 if finished: 604 return mask.view(np.bool_) 605 if not started: 606 started = True 607 start = i 608 else: 609 if finished: 610 continue 
611 612 if started: 613 end = i 614 finished = True 615 616 if not started: 617 return slice(0, 0) 618 if not finished: 619 return slice(start, None) 620 else: 621 return slice(start, end) 622 623 624 @cython.wraparound(False) 625 @cython.boundscheck(False) 626 def array_equivalent_object(left: object[:], right: object[:]) -> bool: 627 """ 628 Perform an element by element comparison on 1-d object arrays 629 taking into account nan positions. 630 """ 631 cdef: 632 Py_ssize_t i, n = left.shape[0] 633 object x, y 634 635 for i in range(n): 636 x = left[i] 637 y = right[i] 638 639 # we are either not equal or both nan 640 # I think None == None will be true here 641 try: 642 if PyArray_Check(x) and PyArray_Check(y): 643 if not array_equivalent_object(x, y): 644 return False 645 elif (x is C_NA) ^ (y is C_NA): 646 return False 647 elif not ( 648 PyObject_RichCompareBool(x, y, Py_EQ) 649 or is_matching_na(x, y, nan_matches_none=True) 650 ): 651 return False 652 except ValueError: 653 # Avoid raising ValueError when comparing Numpy arrays to other types 654 if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): 655 # Only compare scalars to scalars and non-scalars to non-scalars 656 return False 657 elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y)) 658 and not (isinstance(x, type(y)) or isinstance(y, type(x)))): 659 # Check if non-scalars have the same type 660 return False 661 raise 662 return True 663 664 665 ctypedef fused ndarr_object: 666 ndarray[object, ndim=1] 667 ndarray[object, ndim=2] 668 669 # TODO: get rid of this in StringArray and modify 670 # and go through ensure_string_array instead 671 @cython.wraparound(False) 672 @cython.boundscheck(False) 673 def convert_nans_to_NA(ndarr_object arr) -> ndarray: 674 """ 675 Helper for StringArray that converts null values that 676 are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements 677 have already been validated as null. 
678 """ 679 cdef: 680 Py_ssize_t i, m, n 681 object val 682 ndarr_object result 683 result = np.asarray(arr, dtype="object") 684 if arr.ndim == 2: 685 m, n = arr.shape[0], arr.shape[1] 686 for i in range(m): 687 for j in range(n): 688 val = arr[i, j] 689 if not isinstance(val, str): 690 result[i, j] = <object>C_NA 691 else: 692 n = len(arr) 693 for i in range(n): 694 val = arr[i] 695 if not isinstance(val, str): 696 result[i] = <object>C_NA 697 return result 698 699 700 @cython.wraparound(False) 701 @cython.boundscheck(False) 702 cpdef ndarray[object] ensure_string_array( 703 arr, 704 object na_value=np.nan, 705 bint convert_na_value=True, 706 bint copy=True, 707 bint skipna=True, 708 ): 709 """ 710 Returns a new numpy array with object dtype and only strings and na values. 711 712 Parameters 713 ---------- 714 arr : array-like 715 The values to be converted to str, if needed. 716 na_value : Any, default np.nan 717 The value to use for na. For example, np.nan or pd.NA. 718 convert_na_value : bool, default True 719 If False, existing na values will be used unchanged in the new array. 720 copy : bool, default True 721 Whether to ensure that a new array is returned. 722 skipna : bool, default True 723 Whether or not to coerce nulls to their stringified form 724 (e.g. if False, NaN becomes 'nan'). 725 726 Returns 727 ------- 728 np.ndarray[object] 729 An array with the input array's elements casted to str or nan-like. 
730 """ 731 cdef: 732 Py_ssize_t i = 0, n = len(arr) 733 734 if hasattr(arr, "to_numpy"): 735 736 if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]: 737 # dtype check to exclude DataFrame 738 # GH#41409 TODO: not a great place for this 739 out = arr.astype(str).astype(object) 740 out[arr.isna()] = na_value 741 return out 742 743 arr = arr.to_numpy() 744 elif not util.is_array(arr): 745 arr = np.array(arr, dtype="object") 746 747 result = np.asarray(arr, dtype="object") 748 749 if copy and result is arr: 750 result = result.copy() 751 752 for i in range(n): 753 val = arr[i] 754 755 if isinstance(val, str): 756 continue 757 758 if not checknull(val): 759 if not util.is_float_object(val): 760 # f"{val}" is faster than str(val) 761 result[i] = f"{val}" 762 else: 763 # f"{val}" is not always equivalent to str(val) for floats 764 result[i] = str(val) 765 else: 766 if convert_na_value: 767 val = na_value 768 if skipna: 769 result[i] = val 770 else: 771 result[i] = f"{val}" 772 773 return result 774 775 776 def is_all_arraylike(obj: list) -> bool: 777 """ 778 Should we treat these as levels of a MultiIndex, as opposed to Index items? 779 """ 780 cdef: 781 Py_ssize_t i, n = len(obj) 782 object val 783 bint all_arrays = True 784 785 for i in range(n): 786 val = obj[i] 787 if not (isinstance(val, list) or 788 util.is_array(val) or hasattr(val, '_data')): 789 # TODO: EA? 790 # exclude tuples, frozensets as they may be contained in an Index 791 all_arrays = False 792 break 793 794 return all_arrays 795 796 797 # ------------------------------------------------------------------------------ 798 # Groupby-related functions 799 800 # TODO: could do even better if we know something about the data. eg, index has 801 # 1-min data, binner has 5-min data, then bins are just strides in index. This 802 # is a general, O(max(len(values), len(binner))) method. 
@cython.boundscheck(False)
@cython.wraparound(False)
def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner,
                       object closed='left', bint hasnans=False):
    """
    Int64 (datetime64) version of generic python version in ``groupby.py``.
    """
    cdef:
        Py_ssize_t lenidx, lenbin, i, j, bc, vc
        ndarray[int64_t, ndim=1] bins
        int64_t l_bin, r_bin, nat_count
        bint right_closed = closed == 'right'

    nat_count = 0
    if hasnans:
        mask = values == NPY_NAT
        nat_count = np.sum(mask)
        values = values[~mask]

    lenidx = len(values)
    lenbin = len(binner)

    if lenidx <= 0 or lenbin <= 0:
        raise ValueError("Invalid length for values or for binner")

    # check binner fits data
    if values[0] < binner[0]:
        raise ValueError("Values falls before first bin")

    if values[lenidx - 1] > binner[lenbin - 1]:
        raise ValueError("Values falls after last bin")

    bins = np.empty(lenbin - 1, dtype=np.int64)

    j = 0  # index into values
    bc = 0  # bin count

    # linear scan
    if right_closed:
        for i in range(0, lenbin - 1):
            r_bin = binner[i + 1]
            # count values in current bin, advance to next bin
            while j < lenidx and values[j] <= r_bin:
                j += 1
            bins[bc] = j
            bc += 1
    else:
        for i in range(0, lenbin - 1):
            r_bin = binner[i + 1]
            # count values in current bin, advance to next bin
            while j < lenidx and values[j] < r_bin:
                j += 1
            bins[bc] = j
            bc += 1

    if nat_count > 0:
        # shift bins by the number of NaT
        bins = bins + nat_count
        bins = np.insert(bins, 0, nat_count)

    return bins


@cython.boundscheck(False)
@cython.wraparound(False)
def get_level_sorter(
    ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
) -> ndarray:
    """
    Argsort for a single level of a multi-index, keeping the order of higher
    levels unchanged. `starts` points to starts of same-key indices w.r.t
    to leading levels; equivalent to:
        np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
            + starts[i] for i in range(len(starts) - 1)])

    Parameters
    ----------
    codes : np.ndarray[int64_t, ndim=1]
    starts : np.ndarray[intp, ndim=1]

    Returns
    -------
    np.ndarray[np.int, ndim=1]
    """
    cdef:
        Py_ssize_t i, l, r
        ndarray[intp_t, ndim=1] out = cnp.PyArray_EMPTY(1, codes.shape, cnp.NPY_INTP, 0)

    for i in range(len(starts) - 1):
        l, r = starts[i], starts[i + 1]
        out[l:r] = l + codes[l:r].argsort(kind='mergesort')

    return out


@cython.boundscheck(False)
@cython.wraparound(False)
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                   const intp_t[:] labels,
                   Py_ssize_t max_bin,
                   int axis):
    cdef:
        Py_ssize_t i, j, k, n
        ndarray[int64_t, ndim=2] counts

    assert (axis == 0 or axis == 1)
    n, k = (<object>mask).shape

    if axis == 0:
        counts = np.zeros((max_bin, k), dtype='i8')
        with nogil:
            for i in range(n):
                for j in range(k):
                    if mask[i, j]:
                        counts[labels[i], j] += 1

    else:  # axis == 1
        counts = np.zeros((n, max_bin), dtype='i8')
        with nogil:
            for i in range(n):
                for j in range(k):
                    if mask[i, j]:
                        counts[i, labels[j]] += 1

    return counts


@cython.wraparound(False)
@cython.boundscheck(False)
def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
    cdef:
        Py_ssize_t i, group_size, n, start
        intp_t lab
        int64_t[::1] starts, ends

    n = len(labels)

    starts = np.zeros(ngroups, dtype=np.int64)
    ends = np.zeros(ngroups, dtype=np.int64)

    start = 0
    group_size = 0
    with nogil:
        for i in range(n):
            lab = labels[i]
            if lab < 0:
                start += 1
            else:
                group_size += 1
                if i == n - 1 or lab != labels[i + 1]:
                    starts[lab] = start
                    ends[lab] = start + group_size
                    start += group_size
                    group_size = 0

    return np.asarray(starts), np.asarray(ends)


def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
                 list sorted_labels) -> dict:
    """
    Parameters
    ----------
    index : ndarray[intp]
    labels : ndarray[int64]
    keys : list
    sorted_labels : list[ndarray[int64]]
    """
    cdef:
        Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
        dict result = {}
        object tup

    k = len(keys)

    # Start at the first non-null entry
    j = 0
    for j in range(0, n):
        if labels[j] != -1:
            break
    else:
        return result
    cur = labels[j]
    start = j

    for i in range(j+1, n):
        lab = labels[i]

        if lab != cur:
            if lab != -1:
                if k == 1:
                    # When k = 1 we do not want to return a tuple as key
                    tup = keys[0][sorted_labels[0][i - 1]]
                else:
                    tup = PyTuple_New(k)
                    for j in range(k):
                        val = keys[j][sorted_labels[j][i - 1]]
                        PyTuple_SET_ITEM(tup, j, val)
                        Py_INCREF(val)
                result[tup] = index[start:i]
            start = i
            cur = lab

    if k == 1:
        # When k = 1 we do not want to return a tuple as key
        tup = keys[0][sorted_labels[0][n - 1]]
    else:
        tup = PyTuple_New(k)
        for j in range(k):
            val = keys[j][sorted_labels[j][n - 1]]
            PyTuple_SET_ITEM(tup, j, val)
            Py_INCREF(val)
    result[tup] = index[start:]

    return result


# core.common import for fast inference checks

def is_float(obj: object) -> bool:
    """
    Return True if given object is float.

    Returns
    -------
    bool
    """
    return util.is_float_object(obj)


def is_integer(obj: object) -> bool:
    """
    Return True if given object is integer.
1036 1037 Returns 1038 ------- 1039 bool 1040 """ 1041 return util.is_integer_object(obj) 1042 1043 1044 def is_bool(obj: object) -> bool: 1045 """ 1046 Return True if given object is boolean. 1047 1048 Returns 1049 ------- 1050 bool 1051 """ 1052 return util.is_bool_object(obj) 1053 1054 1055 def is_complex(obj: object) -> bool: 1056 """ 1057 Return True if given object is complex. 1058 1059 Returns 1060 ------- 1061 bool 1062 """ 1063 return util.is_complex_object(obj) 1064 1065 1066 cpdef bint is_decimal(object obj): 1067 return isinstance(obj, Decimal) 1068 1069 1070 cpdef bint is_interval(object obj): 1071 return getattr(obj, '_typ', '_typ') == 'interval' 1072 1073 1074 def is_period(val: object) -> bool: 1075 """ 1076 Return True if given object is Period. 1077 1078 Returns 1079 ------- 1080 bool 1081 """ 1082 return is_period_object(val) 1083 1084 1085 def is_list_like(obj: object, allow_sets: bool = True) -> bool: 1086 """ 1087 Check if the object is list-like. 1088 1089 Objects that are considered list-like are for example Python 1090 lists, tuples, sets, NumPy arrays, and Pandas Series. 1091 1092 Strings and datetime objects, however, are not considered list-like. 1093 1094 Parameters 1095 ---------- 1096 obj : object 1097 Object to check. 1098 allow_sets : bool, default True 1099 If this parameter is False, sets will not be considered list-like. 1100 1101 Returns 1102 ------- 1103 bool 1104 Whether `obj` has list-like properties. 
1105 1106 Examples 1107 -------- 1108 >>> import datetime 1109 >>> is_list_like([1, 2, 3]) 1110 True 1111 >>> is_list_like({1, 2, 3}) 1112 True 1113 >>> is_list_like(datetime.datetime(2017, 1, 1)) 1114 False 1115 >>> is_list_like("foo") 1116 False 1117 >>> is_list_like(1) 1118 False 1119 >>> is_list_like(np.array([2])) 1120 True 1121 >>> is_list_like(np.array(2)) 1122 False 1123 """ 1124 return c_is_list_like(obj, allow_sets) 1125 1126 1127 cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: 1128 # first, performance short-cuts for the most common cases 1129 if util.is_array(obj): 1130 # exclude zero-dimensional numpy arrays, effectively scalars 1131 return not cnp.PyArray_IsZeroDim(obj) 1132 elif isinstance(obj, list): 1133 return True 1134 # then the generic implementation 1135 return ( 1136 # equiv: `isinstance(obj, abc.Iterable)` 1137 getattr(obj, "__iter__", None) is not None and not isinstance(obj, type) 1138 # we do not count strings/unicode/bytes as list-like 1139 and not isinstance(obj, (str, bytes)) 1140 # exclude zero-dimensional duck-arrays, effectively scalars 1141 and not (hasattr(obj, "ndim") and obj.ndim == 0) 1142 # exclude sets if allow_sets is False 1143 and not (allow_sets is False and isinstance(obj, abc.Set)) 1144 ) 1145 1146 1147 _TYPE_MAP = { 1148 "categorical": "categorical", 1149 "category": "categorical", 1150 "int8": "integer", 1151 "int16": "integer", 1152 "int32": "integer", 1153 "int64": "integer", 1154 "i": "integer", 1155 "uint8": "integer", 1156 "uint16": "integer", 1157 "uint32": "integer", 1158 "uint64": "integer", 1159 "u": "integer", 1160 "float32": "floating", 1161 "float64": "floating", 1162 "f": "floating", 1163 "complex64": "complex", 1164 "complex128": "complex", 1165 "c": "complex", 1166 "string": "string", 1167 str: "string", 1168 "S": "bytes", 1169 "U": "string", 1170 "bool": "boolean", 1171 "b": "boolean", 1172 "datetime64[ns]": "datetime64", 1173 "M": "datetime64", 1174 "timedelta64[ns]": 
"timedelta64", 1175 "m": "timedelta64", 1176 "interval": "interval", 1177 Period: "period", 1178 } 1179 1180 # types only exist on certain platform 1181 try: 1182 np.float128 1183 _TYPE_MAP['float128'] = 'floating' 1184 except AttributeError: 1185 pass 1186 try: 1187 np.complex256 1188 _TYPE_MAP['complex256'] = 'complex' 1189 except AttributeError: 1190 pass 1191 try: 1192 np.float16 1193 _TYPE_MAP['float16'] = 'floating' 1194 except AttributeError: 1195 pass 1196 1197 1198 @cython.internal 1199 cdef class Seen: 1200 """ 1201 Class for keeping track of the types of elements 1202 encountered when trying to perform type conversions. 1203 """ 1204 1205 cdef: 1206 bint int_ # seen_int 1207 bint nat_ # seen nat 1208 bint bool_ # seen_bool 1209 bint null_ # seen_null 1210 bint nan_ # seen_np.nan 1211 bint uint_ # seen_uint (unsigned integer) 1212 bint sint_ # seen_sint (signed integer) 1213 bint float_ # seen_float 1214 bint object_ # seen_object 1215 bint complex_ # seen_complex 1216 bint datetime_ # seen_datetime 1217 bint coerce_numeric # coerce data to numeric 1218 bint timedelta_ # seen_timedelta 1219 bint datetimetz_ # seen_datetimetz 1220 bint period_ # seen_period 1221 bint interval_ # seen_interval 1222 1223 def __cinit__(self, bint coerce_numeric=False): 1224 """ 1225 Initialize a Seen instance. 1226 1227 Parameters 1228 ---------- 1229 coerce_numeric : bool, default False 1230 Whether or not to force conversion to a numeric data type if 1231 initial methods to convert to numeric fail. 
1232 """ 1233 self.int_ = False 1234 self.nat_ = False 1235 self.bool_ = False 1236 self.null_ = False 1237 self.nan_ = False 1238 self.uint_ = False 1239 self.sint_ = False 1240 self.float_ = False 1241 self.object_ = False 1242 self.complex_ = False 1243 self.datetime_ = False 1244 self.timedelta_ = False 1245 self.datetimetz_ = False 1246 self.period_ = False 1247 self.interval_ = False 1248 self.coerce_numeric = coerce_numeric 1249 1250 cdef inline bint check_uint64_conflict(self) except -1: 1251 """ 1252 Check whether we can safely convert a uint64 array to a numeric dtype. 1253 1254 There are two cases when conversion to numeric dtype with a uint64 1255 array is not safe (and will therefore not be performed) 1256 1257 1) A NaN element is encountered. 1258 1259 uint64 cannot be safely cast to float64 due to truncation issues 1260 at the extreme ends of the range. 1261 1262 2) A negative number is encountered. 1263 1264 There is no numerical dtype that can hold both negative numbers 1265 and numbers greater than INT64_MAX. Hence, at least one number 1266 will be improperly cast if we convert to a numeric dtype. 1267 1268 Returns 1269 ------- 1270 bool 1271 Whether or not we should return the original input array to avoid 1272 data truncation. 1273 1274 Raises 1275 ------ 1276 ValueError 1277 uint64 elements were detected, and at least one of the 1278 two conflict cases was also detected. However, we are 1279 trying to force conversion to a numeric dtype. 1280 """ 1281 return (self.uint_ and (self.null_ or self.sint_) 1282 and not self.coerce_numeric) 1283 1284 cdef inline saw_null(self): 1285 """ 1286 Set flags indicating that a null value was encountered. 1287 """ 1288 self.null_ = True 1289 self.float_ = True 1290 1291 cdef saw_int(self, object val): 1292 """ 1293 Set flags indicating that an integer value was encountered. 
1294 1295 In addition to setting a flag that an integer was seen, we 1296 also set two flags depending on the type of integer seen: 1297 1298 1) sint_ : a signed numpy integer type or a negative (signed) number in the 1299 range of [-2**63, 0) was encountered 1300 2) uint_ : an unsigned numpy integer type or a positive number in the range of 1301 [2**63, 2**64) was encountered 1302 1303 Parameters 1304 ---------- 1305 val : Python int 1306 Value with which to set the flags. 1307 """ 1308 self.int_ = True 1309 self.sint_ = ( 1310 self.sint_ 1311 or (oINT64_MIN <= val < 0) 1312 # Cython equivalent of `isinstance(val, np.signedinteger)` 1313 or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type) 1314 ) 1315 self.uint_ = ( 1316 self.uint_ 1317 or (oINT64_MAX < val <= oUINT64_MAX) 1318 # Cython equivalent of `isinstance(val, np.unsignedinteger)` 1319 or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type) 1320 ) 1321 1322 @property 1323 def numeric_(self): 1324 return self.complex_ or self.float_ or self.int_ 1325 1326 @property 1327 def is_bool(self): 1328 return not (self.datetime_ or self.numeric_ or self.timedelta_ 1329 or self.nat_) 1330 1331 @property 1332 def is_float_or_complex(self): 1333 return not (self.bool_ or self.datetime_ or self.timedelta_ 1334 or self.nat_) 1335 1336 1337 cdef object _try_infer_map(object dtype): 1338 """ 1339 If its in our map, just return the dtype. 1340 """ 1341 cdef: 1342 object val 1343 str attr 1344 for attr in ["name", "kind", "base", "type"]: 1345 val = getattr(dtype, attr, None) 1346 if val in _TYPE_MAP: 1347 return _TYPE_MAP[val] 1348 return None 1349 1350 1351 def infer_dtype(value: object, skipna: bool = True) -> str: 1352 """ 1353 Return a string label of the type of a scalar or list-like of values. 1354 1355 Parameters 1356 ---------- 1357 value : scalar, list, ndarray, or pandas type 1358 skipna : bool, default True 1359 Ignore NaN values when inferring the type. 

    Returns
    -------
    str
        Describing the common type of the input data.
        Results can include:

        - string
        - bytes
        - floating
        - integer
        - mixed-integer
        - mixed-integer-float
        - decimal
        - complex
        - categorical
        - boolean
        - datetime64
        - datetime
        - date
        - timedelta64
        - timedelta
        - time
        - period
        - mixed
        - unknown-array

    Raises
    ------
    TypeError
        If ndarray-like but cannot infer the dtype

    Notes
    -----
    - 'mixed' is the catchall for anything that is not otherwise
      specialized
    - 'mixed-integer-float' are floats and integers
    - 'mixed-integer' are integers mixed with non-integers
    - 'unknown-array' is the catchall for something that *is* an array (has
      a dtype attribute), but has a dtype unknown to pandas (e.g. external
      extension array)

    Examples
    --------
    >>> import datetime
    >>> infer_dtype(['foo', 'bar'])
    'string'

    >>> infer_dtype(['a', np.nan, 'b'], skipna=True)
    'string'

    >>> infer_dtype(['a', np.nan, 'b'], skipna=False)
    'mixed'

    >>> infer_dtype([b'foo', b'bar'])
    'bytes'

    >>> infer_dtype([1, 2, 3])
    'integer'

    >>> infer_dtype([1, 2, 3.5])
    'mixed-integer-float'

    >>> infer_dtype([1.0, 2.0, 3.5])
    'floating'

    >>> infer_dtype(['a', 1])
    'mixed-integer'

    >>> infer_dtype([Decimal(1), Decimal(2.0)])
    'decimal'

    >>> infer_dtype([True, False])
    'boolean'

    >>> infer_dtype([True, False, np.nan])
    'boolean'

    >>> infer_dtype([pd.Timestamp('20130101')])
    'datetime'

    >>> infer_dtype([datetime.date(2013, 1, 1)])
    'date'

    >>> infer_dtype([np.datetime64('2013-01-01')])
    'datetime64'

    >>> infer_dtype([datetime.timedelta(0, 1, 1)])
    'timedelta'

    >>> infer_dtype(pd.Series(list('aabc')).astype('category'))
    'categorical'
    """
    cdef:
        Py_ssize_t i, n
        object val
        ndarray values
        bint seen_pdnat = False
        bint seen_val = False
        flatiter it

    if util.is_array(value):
        values = value
    elif hasattr(value, "inferred_type") and skipna is False:
        # Index, use the cached attribute if possible, populate the cache otherwise
        return value.inferred_type
    elif hasattr(value, "dtype"):
        # this will handle ndarray-like
        # e.g. categoricals
        dtype = value.dtype
        if not cnp.PyArray_DescrCheck(dtype):
            # i.e. not isinstance(dtype, np.dtype)
            inferred = _try_infer_map(value.dtype)
            if inferred is not None:
                return inferred
            return "unknown-array"

        # Unwrap Series/Index
        values = np.asarray(value)

    else:
        if not isinstance(value, list):
            value = list(value)

        from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
        values = construct_1d_object_array_from_listlike(value)

    val = _try_infer_map(values.dtype)
    if val is not None:
        # Anything other than object-dtype should return here.
        return val

    if values.descr.type_num != NPY_OBJECT:
        # i.e. values.dtype != np.object
        # This should not be reached
        values = values.astype(object)

    n = cnp.PyArray_SIZE(values)
    if n == 0:
        return "empty"

    # Iterate until we find our first valid value. We will use this
    # value to decide which of the is_foo_array functions to call.
    it = PyArray_IterNew(values)
    for i in range(n):
        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
        # equivalents to `val = values[i]`
        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
        PyArray_ITER_NEXT(it)

        # do not use checknull to keep
        # np.datetime64('nat') and np.timedelta64('nat')
        if val is None or util.is_nan(val) or val is C_NA:
            pass
        elif val is NaT:
            seen_pdnat = True
        else:
            seen_val = True
            break

    # if all values are nan/NaT
    if seen_val is False and seen_pdnat is True:
        return "datetime"
    # float/object nan is handled in latter logic
    if seen_val is False and skipna:
        return "empty"

    # Dispatch on the first non-null value found above; each branch then
    # verifies the whole array with the matching is_foo_array helper.
    if util.is_datetime64_object(val):
        if is_datetime64_array(values, skipna=skipna):
            return "datetime64"

    elif is_timedelta(val):
        if is_timedelta_or_timedelta64_array(values, skipna=skipna):
            return "timedelta"

    elif util.is_integer_object(val):
        # ordering matters here; this check must come after the is_timedelta
        # check otherwise numpy timedelta64 objects would come through here

        if is_integer_array(values, skipna=skipna):
            return "integer"
        elif is_integer_float_array(values, skipna=skipna):
            if is_integer_na_array(values, skipna=skipna):
                return "integer-na"
            else:
                return "mixed-integer-float"
        return "mixed-integer"

    elif PyDateTime_Check(val):
        if is_datetime_array(values, skipna=skipna):
            return "datetime"
        elif is_date_array(values, skipna=skipna):
            return "date"

    elif PyDate_Check(val):
        if is_date_array(values, skipna=skipna):
            return "date"

    elif PyTime_Check(val):
        if is_time_array(values, skipna=skipna):
            return "time"

    elif is_decimal(val):
        if is_decimal_array(values, skipna=skipna):
            return "decimal"

    elif util.is_complex_object(val):
        if is_complex_array(values):
            return "complex"

    elif util.is_float_object(val):
        if is_float_array(values):
            return "floating"
        elif is_integer_float_array(values, skipna=skipna):
            if is_integer_na_array(values, skipna=skipna):
                return "integer-na"
            else:
                return "mixed-integer-float"

    elif util.is_bool_object(val):
        if is_bool_array(values, skipna=skipna):
            return "boolean"

    elif isinstance(val, str):
        if is_string_array(values, skipna=skipna):
            return "string"

    elif isinstance(val, bytes):
        if is_bytes_array(values, skipna=skipna):
            return "bytes"

    elif is_period_object(val):
        if is_period_array(values, skipna=skipna):
            return "period"

    elif is_interval(val):
        if is_interval_array(values):
            return "interval"

    # No homogeneous dtype matched: rescan from the start; any integer among
    # the otherwise-unmatched values yields "mixed-integer".
    cnp.PyArray_ITER_RESET(it)
    for i in range(n):
        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
        PyArray_ITER_NEXT(it)

        if util.is_integer_object(val):
            return "mixed-integer"

    return "mixed"


def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
    """
    Infer if we have a datetime or timedelta array.
    - date: we have *only* date and maybe strings, nulls
    - datetime: we have *only* datetimes and maybe strings, nulls
    - timedelta: we have *only* timedeltas and maybe strings, nulls
    - nat: we do not have *any* date, datetimes or timedeltas, but do have
      at least a NaT
    - mixed: other objects (strings, a mix of tz-aware and tz-naive, or
      actual objects)

    Parameters
    ----------
    arr : ndarray[object]

    Returns
    -------
    str: {datetime, timedelta, date, nat, mixed}
    bool
    """
    cdef:
        Py_ssize_t i, n = len(arr)
        bint seen_timedelta = False, seen_date = False, seen_datetime = False
        bint seen_tz_aware = False, seen_tz_naive = False
        bint seen_nat = False, seen_str = False
        bint seen_period = False, seen_interval = False
        list objs = []
        object v

    for i in range(n):
        v = arr[i]
        if isinstance(v, str):
            objs.append(v)
            seen_str = True

            # Keep at most 3 strings for the parse attempt further below.
            if len(objs) == 3:
                break

        elif v is None or util.is_nan(v):
            # nan or None
            pass
        elif v is NaT:
            seen_nat = True
        elif PyDateTime_Check(v):
            # datetime
            seen_datetime = True

            # disambiguate between tz-naive and tz-aware
            if v.tzinfo is None:
                seen_tz_naive = True
            else:
                seen_tz_aware = True

            if seen_tz_naive and seen_tz_aware:
                return "mixed", seen_str
        elif util.is_datetime64_object(v):
            # np.datetime64
            seen_datetime = True
        elif PyDate_Check(v):
            seen_date = True
        elif is_timedelta(v):
            # timedelta, or timedelta64
            seen_timedelta = True
        elif is_period_object(v):
            seen_period = True
            break
        elif is_interval(v):
            seen_interval = True
            break
        else:
            return "mixed", seen_str

    if seen_period:
        if is_period_array(arr):
            return "period", seen_str
        return "mixed", seen_str

    if seen_interval:
        if is_interval_array(arr):
            return "interval", seen_str
        return "mixed", seen_str

    if seen_date and not (seen_datetime or seen_timedelta):
        return "date", seen_str
    elif seen_datetime and not seen_timedelta:
        return "datetime", seen_str
    elif seen_timedelta and not seen_datetime:
        return "timedelta", seen_str
    elif seen_nat:
        return "nat", seen_str

    # short-circuit by trying to
    # actually convert these strings
    # this is for performance as we don't need to try
    # convert *every* string array
    if len(objs):
        try:
            # require_iso8601 as in maybe_infer_to_datetimelike
            array_to_datetime(objs, errors="raise", require_iso8601=True)
            return "datetime", seen_str
        except (ValueError, TypeError):
            pass

    # we are *not* going to infer from strings
    # for timedelta as too much ambiguity

    return "mixed", seen_str


cdef inline bint is_timedelta(object o):
    # True for both datetime.timedelta and np.timedelta64 instances.
    return PyDelta_Check(o) or util.is_timedelta64_object(o)


@cython.internal
cdef class Validator:
    """
    Base class for checking whether every element of an object ndarray is of
    one expected type.  Subclasses override is_value_typed (per-element check)
    and optionally is_array_typed (whole-array dtype fastpath).
    """

    cdef:
        Py_ssize_t n
        dtype dtype
        bint skipna

    def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
                  bint skipna=False):
        self.n = n
        self.dtype = dtype
        self.skipna = skipna

    cdef bint validate(self, ndarray values) except -1:
        # Empty input never validates.
        if not self.n:
            return False

        if self.is_array_typed():
            # i.e. this ndarray is already of the desired dtype
            return True
        elif self.dtype.type_num == NPY_OBJECT:
            if self.skipna:
                return self._validate_skipna(values)
            else:
                return self._validate(values)
        else:
            return False

    @cython.wraparound(False)
    @cython.boundscheck(False)
    cdef bint _validate(self, ndarray values) except -1:
        cdef:
            Py_ssize_t i
            Py_ssize_t n = values.size
            flatiter it = PyArray_IterNew(values)

        for i in range(n):
            # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
            # equivalents to `val = values[i]`
            val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
            PyArray_ITER_NEXT(it)
            if not self.is_valid(val):
                return False

        return True

    @cython.wraparound(False)
    @cython.boundscheck(False)
    cdef bint _validate_skipna(self, ndarray values) except -1:
        # Same as _validate, but nulls are also accepted (is_valid_skipna).
        cdef:
            Py_ssize_t i
            Py_ssize_t n = values.size
            flatiter it = PyArray_IterNew(values)

        for i in range(n):
            # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
            # equivalents to `val = values[i]`
            val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
            PyArray_ITER_NEXT(it)
            if not self.is_valid_skipna(val):
                return False

        return True

    cdef bint is_valid(self, object value) except -1:
        return self.is_value_typed(value)

    cdef bint is_valid_skipna(self, object value) except -1:
        return self.is_valid(value) or self.is_valid_null(value)

    cdef bint is_value_typed(self, object value) except -1:
        raise NotImplementedError(f"{type(self).__name__} child class "
                                  "must define is_value_typed")

    cdef bint is_valid_null(self, object value) except -1:
        return value is None or value is C_NA or util.is_nan(value)

    cdef bint is_array_typed(self) except -1:
        return False


@cython.internal
cdef class BoolValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return util.is_bool_object(value)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.bool_)


cpdef bint is_bool_array(ndarray values, bint skipna=False):
    cdef:
        BoolValidator validator = BoolValidator(len(values),
                                                values.dtype,
                                                skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class IntegerValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return util.is_integer_object(value)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.integer)


# Note: only python-exposed for tests
cpdef bint is_integer_array(ndarray values, bint skipna=True):
    cdef:
        IntegerValidator validator = IntegerValidator(len(values),
                                                      values.dtype,
                                                      skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class IntegerNaValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        # Integers, or float-NaN acting as a missing value marker.
        return (util.is_integer_object(value)
                or (util.is_nan(value) and util.is_float_object(value)))


cdef bint is_integer_na_array(ndarray values, bint skipna=True):
    cdef:
        IntegerNaValidator validator = IntegerNaValidator(len(values),
                                                          values.dtype, skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class IntegerFloatValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return util.is_integer_object(value) or util.is_float_object(value)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.integer)


cdef bint is_integer_float_array(ndarray values, bint skipna=True):
    cdef:
        IntegerFloatValidator validator = IntegerFloatValidator(len(values),
                                                                values.dtype,
                                                                skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class FloatValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return util.is_float_object(value)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.floating)


# Note: only python-exposed for tests
cpdef bint is_float_array(ndarray values):
    cdef:
        FloatValidator validator = FloatValidator(len(values), values.dtype)
    return validator.validate(values)


@cython.internal
cdef class ComplexValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        # float NaN is accepted as a missing value within a complex array.
        return (
            util.is_complex_object(value)
            or (util.is_float_object(value) and is_nan(value))
        )

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.complexfloating)


cdef bint is_complex_array(ndarray values):
    cdef:
        ComplexValidator validator = ComplexValidator(len(values), values.dtype)
    return validator.validate(values)


@cython.internal
cdef class DecimalValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return is_decimal(value)


cdef bint is_decimal_array(ndarray values, bint skipna=False):
    cdef:
        DecimalValidator validator = DecimalValidator(
            len(values), values.dtype, skipna=skipna
        )
    return validator.validate(values)


@cython.internal
cdef class StringValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return isinstance(value, str)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.str_)


cpdef bint is_string_array(ndarray values, bint skipna=False):
    cdef:
        StringValidator validator = StringValidator(len(values),
                                                    values.dtype,
                                                    skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class BytesValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return isinstance(value, bytes)

    cdef inline bint is_array_typed(self) except -1:
        return issubclass(self.dtype.type, np.bytes_)


cdef bint is_bytes_array(ndarray values, bint skipna=False):
    cdef:
        BytesValidator validator = BytesValidator(len(values), values.dtype,
                                                  skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class TemporalValidator(Validator):
    cdef:
        # True until a value other than a generic NA (None/NaN) is seen.
        bint all_generic_na

    def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
                  bint skipna=False):
        self.n = n
        self.dtype = dtype
        self.skipna = skipna
        self.all_generic_na = True

    cdef inline bint is_valid(self, object value) except -1:
        # Temporal arrays accept dtype-specific nulls even when skipna=False.
        return self.is_value_typed(value) or self.is_valid_null(value)

    cdef bint is_valid_null(self, object value) except -1:
        raise NotImplementedError(f"{type(self).__name__} child class "
                                  "must define is_valid_null")

    cdef inline bint is_valid_skipna(self, object value) except -1:
        cdef:
            bint is_typed_null = self.is_valid_null(value)
            bint is_generic_null = value is None or util.is_nan(value)
        if not is_generic_null:
            self.all_generic_na = False
        return self.is_value_typed(value) or is_typed_null or is_generic_null

    cdef bint _validate_skipna(self, ndarray values) except -1:
        """
        If we _only_ saw non-dtype-specific NA values, even if they are valid
        for this dtype, we do not infer this dtype.
        """
        return Validator._validate_skipna(self, values) and not self.all_generic_na


@cython.internal
cdef class DatetimeValidator(TemporalValidator):
    cdef bint is_value_typed(self, object value) except -1:
        return PyDateTime_Check(value)

    cdef inline bint is_valid_null(self, object value) except -1:
        return is_null_datetime64(value)


cpdef bint is_datetime_array(ndarray values, bint skipna=True):
    cdef:
        DatetimeValidator validator = DatetimeValidator(len(values),
                                                        skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class Datetime64Validator(DatetimeValidator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return util.is_datetime64_object(value)


# Note: only python-exposed for tests
cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
    cdef:
        Datetime64Validator validator = Datetime64Validator(len(values),
                                                            skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class AnyDatetimeValidator(DatetimeValidator):
    cdef inline bint is_value_typed(self, object value) except -1:
        # np.datetime64, or tz-naive datetime (tz-aware is excluded here).
        return util.is_datetime64_object(value) or (
            PyDateTime_Check(value) and value.tzinfo is None
        )


cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
    cdef:
        AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
                                                              skipna=skipna)
    return validator.validate(values)


# Note: only python-exposed for tests
def is_datetime_with_singletz_array(values: ndarray) -> bool:
    """
    Check values have the same tzinfo attribute.
    Doesn't check values are datetime-like types.
    """
    cdef:
        Py_ssize_t i = 0, j, n = len(values)
        object base_val, base_tz, val, tz

    if n == 0:
        return False

    # Get a reference timezone to compare with the rest of the tzs in the array
    for i in range(n):
        base_val = values[i]
        if base_val is not NaT and base_val is not None and not util.is_nan(base_val):
            base_tz = getattr(base_val, 'tzinfo', None)
            break

    # Second loop starts from the first non-NA index found above.
    for j in range(i, n):
        # Compare val's timezone with the reference timezone
        # NaT can coexist with tz-aware datetimes, so skip if encountered
        val = values[j]
        if val is not NaT and val is not None and not util.is_nan(val):
            tz = getattr(val, 'tzinfo', None)
            if not tz_compare(base_tz, tz):
                return False

    # Note: we should only be called if a tzaware datetime has been seen,
    # so base_tz should always be set at this point.
    return True


@cython.internal
cdef class TimedeltaValidator(TemporalValidator):
    cdef bint is_value_typed(self, object value) except -1:
        return PyDelta_Check(value)

    cdef inline bint is_valid_null(self, object value) except -1:
        return is_null_timedelta64(value)


@cython.internal
cdef class AnyTimedeltaValidator(TimedeltaValidator):
    cdef inline bint is_value_typed(self, object value) except -1:
        # Accepts both datetime.timedelta and np.timedelta64.
        return is_timedelta(value)


# Note: only python-exposed for tests
cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
    """
    Infer with timedeltas and/or nat/none.
    """
    cdef:
        AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
                                                                skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class DateValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return PyDate_Check(value)


# Note: only python-exposed for tests
cpdef bint is_date_array(ndarray values, bint skipna=False):
    cdef:
        DateValidator validator = DateValidator(len(values), skipna=skipna)
    return validator.validate(values)


@cython.internal
cdef class TimeValidator(Validator):
    cdef inline bint is_value_typed(self, object value) except -1:
        return PyTime_Check(value)


# Note: only python-exposed for tests
cpdef bint is_time_array(ndarray values, bint skipna=False):
    cdef:
        TimeValidator validator = TimeValidator(len(values), skipna=skipna)
    return validator.validate(values)


# FIXME: actually use skipna
cdef bint is_period_array(ndarray values, bint skipna=True):
    """
    Is this an ndarray of Period objects (or NaT) with a single `freq`?
    """
    # values should be object-dtype, but ndarray[object] assumes 1D, while
    # this _may_ be 2D.
    cdef:
        Py_ssize_t i, N = values.size
        # Sentinel meaning "no freq seen yet".
        int dtype_code = -10000  # i.e. c_FreqGroup.FR_UND
        object val
        flatiter it

    if N == 0:
        return False

    it = PyArray_IterNew(values)
    for i in range(N):
        # The PyArray_GETITEM and PyArray_ITER_NEXT are faster
        # equivalents to `val = values[i]`
        val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
        PyArray_ITER_NEXT(it)

        if is_period_object(val):
            if dtype_code == -10000:
                # First Period fixes the expected freq for the rest.
                dtype_code = val._dtype._dtype_code
            elif dtype_code != val._dtype._dtype_code:
                # mismatched freqs
                return False
        elif checknull_with_nat(val):
            pass
        else:
            # Not a Period or NaT-like
            return False

    if dtype_code == -10000:
        # we saw all-NaTs, no actual Periods
        return False
    return True


# Note: only python-exposed for tests
cpdef bint is_interval_array(ndarray values):
    """
    Is this an ndarray of Interval (or np.nan) with a single dtype?
    """
    cdef:
        Py_ssize_t i, n = len(values)
        str closed = None
        bint numeric = False
        bint dt64 = False
        bint td64 = False
        object val

    if len(values) == 0:
        return False

    for i in range(n):
        val = values[i]

        if is_interval(val):
            if closed is None:
                # First Interval fixes the closed side and the kind of the
                # left endpoint (numeric / timedelta / datetime).
                closed = val.closed
                numeric = (
                    util.is_float_object(val.left)
                    or util.is_integer_object(val.left)
                )
                td64 = is_timedelta(val.left)
                dt64 = PyDateTime_Check(val.left)
            elif val.closed != closed:
                # mismatched closedness
                return False
            elif numeric:
                if not (
                    util.is_float_object(val.left)
                    or util.is_integer_object(val.left)
                ):
                    # i.e. datetime64 or timedelta64
                    return False
            elif td64:
                if not is_timedelta(val.left):
                    return False
            elif dt64:
                if not PyDateTime_Check(val.left):
                    return False
            else:
                raise ValueError(val)
        elif util.is_nan(val) or val is None:
            pass
        else:
            return False

    if closed is None:
        # we saw all-NAs, no actual Intervals
        return False
    return True


@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_numeric(
    ndarray[object, ndim=1] values,
    set na_values,
    bint convert_empty=True,
    bint coerce_numeric=False,
    bint convert_to_masked_nullable=False,
) -> tuple[np.ndarray, np.ndarray | None]:
    """
    Convert object array to a numeric array if possible.

    Parameters
    ----------
    values : ndarray[object]
        Array of object elements to convert.
    na_values : set
        Set of values that should be interpreted as NaN.
    convert_empty : bool, default True
        If an empty array-like object is encountered, whether to interpret
        that element as NaN or not. If set to False, a ValueError will be
        raised if such an element is encountered and 'coerce_numeric' is False.
    coerce_numeric : bool, default False
        If initial attempts to convert to numeric have failed, whether to
        force conversion to numeric via alternative methods or by setting the
        element to NaN. Otherwise, an Exception will be raised when such an
        element is encountered.

        This boolean also has an impact on how conversion behaves when a
        numeric array has no suitable numerical dtype to return (i.e. uint64,
        int32, uint8). If set to False, the original object array will be
        returned. Otherwise, a ValueError will be raised.
    convert_to_masked_nullable : bool, default False
        Whether to return a mask for the converted values. This also disables
        upcasting for ints with nulls to float64.
    Returns
    -------
    np.ndarray
        Array of converted object values to numerical ones.

    Optional[np.ndarray]
        If convert_to_masked_nullable is True,
        returns a boolean mask for the converted values, otherwise returns None.
    """
    if len(values) == 0:
        return (np.array([], dtype='i8'), None)

    # fastpath for ints - try to convert all based on first value
    cdef:
        object val = values[0]

    if util.is_integer_object(val):
        try:
            maybe_ints = values.astype('i8')
            if (maybe_ints == values).all():
                return (maybe_ints, None)
        except (ValueError, OverflowError, TypeError):
            pass

    # Otherwise, iterate and do full inference.
    cdef:
        int status, maybe_int
        Py_ssize_t i, n = values.size
        Seen seen = Seen(coerce_numeric)
        ndarray[float64_t, ndim=1] floats = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_FLOAT64, 0)
        ndarray[complex128_t, ndim=1] complexes = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_COMPLEX128, 0)
        ndarray[int64_t, ndim=1] ints = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_INT64, 0)
        ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_UINT64, 0)
        ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_UINT8, 0)
        ndarray[uint8_t, ndim=1] mask = np.zeros(n, dtype="u1")
        float64_t fval
        bint allow_null_in_int = convert_to_masked_nullable

    for i in range(n):
        val = values[i]
        # We only want to disable NaNs showing as float if
        # a) convert_to_masked_nullable = True
        # b) no floats have been seen ( assuming an int shows up later )
        # However, if no ints present (all null array), we need to return floats
        allow_null_in_int = convert_to_masked_nullable and not seen.float_

        if val.__hash__ is not None and val in na_values:
            if allow_null_in_int:
                seen.null_ = True
                mask[i] = 1
            else:
                if convert_to_masked_nullable:
                    mask[i] = 1
                seen.saw_null()
            floats[i] = complexes[i] = NaN
        elif util.is_float_object(val):
            fval = val
            # fval != fval is the NaN check.
            if fval != fval:
                seen.null_ = True
                if allow_null_in_int:
                    mask[i] = 1
                else:
                    if convert_to_masked_nullable:
                        mask[i] = 1
                    seen.float_ = True
            else:
                seen.float_ = True
            floats[i] = complexes[i] = fval
        elif util.is_integer_object(val):
            floats[i] = complexes[i] = val

            val = int(val)
            seen.saw_int(val)

            if val >= 0:
                if val <= oUINT64_MAX:
                    uints[i] = val
                else:
                    seen.float_ = True

            if oINT64_MIN <= val <= oINT64_MAX:
                ints[i] = val

            if val < oINT64_MIN or (seen.sint_ and seen.uint_):
                seen.float_ = True

        elif util.is_bool_object(val):
            floats[i] = uints[i] = ints[i] = bools[i] = val
            seen.bool_ = True
        elif val is None or val is C_NA:
            if allow_null_in_int:
                seen.null_ = True
                mask[i] = 1
            else:
                if convert_to_masked_nullable:
                    mask[i] = 1
                seen.saw_null()
            floats[i] = complexes[i] = NaN
        elif hasattr(val, '__len__') and len(val) == 0:
            if convert_empty or seen.coerce_numeric:
                seen.saw_null()
                floats[i] = complexes[i] = NaN
            else:
                raise ValueError("Empty string encountered")
        elif util.is_complex_object(val):
            complexes[i] = val
            seen.complex_ = True
        elif is_decimal(val):
            floats[i] = complexes[i] = val
            seen.float_ = True
        else:
            # Fall back to parsing the value (e.g. a string) as a float.
            try:
                status = floatify(val, &fval, &maybe_int)

                if fval in na_values:
                    seen.saw_null()
                    floats[i] = complexes[i] = NaN
                    mask[i] = 1
                else:
                    if fval != fval:
                        seen.null_ = True
                        mask[i] = 1

                    floats[i] = fval

                if maybe_int:
                    as_int = int(val)

                    if as_int in na_values:
                        mask[i] = 1
                        seen.null_ = True
                        if not allow_null_in_int:
                            seen.float_ = True
                    else:
                        seen.saw_int(as_int)

                    if as_int not in na_values:
                        if as_int < oINT64_MIN or as_int > oUINT64_MAX:
                            if seen.coerce_numeric:
                                seen.float_ = True
                            else:
                                raise ValueError("Integer out of range.")
                        else:
                            if as_int >= 0:
                                uints[i] = as_int

                            if as_int <= oINT64_MAX:
                                ints[i] = as_int

                    seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
                else:
                    seen.float_ = True
            except (TypeError, ValueError) as err:
                if not seen.coerce_numeric:
                    raise type(err)(f"{err} at position {i}")

                seen.saw_null()
                floats[i] = NaN

    # uint64 mixed with negatives or nulls has no single numpy dtype;
    # return the original object array in that case (see check_uint64_conflict).
    if seen.check_uint64_conflict():
        return (values, None)

    # This occurs since we disabled float nulls showing as null in anticipation
    # of seeing ints that were never seen. So then, we return float
    if allow_null_in_int and seen.null_ and not seen.int_:
        seen.float_ = True

    if seen.complex_:
        return (complexes, None)
    elif seen.float_:
        if seen.null_ and convert_to_masked_nullable:
            return (floats, mask.view(np.bool_))
        return (floats, None)
    elif seen.int_:
        if seen.null_ and convert_to_masked_nullable:
            if seen.uint_:
                return (uints, mask.view(np.bool_))
            else:
                return (ints, mask.view(np.bool_))
        if seen.uint_:
            return (uints, None)
        else:
            return (ints, None)
    elif seen.bool_:
        return (bools.view(np.bool_), None)
    elif seen.uint_:
        return (uints, None)
    return (ints, None)


@cython.boundscheck(False)
@cython.wraparound(False)
def maybe_convert_objects(ndarray[object] objects,
                          *,
                          bint try_float=False,
                          bint safe=False,
                          bint convert_datetime=False,
                          bint convert_timedelta=False,
                          bint convert_period=False,
                          bint convert_interval=False,
                          bint convert_to_nullable_integer=False,
                          object dtype_if_all_nat=None) -> "ArrayLike":
    """
    Type inference function-- convert object array to proper dtype

    Parameters
---------- 2458 objects : ndarray[object] 2459 Array of object elements to convert. 2460 try_float : bool, default False 2461 If an array-like object contains only float or NaN values is 2462 encountered, whether to convert and return an array of float dtype. 2463 safe : bool, default False 2464 Whether to upcast numeric type (e.g. int cast to float). If set to 2465 True, no upcasting will be performed. 2466 convert_datetime : bool, default False 2467 If an array-like object contains only datetime values or NaT is 2468 encountered, whether to convert and return an array of M8[ns] dtype. 2469 convert_timedelta : bool, default False 2470 If an array-like object contains only timedelta values or NaT is 2471 encountered, whether to convert and return an array of m8[ns] dtype. 2472 convert_period : bool, default False 2473 If an array-like object contains only (homogeneous-freq) Period values 2474 or NaT, whether to convert and return a PeriodArray. 2475 convert_interval : bool, default False 2476 If an array-like object contains only Interval objects (with matching 2477 dtypes and closedness) or NaN, whether to convert to IntervalArray. 2478 convert_to_nullable_integer : bool, default False 2479 If an array-like object contains only integer values (and NaN) is 2480 encountered, whether to convert and return an IntegerArray. 2481 dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None 2482 Dtype to cast to if we have all-NaT. 2483 2484 Returns 2485 ------- 2486 np.ndarray or ExtensionArray 2487 Array of converted object values to more specific dtypes if applicable. 
2488 """ 2489 cdef: 2490 Py_ssize_t i, n, itemsize_max = 0 2491 ndarray[float64_t] floats 2492 ndarray[complex128_t] complexes 2493 ndarray[int64_t] ints 2494 ndarray[uint64_t] uints 2495 ndarray[uint8_t] bools 2496 int64_t[::1] idatetimes 2497 int64_t[::1] itimedeltas 2498 Seen seen = Seen() 2499 object val 2500 float64_t fval, fnan = np.nan 2501 2502 n = len(objects) 2503 2504 floats = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_FLOAT64, 0) 2505 complexes = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_COMPLEX128, 0) 2506 ints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_INT64, 0) 2507 uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0) 2508 bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0) 2509 mask = np.full(n, False) 2510 2511 if convert_datetime: 2512 datetimes = np.empty(n, dtype='M8[ns]') 2513 idatetimes = datetimes.view(np.int64) 2514 2515 if convert_timedelta: 2516 timedeltas = np.empty(n, dtype='m8[ns]') 2517 itimedeltas = timedeltas.view(np.int64) 2518 2519 for i in range(n): 2520 val = objects[i] 2521 if itemsize_max != -1: 2522 itemsize = get_itemsize(val) 2523 if itemsize > itemsize_max or itemsize == -1: 2524 itemsize_max = itemsize 2525 2526 if val is None: 2527 seen.null_ = True 2528 floats[i] = complexes[i] = fnan 2529 mask[i] = True 2530 elif val is NaT: 2531 seen.nat_ = True 2532 if convert_datetime: 2533 idatetimes[i] = NPY_NAT 2534 if convert_timedelta: 2535 itimedeltas[i] = NPY_NAT 2536 if not (convert_datetime or convert_timedelta or convert_period): 2537 seen.object_ = True 2538 break 2539 elif val is np.nan: 2540 seen.nan_ = True 2541 mask[i] = True 2542 floats[i] = complexes[i] = val 2543 elif util.is_bool_object(val): 2544 seen.bool_ = True 2545 bools[i] = val 2546 elif util.is_float_object(val): 2547 floats[i] = complexes[i] = val 2548 seen.float_ = True 2549 elif is_timedelta(val): 2550 if convert_timedelta: 2551 seen.timedelta_ = True 2552 try: 2553 itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") 
2554 except OutOfBoundsTimedelta: 2555 seen.object_ = True 2556 break 2557 break 2558 else: 2559 seen.object_ = True 2560 break 2561 elif util.is_integer_object(val): 2562 seen.int_ = True 2563 floats[i] = <float64_t>val 2564 complexes[i] = <double complex>val 2565 if not seen.null_: 2566 seen.saw_int(val) 2567 2568 if ((seen.uint_ and seen.sint_) or 2569 val > oUINT64_MAX or val < oINT64_MIN): 2570 seen.object_ = True 2571 break 2572 2573 if seen.uint_: 2574 uints[i] = val 2575 elif seen.sint_: 2576 ints[i] = val 2577 else: 2578 uints[i] = val 2579 ints[i] = val 2580 2581 elif util.is_complex_object(val): 2582 complexes[i] = val 2583 seen.complex_ = True 2584 elif PyDateTime_Check(val) or util.is_datetime64_object(val): 2585 2586 # if we have an tz's attached then return the objects 2587 if convert_datetime: 2588 if getattr(val, 'tzinfo', None) is not None: 2589 seen.datetimetz_ = True 2590 break 2591 else: 2592 seen.datetime_ = True 2593 try: 2594 idatetimes[i] = convert_to_tsobject( 2595 val, None, None, 0, 0).value 2596 except OutOfBoundsDatetime: 2597 seen.object_ = True 2598 break 2599 else: 2600 seen.object_ = True 2601 break 2602 elif is_period_object(val): 2603 if convert_period: 2604 seen.period_ = True 2605 break 2606 else: 2607 seen.object_ = True 2608 break 2609 elif try_float and not isinstance(val, str): 2610 # this will convert Decimal objects 2611 try: 2612 floats[i] = float(val) 2613 complexes[i] = complex(val) 2614 seen.float_ = True 2615 except (ValueError, TypeError): 2616 seen.object_ = True 2617 break 2618 elif is_interval(val): 2619 if convert_interval: 2620 seen.interval_ = True 2621 break 2622 else: 2623 seen.object_ = True 2624 break 2625 else: 2626 seen.object_ = True 2627 break 2628 2629 # we try to coerce datetime w/tz but must all have the same tz 2630 if seen.datetimetz_: 2631 if is_datetime_with_singletz_array(objects): 2632 from pandas import DatetimeIndex 2633 dti = DatetimeIndex(objects) 2634 2635 # unbox to DatetimeArray 2636 
return dti._data 2637 seen.object_ = True 2638 2639 elif seen.datetime_: 2640 if is_datetime_or_datetime64_array(objects): 2641 from pandas import DatetimeIndex 2642 2643 try: 2644 dti = DatetimeIndex(objects) 2645 except OutOfBoundsDatetime: 2646 pass 2647 else: 2648 # unbox to ndarray[datetime64[ns]] 2649 return dti._data._ndarray 2650 seen.object_ = True 2651 2652 elif seen.timedelta_: 2653 if is_timedelta_or_timedelta64_array(objects): 2654 from pandas import TimedeltaIndex 2655 2656 try: 2657 tdi = TimedeltaIndex(objects) 2658 except OutOfBoundsTimedelta: 2659 pass 2660 else: 2661 # unbox to ndarray[timedelta64[ns]] 2662 return tdi._data._ndarray 2663 seen.object_ = True 2664 2665 if seen.period_: 2666 if is_period_array(objects): 2667 from pandas import PeriodIndex 2668 pi = PeriodIndex(objects) 2669 2670 # unbox to PeriodArray 2671 return pi._data 2672 seen.object_ = True 2673 2674 if seen.interval_: 2675 if is_interval_array(objects): 2676 from pandas import IntervalIndex 2677 ii = IntervalIndex(objects) 2678 2679 # unbox to IntervalArray 2680 return ii._data 2681 2682 seen.object_ = True 2683 2684 if not seen.object_: 2685 result = None 2686 if not safe: 2687 if seen.null_ or seen.nan_: 2688 if seen.is_float_or_complex: 2689 if seen.complex_: 2690 result = complexes 2691 elif seen.float_: 2692 result = floats 2693 elif seen.int_: 2694 if convert_to_nullable_integer: 2695 from pandas.core.arrays import IntegerArray 2696 result = IntegerArray(ints, mask) 2697 else: 2698 result = floats 2699 elif seen.nan_: 2700 result = floats 2701 else: 2702 if not seen.bool_: 2703 if seen.datetime_: 2704 if not seen.numeric_ and not seen.timedelta_: 2705 result = datetimes 2706 elif seen.timedelta_: 2707 if not seen.numeric_: 2708 result = timedeltas 2709 elif seen.nat_: 2710 if not seen.numeric_: 2711 if convert_datetime and convert_timedelta: 2712 dtype = dtype_if_all_nat 2713 if dtype is not None: 2714 # otherwise we keep object dtype 2715 result = _infer_all_nats( 2716 
dtype, datetimes, timedeltas 2717 ) 2718 2719 elif convert_datetime: 2720 result = datetimes 2721 elif convert_timedelta: 2722 result = timedeltas 2723 else: 2724 if seen.complex_: 2725 result = complexes 2726 elif seen.float_: 2727 result = floats 2728 elif seen.int_: 2729 if seen.uint_: 2730 result = uints 2731 else: 2732 result = ints 2733 elif seen.is_bool: 2734 result = bools.view(np.bool_) 2735 2736 else: 2737 # don't cast int to float, etc. 2738 if seen.null_: 2739 if seen.is_float_or_complex: 2740 if seen.complex_: 2741 if not seen.int_: 2742 result = complexes 2743 elif seen.float_ or seen.nan_: 2744 if not seen.int_: 2745 result = floats 2746 else: 2747 if not seen.bool_: 2748 if seen.datetime_: 2749 if not seen.numeric_ and not seen.timedelta_: 2750 result = datetimes 2751 elif seen.timedelta_: 2752 if not seen.numeric_: 2753 result = timedeltas 2754 elif seen.nat_: 2755 if not seen.numeric_: 2756 if convert_datetime and convert_timedelta: 2757 dtype = dtype_if_all_nat 2758 if dtype is not None: 2759 # otherwise we keep object dtype 2760 result = _infer_all_nats( 2761 dtype, datetimes, timedeltas 2762 ) 2763 2764 elif convert_datetime: 2765 result = datetimes 2766 elif convert_timedelta: 2767 result = timedeltas 2768 else: 2769 if seen.complex_: 2770 if not seen.int_: 2771 result = complexes 2772 elif seen.float_ or seen.nan_: 2773 if not seen.int_: 2774 result = floats 2775 elif seen.int_: 2776 if seen.uint_: 2777 result = uints 2778 else: 2779 result = ints 2780 elif seen.is_bool and not seen.nan_: 2781 result = bools.view(np.bool_) 2782 2783 if result is uints or result is ints or result is floats or result is complexes: 2784 # cast to the largest itemsize when all values are NumPy scalars 2785 if itemsize_max > 0 and itemsize_max != result.dtype.itemsize: 2786 result = result.astype(result.dtype.kind + str(itemsize_max)) 2787 return result 2788 elif result is not None: 2789 return result 2790 2791 return objects 2792 2793 2794 cdef 
_infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): 2795 """ 2796 If we have all-NaT values, cast these to the given dtype. 2797 """ 2798 if cnp.PyArray_DescrCheck(dtype): 2799 # i.e. isinstance(dtype, np.dtype): 2800 if dtype == "M8[ns]": 2801 result = datetimes 2802 elif dtype == "m8[ns]": 2803 result = timedeltas 2804 else: 2805 raise ValueError(dtype) 2806 else: 2807 # ExtensionDtype 2808 cls = dtype.construct_array_type() 2809 i8vals = cnp.PyArray_EMPTY(1, datetimes.shape, cnp.NPY_INT64, 0) 2810 i8vals.fill(NPY_NAT) 2811 result = cls(i8vals, dtype=dtype) 2812 return result 2813 2814 2815 class _NoDefault(Enum): 2816 # We make this an Enum 2817 # 1) because it round-trips through pickle correctly (see GH#40397) 2818 # 2) because mypy does not understand singletons 2819 no_default = "NO_DEFAULT" 2820 2821 def __repr__(self) -> str: 2822 return "<no_default>" 2823 2824 2825 # Note: no_default is exported to the public API in pandas.api.extensions 2826 no_default = _NoDefault.no_default # Sentinel indicating the default value. 2827 NoDefault = Literal[_NoDefault.no_default] 2828 2829 2830 @cython.boundscheck(False) 2831 @cython.wraparound(False) 2832 def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True, 2833 object na_value=no_default, cnp.dtype dtype=np.dtype(object) 2834 ) -> np.ndarray: 2835 """ 2836 Substitute for np.vectorize with pandas-friendly dtype inference. 2837 2838 Parameters 2839 ---------- 2840 arr : ndarray 2841 f : function 2842 mask : ndarray 2843 uint8 dtype ndarray indicating values not to apply `f` to. 2844 convert : bool, default True 2845 Whether to call `maybe_convert_objects` on the resulting ndarray 2846 na_value : Any, optional 2847 The result value to use for masked values. By default, the 2848 input value is used 2849 dtype : numpy.dtype 2850 The numpy dtype to use for the result ndarray. 

    Returns
    -------
    np.ndarray
    """
    cdef:
        Py_ssize_t i, n
        ndarray result
        object val

    n = len(arr)
    result = np.empty(n, dtype=dtype)
    for i in range(n):
        if mask[i]:
            # masked: pass through the input (or the explicit na_value)
            if na_value is no_default:
                val = arr[i]
            else:
                val = na_value
        else:
            val = f(arr[i])

            if cnp.PyArray_IsZeroDim(val):
                # unbox 0-dim arrays, GH#690
                val = val.item()

        result[i] = val

    if convert:
        return maybe_convert_objects(result,
                                     try_float=False,
                                     convert_datetime=False,
                                     convert_timedelta=False)

    return result


@cython.boundscheck(False)
@cython.wraparound(False)
def map_infer(
    ndarray arr, object f, bint convert=True, bint ignore_na=False
) -> np.ndarray:
    """
    Substitute for np.vectorize with pandas-friendly dtype inference.

    Parameters
    ----------
    arr : ndarray
    f : function
    convert : bint
    ignore_na : bint
        If True, NA values will not have f applied

    Returns
    -------
    np.ndarray
    """
    cdef:
        Py_ssize_t i, n
        ndarray[object] result
        object val

    n = len(arr)
    result = cnp.PyArray_EMPTY(1, arr.shape, cnp.NPY_OBJECT, 0)
    for i in range(n):
        if ignore_na and checknull(arr[i]):
            # keep the NA value as-is instead of applying f
            result[i] = arr[i]
            continue
        val = f(arr[i])

        if cnp.PyArray_IsZeroDim(val):
            # unbox 0-dim arrays, GH#690
            val = val.item()

        result[i] = val

    if convert:
        return maybe_convert_objects(result,
                                     try_float=False,
                                     convert_datetime=False,
                                     convert_timedelta=False)

    return result


def to_object_array(rows: object, min_width: int = 0) -> ndarray:
    """
    Convert a list of lists into an object array.

    Parameters
    ----------
    rows : 2-d array (N, K)
        List of lists to be converted into an array.
    min_width : int
        Minimum width of the object array. If a list
        in `rows` contains fewer than `width` elements,
        the remaining elements in the corresponding row
        will all be `NaN`.

    Returns
    -------
    np.ndarray[object, ndim=2]
    """
    cdef:
        Py_ssize_t i, j, n, k, tmp
        ndarray[object, ndim=2] result
        list row

    rows = list(rows)
    n = len(rows)

    # k = width of the widest row (but at least min_width)
    k = min_width
    for i in range(n):
        tmp = len(rows[i])
        if tmp > k:
            k = tmp

    # NOTE(review): np.empty with object dtype initializes cells to None, not
    # NaN as the docstring above says — confirm which the callers rely on
    result = np.empty((n, k), dtype=object)

    for i in range(n):
        row = list(rows[i])

        for j in range(len(row)):
            result[i, j] = row[j]

    return result


def tuples_to_object_array(ndarray[object] tuples):
    # Expand an array of K-tuples into an (N, K) object array.
    # NOTE(review): k is taken from the first tuple, so this assumes every
    # tuple has at least len(tuples[0]) items and raises IndexError on empty
    # input — confirm callers guarantee this.
    cdef:
        Py_ssize_t i, j, n, k, tmp
        ndarray[object, ndim=2] result
        tuple tup

    n = len(tuples)
    k = len(tuples[0])
    result = np.empty((n, k), dtype=object)
    for i in range(n):
        tup = tuples[i]
        for j in range(k):
            result[i, j] = tup[j]

    return result


def to_object_array_tuples(rows: object) -> np.ndarray:
    """
    Convert a list of tuples into an object array. Any subclass of
    tuple in `rows` will be casted to tuple.

    Parameters
    ----------
    rows : 2-d array (N, K)
        List of tuples to be converted into an array.

    Returns
    -------
    np.ndarray[object, ndim=2]
    """
    cdef:
        Py_ssize_t i, j, n, k, tmp
        ndarray[object, ndim=2] result
        tuple row

    rows = list(rows)
    n = len(rows)

    # width = longest row; a null row counts as width 1 (it becomes a 1-tuple)
    k = 0
    for i in range(n):
        tmp = 1 if checknull(rows[i]) else len(rows[i])
        if tmp > k:
            k = tmp

    result = np.empty((n, k), dtype=object)

    try:
        for i in range(n):
            row = rows[i]
            for j in range(len(row)):
                result[i, j] = row[j]
    except TypeError:
        # e.g. "Expected tuple, got list"
        # upcast any subclasses to tuple
        for i in range(n):
            row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
            for j in range(len(row)):
                result[i, j] = row[j]

    return result


@cython.wraparound(False)
@cython.boundscheck(False)
def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
    # Vectorized dict.get over an array of keys, with dtype inference on the
    # gathered values.
    cdef:
        Py_ssize_t i, n = len(keys)
        object val
        ndarray[object] output = np.empty(n, dtype='O')

    if n == 0:
        # kludge, for Series
        return np.empty(0, dtype='f8')

    for i in range(n):
        val = keys[i]
        if val in mapping:
            output[i] = mapping[val]
        else:
            output[i] = default

    return maybe_convert_objects(output)


def is_bool_list(obj: list) -> bool:
    """
    Check if this list contains only bool or np.bool_ objects.

    This is appreciably faster than checking `np.array(obj).dtype == bool`

    obj1 = [True, False] * 100
    obj2 = obj1 * 100
    obj3 = obj2 * 100
    obj4 = [True, None] + obj1

    for obj in [obj1, obj2, obj3, obj4]:
        %timeit is_bool_list(obj)
        %timeit np.array(obj).dtype.kind == "b"

    340 ns ± 8.22 ns
    8.78 µs ± 253 ns

    28.8 µs ± 704 ns
    813 µs ± 17.8 µs

    3.4 ms ± 168 µs
    78.4 ms ± 1.05 ms

    48.1 ns ± 1.26 ns
    8.1 µs ± 198 ns
    """
    cdef:
        object item

    for item in obj:
        if not util.is_bool_object(item):
            return False

    # Note: we return True for empty list
    return True


cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
    """
    Check for `arr == key`, treating all values as not-equal to pd.NA.

    key is assumed to have `not isna(key)`
    """
    cdef:
        ndarray[uint8_t, cast=True] result = cnp.PyArray_EMPTY(arr.ndim, arr.shape, cnp.NPY_BOOL, 0)
        Py_ssize_t i
        object item

    for i in range(len(arr)):
        item = arr[i]
        if item is C_NA:
            # pd.NA compares as not-equal here (instead of propagating NA)
            result[i] = False
        else:
            result[i] = item == key

    return result


def dtypes_all_equal(list types not None) -> bool:
    """
    Faster version for:

    first = types[0]
    all(is_dtype_equal(first, t) for t in types[1:])

    And assuming all elements in the list are np.dtype/ExtensionDtype objects

    See timings at https://github.com/pandas-dev/pandas/pull/44594
    """
    first = types[0]
    for t in types[1:]:
        try:
            if not t == first:
                return False
        except (TypeError, AttributeError):
            # incomparable dtype objects count as unequal
            return False
    else:
        # for/else: runs when the loop finishes without break (there is no
        # break above, so reaching here means every comparison passed)
        return True