groupby.pyx
cimport cython
from cython cimport (
    Py_ssize_t,
    floating,
)
from libc.stdlib cimport (
    free,
    malloc,
)

import numpy as np

cimport numpy as cnp
from numpy cimport (
    complex64_t,
    complex128_t,
    float32_t,
    float64_t,
    int8_t,
    int16_t,
    int32_t,
    int64_t,
    intp_t,
    ndarray,
    uint8_t,
    uint16_t,
    uint32_t,
    uint64_t,
)
from numpy.math cimport NAN

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.algos cimport (
    get_rank_nan_fill_val,
    kth_smallest_c,
)

from pandas._libs.algos import (
    ensure_platform_int,
    groupsort_indexer,
    rank_1d,
    take_2d_axis1_float64_float64,
)

from pandas._libs.dtypes cimport (
    numeric_object_t,
    numeric_t,
)
from pandas._libs.missing cimport checknull


cdef int64_t NPY_NAT = util.get_nat()
_int64_max = np.iinfo(np.int64).max

cdef float64_t NaN = <float64_t>np.NaN

cdef enum InterpolationEnumType:
    INTERPOLATION_LINEAR,
    INTERPOLATION_LOWER,
    INTERPOLATION_HIGHER,
    INTERPOLATION_NEAREST,
    INTERPOLATION_MIDPOINT


# Median of the first `n` entries of `a`, skipping NaNs.
# Returns NaN when n == 0 or all entries are NaN.
# NOTE(review): the malloc result is not checked for NULL; on allocation
# failure the copy loop below would dereference NULL (nogil, so raising
# is not an option here) — confirm this is acceptable upstream.
cdef inline float64_t median_linear(float64_t* a, int n) nogil:
    cdef:
        int i, j, na_count = 0
        float64_t result
        float64_t* tmp

    if n == 0:
        return NaN

    # count NAs
    for i in range(n):
        if a[i] != a[i]:
            na_count += 1

    if na_count:
        if na_count == n:
            return NaN

        # compact the non-NaN values into a scratch buffer so the
        # kth_smallest_c partial sort can mutate it freely
        tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))

        j = 0
        for i in range(n):
            if a[i] == a[i]:
                tmp[j] = a[i]
                j += 1

        a = tmp
        n -= na_count

    if n % 2:
        result = kth_smallest_c(a, n // 2, n)
    else:
        # even count: average the two middle order statistics
        result = (kth_smallest_c(a, n // 2, n) +
                  kth_smallest_c(a, n // 2 - 1, n)) / 2

    if na_count:
        free(a)

    return result


@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(
    ndarray[float64_t, ndim=2] out,
    ndarray[int64_t] counts,
    ndarray[float64_t, ndim=2] values,
    ndarray[intp_t] labels,
    Py_ssize_t min_count=-1,
) -> None:
    """
    Only aggregates on axis=0
    """
    cdef:
        Py_ssize_t i, j, N, K, ngroups, size
        ndarray[intp_t] _counts
        ndarray[float64_t, ndim=2] data
        ndarray[intp_t] indexer
        float64_t* ptr

    assert min_count == -1, "'min_count' only used in sum and prod"

    ngroups = len(counts)
    N, K = (<object>values).shape

    # _counts[0] is the size of the NA group (-1 labels); real groups follow
    indexer, _counts = groupsort_indexer(labels, ngroups)
    counts[:] = _counts[1:]

    data = np.empty((K, N), dtype=np.float64)
    ptr = <float64_t*>cnp.PyArray_DATA(data)

    # gather values into group-sorted order, one row per column of `values`
    take_2d_axis1_float64_float64(values.T, indexer, out=data)

    with nogil:

        for i in range(K):
            # exclude NA group
            ptr += _counts[0]
            for j in range(ngroups):
                size = _counts[j + 1]
                out[j, i] = median_linear(ptr, size)
                ptr += size


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumprod_float64(
    float64_t[:, ::1] out,
    const float64_t[:, :] values,
    const intp_t[::1] labels,
    int ngroups,
    bint is_datetimelike,
    bint skipna=True,
) -> None:
    """
    Cumulative product of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[np.float64, ndim=2]
        Array to store cumprod in.
    values : np.ndarray[np.float64, ndim=2]
        Values to take cumprod of.
    labels : np.ndarray[np.intp]
        Labels to group by.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    is_datetimelike : bool
        Always false, `values` is never datetime-like.
    skipna : bool
        If true, ignore nans in `values`.

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        float64_t val
        float64_t[:, ::1] accum
        intp_t lab

    N, K = (<object>values).shape
    accum = np.ones((ngroups, K), dtype=np.float64)

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:
                    accum[lab, j] *= val
                    out[i, j] = accum[lab, j]
                else:
                    out[i, j] = NaN
                    if not skipna:
                        # poison the accumulator: everything after the first
                        # NaN in this group/column stays NaN
                        accum[lab, j] = NaN


ctypedef fused int64float_t:
    int64_t
    uint64_t
    float32_t
    float64_t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumsum(
    int64float_t[:, ::1] out,
    ndarray[int64float_t, ndim=2] values,
    const intp_t[::1] labels,
    int ngroups,
    bint is_datetimelike,
    bint skipna=True,
    const uint8_t[:, :] mask=None,
    uint8_t[:, ::1] result_mask=None,
) -> None:
    """
    Cumulative sum of columns of `values`, in row groups `labels`.

    Parameters
    ----------
    out : np.ndarray[ndim=2]
        Array to store cumsum in.
    values : np.ndarray[ndim=2]
        Values to take cumsum of.
    labels : np.ndarray[np.intp]
        Labels to group by.
    ngroups : int
        Number of groups, larger than all entries of `labels`.
    is_datetimelike : bool
        True if `values` contains datetime-like entries.
    skipna : bool
        If true, ignore nans in `values`.
    mask: np.ndarray[uint8], optional
        Mask of values
    result_mask: np.ndarray[int8], optional
        Mask of out array

    Notes
    -----
    This method modifies the `out` parameter, rather than returning an object.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        int64float_t val, y, t, na_val
        int64float_t[:, ::1] accum, compensation
        uint8_t[:, ::1] accum_mask
        intp_t lab
        bint isna_entry, isna_prev = False
        bint uses_mask = mask is not None

    N, K = (<object>values).shape

    if uses_mask:
        accum_mask = np.zeros((ngroups, K), dtype="uint8")

    accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
    compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

    # NaN for floats, NPY_NAT for datetimelike int64, else 0 (masked case)
    na_val = _get_na_val(<int64float_t>0, is_datetimelike)

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]

                if uses_mask:
                    isna_entry = mask[i, j]
                else:
                    isna_entry = _treat_as_na(val, is_datetimelike)

                if not skipna:
                    # once a NA has been seen in this group/column, every
                    # subsequent output is NA as well
                    if uses_mask:
                        isna_prev = accum_mask[lab, j]
                    else:
                        isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)

                    if isna_prev:
                        if uses_mask:
                            result_mask[i, j] = True
                            # Be deterministic, out was initialized as empty
                            out[i, j] = 0
                        else:
                            out[i, j] = na_val
                        continue

                if isna_entry:

                    if uses_mask:
                        result_mask[i, j] = True
                        # Be deterministic, out was initialized as empty
                        out[i, j] = 0
                    else:
                        out[i, j] = na_val

                    if not skipna:
                        if uses_mask:
                            accum_mask[lab, j] = True
                        else:
                            accum[lab, j] = na_val

                else:
                    # For floats, use Kahan summation to reduce floating-point
                    # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
                    if int64float_t == float32_t or int64float_t == float64_t:
                        y = val - compensation[lab, j]
                        t = accum[lab, j] + y
                        compensation[lab, j] = t - accum[lab, j] - y
                    else:
                        t = val + accum[lab, j]

                    accum[lab, j] = t
                    out[i, j] = t


@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(
    int64_t[::1] out,
    const intp_t[::1]
labels,
    int ngroups,
    int periods,
) -> None:
    cdef:
        Py_ssize_t N, i, j, ii, lab
        int offset = 0, sign
        int64_t idxer, idxer_slot
        int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
        int64_t[:, ::1] label_indexer

    N, = (<object>labels).shape

    if periods < 0:
        # shift backwards: walk the array in reverse with |periods|
        periods = -periods
        offset = N - 1
        sign = -1
    elif periods > 0:
        offset = 0
        sign = 1

    if periods == 0:
        with nogil:
            for i in range(N):
                out[i] = i
    else:
        # array of each previous indexer seen
        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
        with nogil:
            for i in range(N):
                # reverse iterator if shifting backwards
                ii = offset + sign * i
                lab = labels[ii]

                # Skip null keys
                if lab == -1:
                    out[ii] = -1
                    continue

                label_seen[lab] += 1

                # ring buffer of the last `periods` positions per group
                idxer_slot = label_seen[lab] % periods
                idxer = label_indexer[lab, idxer_slot]

                if label_seen[lab] > periods:
                    out[ii] = idxer
                else:
                    # fewer than `periods` prior rows in this group
                    out[ii] = -1

                label_indexer[lab, idxer_slot] = ii


@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(
    ndarray[intp_t] out,
    ndarray[intp_t] labels,
    ndarray[intp_t] sorted_labels,
    ndarray[uint8_t] mask,
    str direction,
    int64_t limit,
    bint dropna,
) -> None:
    """
    Indexes how to fill values forwards or backwards within a group.

    Parameters
    ----------
    out : np.ndarray[np.intp]
        Values into which this method will write its results.
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`.
    sorted_labels : np.ndarray[np.intp]
        obtained by `np.argsort(labels, kind="mergesort")`; reversed if
        direction == "bfill"
    mask : np.ndarray[np.uint8]
        Indicating whether a value is na or not.
    direction : {'ffill', 'bfill'}
        Direction for fill to be applied (forwards or backwards, respectively)
    limit : Consecutive values to fill before stopping, or -1 for no limit
    dropna : Flag to indicate if NaN groups should return all NaN values

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object
    """
    cdef:
        Py_ssize_t i, N, idx
        intp_t curr_fill_idx=-1
        int64_t filled_vals = 0

    N = len(out)

    # Make sure all arrays are the same size
    assert N == len(labels) == len(mask)

    with nogil:
        for i in range(N):
            idx = sorted_labels[i]
            if dropna and labels[idx] == -1:  # nan-group gets nan-values
                curr_fill_idx = -1
            elif mask[idx] == 1:  # is missing
                # Stop filling once we've hit the limit
                if filled_vals >= limit and limit != -1:
                    curr_fill_idx = -1
                filled_vals += 1
            else:  # reset items when not missing
                filled_vals = 0
                curr_fill_idx = idx

            out[idx] = curr_fill_idx

            # If we move to the next group, reset
            # the fill_idx and counter
            if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
                curr_fill_idx = -1
                filled_vals = 0


@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(
    int8_t[:, ::1] out,
    const int8_t[:, :] values,
    const intp_t[::1] labels,
    const uint8_t[:, :] mask,
    str val_test,
    bint skipna,
    bint nullable,
) -> None:
    """
    Aggregated boolean values to show truthfulness of group elements. If the
    input is a nullable type (nullable=True), the result will be computed
    using Kleene logic.

    Parameters
    ----------
    out : np.ndarray[np.int8]
        Values into which this method will write its results.
    labels : np.ndarray[np.intp]
        Array containing unique label for each group, with its
        ordering matching up to the corresponding record in `values`
    values : np.ndarray[np.int8]
        Containing the truth value of each element.
    mask : np.ndarray[np.uint8]
        Indicating whether a value is na or not.
    val_test : {'any', 'all'}
        String object dictating whether to use any or all truth testing
    skipna : bool
        Flag to ignore nan values during truth testing
    nullable : bool
        Whether or not the input is a nullable type. If True, the
        result will be computed using Kleene logic

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object.
    The returned values will either be 0, 1 (False or True, respectively), or
    -1 to signify a masked position in the case of a nullable input.
    """
    cdef:
        Py_ssize_t i, j, N = len(labels), K = out.shape[1]
        intp_t lab
        int8_t flag_val, val

    if val_test == 'all':
        # Because the 'all' value of an empty iterable in Python is True we can
        # start with an array full of ones and set to zero when a False value
        # is encountered
        flag_val = 0
    elif val_test == 'any':
        # Because the 'any' value of an empty iterable in Python is False we
        # can start with an array full of zeros and set to one only if any
        # value encountered is True
        flag_val = 1
    else:
        raise ValueError("'bool_func' must be either 'any' or 'all'!")

    out[:] = 1 - flag_val

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            for j in range(K):
                if skipna and mask[i, j]:
                    continue

                if nullable and mask[i, j]:
                    # Set the position as masked if `out[lab] != flag_val`, which
                    # would indicate True/False has not yet been seen for any/all,
                    # so by Kleene logic the result is currently unknown
                    if out[lab, j] != flag_val:
                        out[lab, j] = -1
                    continue

                val = values[i, j]

                # If True and 'any' or False and 'all', the result is
                # already determined
                if val == flag_val:
                    out[lab, j] = flag_val


# ----------------------------------------------------------------------
# group_sum, group_prod, group_var, group_mean, group_ohlc
# ----------------------------------------------------------------------

ctypedef fused mean_t:
    float64_t
    float32_t
    complex64_t
    complex128_t

ctypedef fused sum_t:
    mean_t
    int64_t
    uint64_t
    object


@cython.wraparound(False)
@cython.boundscheck(False)
def group_sum(
    sum_t[:, ::1] out,
    int64_t[::1] counts,
    ndarray[sum_t, ndim=2] values,
    const intp_t[::1] labels,
    const uint8_t[:, :] mask,
    uint8_t[:, ::1] result_mask=None,
    Py_ssize_t min_count=0,
    bint is_datetimelike=False,
) -> None:
    """
    Only aggregates on axis=0 using Kahan summation
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        sum_t val, t, y
        sum_t[:, ::1] sumx, compensation
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)
        bint uses_mask = mask is not None
        bint isna_entry

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    # the below is equivalent to `np.zeros_like(out)` but faster
    sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
    compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    if sum_t is object:
        # NB: this does not use 'compensation' like the non-object track does.
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1
            for j in range(K):
                val = values[i, j]

                # not nan
                if not checknull(val):
                    nobs[lab, j] += 1

                    if nobs[lab, j] == 1:
                        # i.e.
we haven't added anything yet; avoid TypeError 612 # if e.g. val is a str and sumx[lab, j] is 0 613 t = val 614 else: 615 t = sumx[lab, j] + val 616 sumx[lab, j] = t 617 618 for i in range(ncounts): 619 for j in range(K): 620 if nobs[i, j] < min_count: 621 out[i, j] = None 622 623 else: 624 out[i, j] = sumx[i, j] 625 else: 626 with nogil: 627 for i in range(N): 628 lab = labels[i] 629 if lab < 0: 630 continue 631 632 counts[lab] += 1 633 for j in range(K): 634 val = values[i, j] 635 636 # not nan 637 # With dt64/td64 values, values have been cast to float64 638 # instead if int64 for group_sum, but the logic 639 # is otherwise the same as in _treat_as_na 640 if uses_mask: 641 isna_entry = mask[i, j] 642 elif (sum_t is float32_t or sum_t is float64_t 643 or sum_t is complex64_t or sum_t is complex64_t): 644 # avoid warnings because of equality comparison 645 isna_entry = not val == val 646 elif sum_t is int64_t and is_datetimelike and val == NPY_NAT: 647 isna_entry = True 648 else: 649 isna_entry = False 650 651 if not isna_entry: 652 nobs[lab, j] += 1 653 y = val - compensation[lab, j] 654 t = sumx[lab, j] + y 655 compensation[lab, j] = t - sumx[lab, j] - y 656 sumx[lab, j] = t 657 658 for i in range(ncounts): 659 for j in range(K): 660 if nobs[i, j] < min_count: 661 # if we are integer dtype, not is_datetimelike, and 662 # not uses_mask, then getting here implies that 663 # counts[i] < min_count, which means we will 664 # be cast to float64 and masked at the end 665 # of WrappedCythonOp._call_cython_op. So we can safely 666 # set a placeholder value in out[i, j]. 
667 if uses_mask: 668 result_mask[i, j] = True 669 elif (sum_t is float32_t or sum_t is float64_t 670 or sum_t is complex64_t or sum_t is complex64_t): 671 out[i, j] = NAN 672 elif sum_t is int64_t: 673 out[i, j] = NPY_NAT 674 else: 675 # placeholder, see above 676 out[i, j] = 0 677 678 else: 679 out[i, j] = sumx[i, j] 680 681 682 @cython.wraparound(False) 683 @cython.boundscheck(False) 684 def group_prod( 685 int64float_t[:, ::1] out, 686 int64_t[::1] counts, 687 ndarray[int64float_t, ndim=2] values, 688 const intp_t[::1] labels, 689 const uint8_t[:, ::1] mask, 690 uint8_t[:, ::1] result_mask=None, 691 Py_ssize_t min_count=0, 692 ) -> None: 693 """ 694 Only aggregates on axis=0 695 """ 696 cdef: 697 Py_ssize_t i, j, N, K, lab, ncounts = len(counts) 698 int64float_t val, count 699 int64float_t[:, ::1] prodx 700 int64_t[:, ::1] nobs 701 Py_ssize_t len_values = len(values), len_labels = len(labels) 702 bint isna_entry, uses_mask = mask is not None 703 704 if len_values != len_labels: 705 raise ValueError("len(index) != len(labels)") 706 707 nobs = np.zeros((<object>out).shape, dtype=np.int64) 708 prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype) 709 710 N, K = (<object>values).shape 711 712 with nogil: 713 for i in range(N): 714 lab = labels[i] 715 if lab < 0: 716 continue 717 718 counts[lab] += 1 719 for j in range(K): 720 val = values[i, j] 721 722 if uses_mask: 723 isna_entry = mask[i, j] 724 elif int64float_t is float32_t or int64float_t is float64_t: 725 isna_entry = not val == val 726 else: 727 isna_entry = False 728 729 if not isna_entry: 730 nobs[lab, j] += 1 731 prodx[lab, j] *= val 732 733 for i in range(ncounts): 734 for j in range(K): 735 if nobs[i, j] < min_count: 736 737 # else case is not possible 738 if uses_mask: 739 result_mask[i, j] = True 740 # Be deterministic, out was initialized as empty 741 out[i, j] = 0 742 elif int64float_t is float32_t or int64float_t is float64_t: 743 out[i, j] = NAN 744 else: 745 # we only get here when 
                        # (counts) < min_count, which gets handled later
                        pass

                else:
                    out[i, j] = prodx[i, j]


@cython.wraparound(False)
@cython.boundscheck(False)
@cython.cdivision(True)
def group_var(
    floating[:, ::1] out,
    int64_t[::1] counts,
    ndarray[floating, ndim=2] values,
    const intp_t[::1] labels,
    Py_ssize_t min_count=-1,
    int64_t ddof=1,
) -> None:
    """
    Compute the variance per group along axis=0, skipping NaN values.

    Uses Welford's streaming algorithm: for each (group, column) a running
    mean is maintained and the sum of squared deviations is accumulated in
    `out`, which is divided by (count - ddof) at the end. Groups with
    count <= ddof get NaN. Modifies `out` in place; `counts` is updated
    with group sizes.
    """
    cdef:
        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
        floating val, ct, oldmean
        floating[:, ::1] mean
        int64_t[:, ::1] nobs
        Py_ssize_t len_values = len(values), len_labels = len(labels)

    assert min_count == -1, "'min_count' only used in sum and prod"

    if len_values != len_labels:
        raise ValueError("len(index) != len(labels)")

    nobs = np.zeros((<object>out).shape, dtype=np.int64)
    mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)

    N, K = (<object>values).shape

    out[:, :] = 0.0

    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue

            counts[lab] += 1

            for j in range(K):
                val = values[i, j]

                # not nan
                if val == val:
                    # Welford update: running mean plus M2 accumulation
                    nobs[lab, j] += 1
                    oldmean = mean[lab, j]
                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)

        for i in range(ncounts):
            for j in range(K):
                ct = nobs[i, j]
                if ct <= ddof:
                    out[i, j] = NAN
                else:
                    out[i, j] /= (ct - ddof)


@cython.wraparound(False)
@cython.boundscheck(False)
def group_mean(
    mean_t[:, ::1] out,
    int64_t[::1] counts,
    ndarray[mean_t, ndim=2] values,
    const intp_t[::1] labels,
    Py_ssize_t min_count=-1,
    bint is_datetimelike=False,
    const uint8_t[:, ::1] mask=None,
    uint8_t[:, ::1] result_mask=None,
) -> None:
    """
    Compute the mean per label given a label assignment for each value.
    NaN values are ignored.
824 825 Parameters 826 ---------- 827 out : np.ndarray[floating] 828 Values into which this method will write its results. 829 counts : np.ndarray[int64] 830 A zeroed array of the same shape as labels, 831 populated by group sizes during algorithm. 832 values : np.ndarray[floating] 833 2-d array of the values to find the mean of. 834 labels : np.ndarray[np.intp] 835 Array containing unique label for each group, with its 836 ordering matching up to the corresponding record in `values`. 837 min_count : Py_ssize_t 838 Only used in sum and prod. Always -1. 839 is_datetimelike : bool 840 True if `values` contains datetime-like entries. 841 mask : ndarray[bool, ndim=2], optional 842 Not used. 843 result_mask : ndarray[bool, ndim=2], optional 844 Not used. 845 846 Notes 847 ----- 848 This method modifies the `out` parameter rather than returning an object. 849 `counts` is modified to hold group sizes 850 """ 851 852 cdef: 853 Py_ssize_t i, j, N, K, lab, ncounts = len(counts) 854 mean_t val, count, y, t, nan_val 855 mean_t[:, ::1] sumx, compensation 856 int64_t[:, ::1] nobs 857 Py_ssize_t len_values = len(values), len_labels = len(labels) 858 859 assert min_count == -1, "'min_count' only used in sum and prod" 860 861 if len_values != len_labels: 862 raise ValueError("len(index) != len(labels)") 863 864 # the below is equivalent to `np.zeros_like(out)` but faster 865 nobs = np.zeros((<object>out).shape, dtype=np.int64) 866 sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype) 867 compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype) 868 869 N, K = (<object>values).shape 870 nan_val = NPY_NAT if is_datetimelike else NAN 871 872 with nogil: 873 for i in range(N): 874 lab = labels[i] 875 if lab < 0: 876 continue 877 878 counts[lab] += 1 879 for j in range(K): 880 val = values[i, j] 881 # not nan 882 if val == val and not (is_datetimelike and val == NPY_NAT): 883 nobs[lab, j] += 1 884 y = val - compensation[lab, j] 885 t = sumx[lab, j] + y 
886 compensation[lab, j] = t - sumx[lab, j] - y 887 sumx[lab, j] = t 888 889 for i in range(ncounts): 890 for j in range(K): 891 count = nobs[i, j] 892 if nobs[i, j] == 0: 893 out[i, j] = nan_val 894 else: 895 out[i, j] = sumx[i, j] / count 896 897 898 @cython.wraparound(False) 899 @cython.boundscheck(False) 900 def group_ohlc( 901 int64float_t[:, ::1] out, 902 int64_t[::1] counts, 903 ndarray[int64float_t, ndim=2] values, 904 const intp_t[::1] labels, 905 Py_ssize_t min_count=-1, 906 const uint8_t[:, ::1] mask=None, 907 uint8_t[:, ::1] result_mask=None, 908 ) -> None: 909 """ 910 Only aggregates on axis=0 911 """ 912 cdef: 913 Py_ssize_t i, j, N, K, lab 914 int64float_t val 915 uint8_t[::1] first_element_set 916 bint isna_entry, uses_mask = not mask is None 917 918 assert min_count == -1, "'min_count' only used in sum and prod" 919 920 if len(labels) == 0: 921 return 922 923 N, K = (<object>values).shape 924 925 if out.shape[1] != 4: 926 raise ValueError('Output array must have 4 columns') 927 928 if K > 1: 929 raise NotImplementedError("Argument 'values' must have only one dimension") 930 931 if int64float_t is float32_t or int64float_t is float64_t: 932 out[:] = np.nan 933 else: 934 out[:] = 0 935 936 first_element_set = np.zeros((<object>counts).shape, dtype=np.uint8) 937 if uses_mask: 938 result_mask[:] = True 939 940 with nogil: 941 for i in range(N): 942 lab = labels[i] 943 if lab == -1: 944 continue 945 946 counts[lab] += 1 947 val = values[i, 0] 948 949 if uses_mask: 950 isna_entry = mask[i, 0] 951 elif int64float_t is float32_t or int64float_t is float64_t: 952 isna_entry = val != val 953 else: 954 isna_entry = False 955 956 if isna_entry: 957 continue 958 959 if not first_element_set[lab]: 960 out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val 961 first_element_set[lab] = True 962 if uses_mask: 963 result_mask[lab] = False 964 else: 965 out[lab, 1] = max(out[lab, 1], val) 966 out[lab, 2] = min(out[lab, 2], val) 967 out[lab, 3] = val 968 969 970 
@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(
    ndarray[float64_t, ndim=2] out,
    ndarray[numeric_t, ndim=1] values,
    ndarray[intp_t] labels,
    ndarray[uint8_t] mask,
    const intp_t[:] sort_indexer,
    const float64_t[:] qs,
    str interpolation,
) -> None:
    """
    Calculate the quantile per group.

    Parameters
    ----------
    out : np.ndarray[np.float64, ndim=2]
        Array of aggregated values that will be written to.
    values : np.ndarray
        Array containing the values to apply the function against.
    labels : ndarray[np.intp]
        Array containing the group label for each row; -1 marks the NA group.
    mask : ndarray[np.uint8]
        1 where the corresponding entry of `values` is NA, 0 otherwise.
    sort_indexer : ndarray[np.intp]
        Indices describing sort order by values and labels.
    qs : ndarray[float64_t]
        The quantile values to search for.
    interpolation : {'linear', 'lower', 'higher', 'nearest', 'midpoint'}

    Notes
    -----
    Rather than explicitly returning a value, this function modifies the
    provided `out` parameter.
    """
    cdef:
        Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz, k, nqs
        Py_ssize_t grp_start=0, idx=0
        intp_t lab
        InterpolationEnumType interp
        float64_t q_val, q_idx, frac, val, next_val
        int64_t[::1] counts, non_na_counts

    assert values.shape[0] == N

    if any(not (0 <= q <= 1) for q in qs):
        wrong = [x for x in qs if not (0 <= x <= 1)][0]
        raise ValueError(
            f"Each 'q' must be between 0 and 1. Got '{wrong}' instead"
        )

    inter_methods = {
        'linear': INTERPOLATION_LINEAR,
        'lower': INTERPOLATION_LOWER,
        'higher': INTERPOLATION_HIGHER,
        'nearest': INTERPOLATION_NEAREST,
        'midpoint': INTERPOLATION_MIDPOINT,
    }
    interp = inter_methods[interpolation]

    nqs = len(qs)
    ngroups = len(out)
    counts = np.zeros(ngroups, dtype=np.int64)
    non_na_counts = np.zeros(ngroups, dtype=np.int64)

    # First figure out the size of every group
    with nogil:
        for i in range(N):
            lab = labels[i]
            if lab == -1:  # NA group label
                continue

            counts[lab] += 1
            if not mask[i]:
                non_na_counts[lab] += 1

    with nogil:
        for i in range(ngroups):
            # Figure out how many group elements there are
            grp_sz = counts[i]
            non_na_sz = non_na_counts[i]

            if non_na_sz == 0:
                for k in range(nqs):
                    out[i, k] = NaN
            else:
                for k in range(nqs):
                    q_val = qs[k]

                    # Calculate where to retrieve the desired value
                    # Casting to int will intentionally truncate result
                    idx = grp_start + <int64_t>(q_val * <float64_t>(non_na_sz - 1))

                    val = values[sort_indexer[idx]]
                    # If requested quantile falls evenly on a particular index
                    # then write that index's value out. Otherwise interpolate
                    q_idx = q_val * (non_na_sz - 1)
                    frac = q_idx % 1

                    if frac == 0.0 or interp == INTERPOLATION_LOWER:
                        out[i, k] = val
                    else:
                        next_val = values[sort_indexer[idx + 1]]
                        if interp == INTERPOLATION_LINEAR:
                            out[i, k] = val + (next_val - val) * frac
                        elif interp == INTERPOLATION_HIGHER:
                            out[i, k] = next_val
                        elif interp == INTERPOLATION_MIDPOINT:
                            out[i, k] = (val + next_val) / 2.0
                        elif interp == INTERPOLATION_NEAREST:
                            if frac > .5 or (frac == .5 and q_val > .5):  # Always OK?
1079 out[i, k] = next_val 1080 else: 1081 out[i, k] = val 1082 1083 # Increment the index reference in sorted_arr for the next group 1084 grp_start += grp_sz 1085 1086 1087 # ---------------------------------------------------------------------- 1088 # group_nth, group_last, group_rank 1089 # ---------------------------------------------------------------------- 1090 1091 cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: 1092 if numeric_object_t is object: 1093 # Should never be used, but we need to avoid the `val != val` below 1094 # or else cython will raise about gil acquisition. 1095 raise NotImplementedError 1096 1097 elif numeric_object_t is int64_t: 1098 return is_datetimelike and val == NPY_NAT 1099 elif numeric_object_t is float32_t or numeric_object_t is float64_t: 1100 return val != val 1101 else: 1102 # non-datetimelike integer 1103 return False 1104 1105 1106 cdef numeric_object_t _get_min_or_max(numeric_object_t val, bint compute_max, bint is_datetimelike): 1107 """ 1108 Find either the min or the max supported by numeric_object_t; 'val' is a 1109 placeholder to effectively make numeric_object_t an argument. 
1110 """ 1111 return get_rank_nan_fill_val( 1112 not compute_max, 1113 val=val, 1114 is_datetimelike=is_datetimelike, 1115 ) 1116 1117 1118 cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): 1119 cdef: 1120 numeric_t na_val 1121 1122 if numeric_t == float32_t or numeric_t == float64_t: 1123 na_val = NaN 1124 elif numeric_t is int64_t and is_datetimelike: 1125 na_val = NPY_NAT 1126 else: 1127 # Used in case of masks 1128 na_val = 0 1129 return na_val 1130 1131 1132 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can 1133 # use `const numeric_object_t[:, :] values` 1134 @cython.wraparound(False) 1135 @cython.boundscheck(False) 1136 def group_last( 1137 numeric_object_t[:, ::1] out, 1138 int64_t[::1] counts, 1139 ndarray[numeric_object_t, ndim=2] values, 1140 const intp_t[::1] labels, 1141 const uint8_t[:, :] mask, 1142 uint8_t[:, ::1] result_mask=None, 1143 Py_ssize_t min_count=-1, 1144 bint is_datetimelike=False, 1145 ) -> None: 1146 """ 1147 Only aggregates on axis=0 1148 """ 1149 cdef: 1150 Py_ssize_t i, j, N, K, lab, ncounts = len(counts) 1151 numeric_object_t val 1152 ndarray[numeric_object_t, ndim=2] resx 1153 ndarray[int64_t, ndim=2] nobs 1154 bint uses_mask = mask is not None 1155 bint isna_entry 1156 1157 # TODO(cython3): 1158 # Instead of `labels.shape[0]` use `len(labels)` 1159 if not len(values) == labels.shape[0]: 1160 raise AssertionError("len(index) != len(labels)") 1161 1162 min_count = max(min_count, 1) 1163 nobs = np.zeros((<object>out).shape, dtype=np.int64) 1164 if numeric_object_t is object: 1165 resx = np.empty((<object>out).shape, dtype=object) 1166 else: 1167 resx = np.empty_like(out) 1168 1169 N, K = (<object>values).shape 1170 1171 if numeric_object_t is object: 1172 # TODO(cython3): De-duplicate once conditional-nogil is available 1173 for i in range(N): 1174 lab = labels[i] 1175 if lab < 0: 1176 continue 1177 1178 counts[lab] += 1 1179 for j in range(K): 1180 val = values[i, j] 1181 1182 if 
uses_mask: 1183 isna_entry = mask[i, j] 1184 else: 1185 isna_entry = checknull(val) 1186 1187 if not isna_entry: 1188 # NB: use _treat_as_na here once 1189 # conditional-nogil is available. 1190 nobs[lab, j] += 1 1191 resx[lab, j] = val 1192 1193 for i in range(ncounts): 1194 for j in range(K): 1195 if nobs[i, j] < min_count: 1196 out[i, j] = None 1197 else: 1198 out[i, j] = resx[i, j] 1199 else: 1200 with nogil: 1201 for i in range(N): 1202 lab = labels[i] 1203 if lab < 0: 1204 continue 1205 1206 counts[lab] += 1 1207 for j in range(K): 1208 val = values[i, j] 1209 1210 if uses_mask: 1211 isna_entry = mask[i, j] 1212 else: 1213 isna_entry = _treat_as_na(val, is_datetimelike) 1214 1215 if not isna_entry: 1216 nobs[lab, j] += 1 1217 resx[lab, j] = val 1218 1219 for i in range(ncounts): 1220 for j in range(K): 1221 # TODO(cython3): the entire next block can be shared 1222 # across 3 places once conditional-nogil is available 1223 if nobs[i, j] < min_count: 1224 # if we are integer dtype, not is_datetimelike, and 1225 # not uses_mask, then getting here implies that 1226 # counts[i] < min_count, which means we will 1227 # be cast to float64 and masked at the end 1228 # of WrappedCythonOp._call_cython_op. So we can safely 1229 # set a placeholder value in out[i, j]. 1230 if uses_mask: 1231 result_mask[i, j] = True 1232 elif numeric_object_t is float32_t or numeric_object_t is float64_t: 1233 out[i, j] = NAN 1234 elif numeric_object_t is int64_t: 1235 # Per above, this is a placeholder in 1236 # non-is_datetimelike cases. 
1237 out[i, j] = NPY_NAT 1238 else: 1239 # placeholder, see above 1240 out[i, j] = 0 1241 else: 1242 out[i, j] = resx[i, j] 1243 1244 1245 # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can 1246 # use `const numeric_object_t[:, :] values` 1247 @cython.wraparound(False) 1248 @cython.boundscheck(False) 1249 def group_nth( 1250 numeric_object_t[:, ::1] out, 1251 int64_t[::1] counts, 1252 ndarray[numeric_object_t, ndim=2] values, 1253 const intp_t[::1] labels, 1254 const uint8_t[:, :] mask, 1255 uint8_t[:, ::1] result_mask=None, 1256 int64_t min_count=-1, 1257 int64_t rank=1, 1258 bint is_datetimelike=False, 1259 ) -> None: 1260 """ 1261 Only aggregates on axis=0 1262 """ 1263 cdef: 1264 Py_ssize_t i, j, N, K, lab, ncounts = len(counts) 1265 numeric_object_t val 1266 ndarray[numeric_object_t, ndim=2] resx 1267 ndarray[int64_t, ndim=2] nobs 1268 bint uses_mask = mask is not None 1269 bint isna_entry 1270 1271 # TODO(cython3): 1272 # Instead of `labels.shape[0]` use `len(labels)` 1273 if not len(values) == labels.shape[0]: 1274 raise AssertionError("len(index) != len(labels)") 1275 1276 min_count = max(min_count, 1) 1277 nobs = np.zeros((<object>out).shape, dtype=np.int64) 1278 if numeric_object_t is object: 1279 resx = np.empty((<object>out).shape, dtype=object) 1280 else: 1281 resx = np.empty_like(out) 1282 1283 N, K = (<object>values).shape 1284 1285 if numeric_object_t is object: 1286 # TODO(cython3): De-duplicate once conditional-nogil is available 1287 for i in range(N): 1288 lab = labels[i] 1289 if lab < 0: 1290 continue 1291 1292 counts[lab] += 1 1293 for j in range(K): 1294 val = values[i, j] 1295 1296 if uses_mask: 1297 isna_entry = mask[i, j] 1298 else: 1299 isna_entry = checknull(val) 1300 1301 if not isna_entry: 1302 # NB: use _treat_as_na here once 1303 # conditional-nogil is available. 
1304 nobs[lab, j] += 1 1305 if nobs[lab, j] == rank: 1306 resx[lab, j] = val 1307 1308 for i in range(ncounts): 1309 for j in range(K): 1310 if nobs[i, j] < min_count: 1311 out[i, j] = None 1312 else: 1313 out[i, j] = resx[i, j] 1314 1315 else: 1316 with nogil: 1317 for i in range(N): 1318 lab = labels[i] 1319 if lab < 0: 1320 continue 1321 1322 counts[lab] += 1 1323 for j in range(K): 1324 val = values[i, j] 1325 1326 if uses_mask: 1327 isna_entry = mask[i, j] 1328 else: 1329 isna_entry = _treat_as_na(val, is_datetimelike) 1330 1331 if not isna_entry: 1332 nobs[lab, j] += 1 1333 if nobs[lab, j] == rank: 1334 resx[lab, j] = val 1335 1336 # TODO: de-dup this whole block with group_last? 1337 for i in range(ncounts): 1338 for j in range(K): 1339 if nobs[i, j] < min_count: 1340 # if we are integer dtype, not is_datetimelike, and 1341 # not uses_mask, then getting here implies that 1342 # counts[i] < min_count, which means we will 1343 # be cast to float64 and masked at the end 1344 # of WrappedCythonOp._call_cython_op. So we can safely 1345 # set a placeholder value in out[i, j]. 1346 if uses_mask: 1347 result_mask[i, j] = True 1348 # set out[i, j] to 0 to be deterministic, as 1349 # it was initialized with np.empty. Also ensures 1350 # we can downcast out if appropriate. 1351 out[i, j] = 0 1352 elif numeric_object_t is float32_t or numeric_object_t is float64_t: 1353 out[i, j] = NAN 1354 elif numeric_object_t is int64_t: 1355 # Per above, this is a placeholder in 1356 # non-is_datetimelike cases. 
1357 out[i, j] = NPY_NAT 1358 else: 1359 # placeholder, see above 1360 out[i, j] = 0 1361 1362 else: 1363 out[i, j] = resx[i, j] 1364 1365 1366 @cython.boundscheck(False) 1367 @cython.wraparound(False) 1368 def group_rank( 1369 float64_t[:, ::1] out, 1370 ndarray[numeric_object_t, ndim=2] values, 1371 const intp_t[::1] labels, 1372 int ngroups, 1373 bint is_datetimelike, 1374 str ties_method="average", 1375 bint ascending=True, 1376 bint pct=False, 1377 str na_option="keep", 1378 const uint8_t[:, :] mask=None, 1379 ) -> None: 1380 """ 1381 Provides the rank of values within each group. 1382 1383 Parameters 1384 ---------- 1385 out : np.ndarray[np.float64, ndim=2] 1386 Values to which this method will write its results. 1387 values : np.ndarray of numeric_object_t values to be ranked 1388 labels : np.ndarray[np.intp] 1389 Array containing unique label for each group, with its ordering 1390 matching up to the corresponding record in `values` 1391 ngroups : int 1392 This parameter is not used, is needed to match signatures of other 1393 groupby functions. 1394 is_datetimelike : bool 1395 True if `values` contains datetime-like entries. 
1396 ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' 1397 * average: average rank of group 1398 * min: lowest rank in group 1399 * max: highest rank in group 1400 * first: ranks assigned in order they appear in the array 1401 * dense: like 'min', but rank always increases by 1 between groups 1402 ascending : bool, default True 1403 False for ranks by high (1) to low (N) 1404 na_option : {'keep', 'top', 'bottom'}, default 'keep' 1405 pct : bool, default False 1406 Compute percentage rank of data within each group 1407 na_option : {'keep', 'top', 'bottom'}, default 'keep' 1408 * keep: leave NA values where they are 1409 * top: smallest rank if ascending 1410 * bottom: smallest rank if descending 1411 mask : np.ndarray[bool] or None, default None 1412 1413 Notes 1414 ----- 1415 This method modifies the `out` parameter rather than returning an object 1416 """ 1417 cdef: 1418 Py_ssize_t i, k, N 1419 ndarray[float64_t, ndim=1] result 1420 const uint8_t[:] sub_mask 1421 1422 N = values.shape[1] 1423 1424 for k in range(N): 1425 if mask is None: 1426 sub_mask = None 1427 else: 1428 sub_mask = mask[:, k] 1429 1430 result = rank_1d( 1431 values=values[:, k], 1432 labels=labels, 1433 is_datetimelike=is_datetimelike, 1434 ties_method=ties_method, 1435 ascending=ascending, 1436 pct=pct, 1437 na_option=na_option, 1438 mask=sub_mask, 1439 ) 1440 for i in range(len(result)): 1441 if labels[i] >= 0: 1442 out[i, k] = result[i] 1443 1444 1445 # ---------------------------------------------------------------------- 1446 # group_min, group_max 1447 # ---------------------------------------------------------------------- 1448 1449 1450 @cython.wraparound(False) 1451 @cython.boundscheck(False) 1452 cdef group_min_max( 1453 numeric_t[:, ::1] out, 1454 int64_t[::1] counts, 1455 ndarray[numeric_t, ndim=2] values, 1456 const intp_t[::1] labels, 1457 Py_ssize_t min_count=-1, 1458 bint is_datetimelike=False, 1459 bint compute_max=True, 1460 const uint8_t[:, ::1] 
mask=None, 1461 uint8_t[:, ::1] result_mask=None, 1462 ): 1463 """ 1464 Compute minimum/maximum of columns of `values`, in row groups `labels`. 1465 1466 Parameters 1467 ---------- 1468 out : np.ndarray[numeric_t, ndim=2] 1469 Array to store result in. 1470 counts : np.ndarray[int64] 1471 Input as a zeroed array, populated by group sizes during algorithm 1472 values : array 1473 Values to find column-wise min/max of. 1474 labels : np.ndarray[np.intp] 1475 Labels to group by. 1476 min_count : Py_ssize_t, default -1 1477 The minimum number of non-NA group elements, NA result if threshold 1478 is not met 1479 is_datetimelike : bool 1480 True if `values` contains datetime-like entries. 1481 compute_max : bint, default True 1482 True to compute group-wise max, False to compute min 1483 mask : ndarray[bool, ndim=2], optional 1484 If not None, indices represent missing values, 1485 otherwise the mask will not be used 1486 result_mask : ndarray[bool, ndim=2], optional 1487 If not None, these specify locations in the output that are NA. 1488 Modified in-place. 1489 1490 Notes 1491 ----- 1492 This method modifies the `out` parameter, rather than returning an object. 
1493 `counts` is modified to hold group sizes 1494 """ 1495 cdef: 1496 Py_ssize_t i, j, N, K, lab, ngroups = len(counts) 1497 numeric_t val 1498 ndarray[numeric_t, ndim=2] group_min_or_max 1499 int64_t[:, ::1] nobs 1500 bint uses_mask = mask is not None 1501 bint isna_entry 1502 1503 # TODO(cython3): 1504 # Instead of `labels.shape[0]` use `len(labels)` 1505 if not len(values) == labels.shape[0]: 1506 raise AssertionError("len(index) != len(labels)") 1507 1508 min_count = max(min_count, 1) 1509 nobs = np.zeros((<object>out).shape, dtype=np.int64) 1510 1511 group_min_or_max = np.empty_like(out) 1512 group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike) 1513 1514 N, K = (<object>values).shape 1515 1516 with nogil: 1517 for i in range(N): 1518 lab = labels[i] 1519 if lab < 0: 1520 continue 1521 1522 counts[lab] += 1 1523 for j in range(K): 1524 val = values[i, j] 1525 1526 if uses_mask: 1527 isna_entry = mask[i, j] 1528 else: 1529 isna_entry = _treat_as_na(val, is_datetimelike) 1530 1531 if not isna_entry: 1532 nobs[lab, j] += 1 1533 if compute_max: 1534 if val > group_min_or_max[lab, j]: 1535 group_min_or_max[lab, j] = val 1536 else: 1537 if val < group_min_or_max[lab, j]: 1538 group_min_or_max[lab, j] = val 1539 1540 for i in range(ngroups): 1541 for j in range(K): 1542 if nobs[i, j] < min_count: 1543 # if we are integer dtype, not is_datetimelike, and 1544 # not uses_mask, then getting here implies that 1545 # counts[i] < min_count, which means we will 1546 # be cast to float64 and masked at the end 1547 # of WrappedCythonOp._call_cython_op. So we can safely 1548 # set a placeholder value in out[i, j]. 1549 if uses_mask: 1550 result_mask[i, j] = True 1551 # set out[i, j] to 0 to be deterministic, as 1552 # it was initialized with np.empty. Also ensures 1553 # we can downcast out if appropriate. 
1554 out[i, j] = 0 1555 elif numeric_t is float32_t or numeric_t is float64_t: 1556 out[i, j] = NAN 1557 elif numeric_t is int64_t: 1558 # Per above, this is a placeholder in 1559 # non-is_datetimelike cases. 1560 out[i, j] = NPY_NAT 1561 else: 1562 # placeholder, see above 1563 out[i, j] = 0 1564 else: 1565 out[i, j] = group_min_or_max[i, j] 1566 1567 1568 @cython.wraparound(False) 1569 @cython.boundscheck(False) 1570 def group_max( 1571 numeric_t[:, ::1] out, 1572 int64_t[::1] counts, 1573 ndarray[numeric_t, ndim=2] values, 1574 const intp_t[::1] labels, 1575 Py_ssize_t min_count=-1, 1576 bint is_datetimelike=False, 1577 const uint8_t[:, ::1] mask=None, 1578 uint8_t[:, ::1] result_mask=None, 1579 ) -> None: 1580 """See group_min_max.__doc__""" 1581 group_min_max( 1582 out, 1583 counts, 1584 values, 1585 labels, 1586 min_count=min_count, 1587 is_datetimelike=is_datetimelike, 1588 compute_max=True, 1589 mask=mask, 1590 result_mask=result_mask, 1591 ) 1592 1593 1594 @cython.wraparound(False) 1595 @cython.boundscheck(False) 1596 def group_min( 1597 numeric_t[:, ::1] out, 1598 int64_t[::1] counts, 1599 ndarray[numeric_t, ndim=2] values, 1600 const intp_t[::1] labels, 1601 Py_ssize_t min_count=-1, 1602 bint is_datetimelike=False, 1603 const uint8_t[:, ::1] mask=None, 1604 uint8_t[:, ::1] result_mask=None, 1605 ) -> None: 1606 """See group_min_max.__doc__""" 1607 group_min_max( 1608 out, 1609 counts, 1610 values, 1611 labels, 1612 min_count=min_count, 1613 is_datetimelike=is_datetimelike, 1614 compute_max=False, 1615 mask=mask, 1616 result_mask=result_mask, 1617 ) 1618 1619 1620 @cython.boundscheck(False) 1621 @cython.wraparound(False) 1622 cdef group_cummin_max( 1623 numeric_t[:, ::1] out, 1624 ndarray[numeric_t, ndim=2] values, 1625 const uint8_t[:, ::1] mask, 1626 uint8_t[:, ::1] result_mask, 1627 const intp_t[::1] labels, 1628 int ngroups, 1629 bint is_datetimelike, 1630 bint skipna, 1631 bint compute_max, 1632 ): 1633 """ 1634 Cumulative minimum/maximum of columns 
of `values`, in row groups `labels`. 1635 1636 Parameters 1637 ---------- 1638 out : np.ndarray[numeric_t, ndim=2] 1639 Array to store cummin/max in. 1640 values : np.ndarray[numeric_t, ndim=2] 1641 Values to take cummin/max of. 1642 mask : np.ndarray[bool] or None 1643 If not None, indices represent missing values, 1644 otherwise the mask will not be used 1645 result_mask : ndarray[bool, ndim=2], optional 1646 If not None, these specify locations in the output that are NA. 1647 Modified in-place. 1648 labels : np.ndarray[np.intp] 1649 Labels to group by. 1650 ngroups : int 1651 Number of groups, larger than all entries of `labels`. 1652 is_datetimelike : bool 1653 True if `values` contains datetime-like entries. 1654 skipna : bool 1655 If True, ignore nans in `values`. 1656 compute_max : bool 1657 True if cumulative maximum should be computed, False 1658 if cumulative minimum should be computed 1659 1660 Notes 1661 ----- 1662 This method modifies the `out` parameter, rather than returning an object. 
1663 """ 1664 cdef: 1665 numeric_t[:, ::1] accum 1666 Py_ssize_t i, j, N, K 1667 numeric_t val, mval, na_val 1668 uint8_t[:, ::1] seen_na 1669 intp_t lab 1670 bint na_possible 1671 bint uses_mask = mask is not None 1672 bint isna_entry 1673 1674 accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype) 1675 accum[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike) 1676 1677 na_val = _get_na_val(<numeric_t>0, is_datetimelike) 1678 1679 if uses_mask: 1680 na_possible = True 1681 # Will never be used, just to avoid uninitialized warning 1682 na_val = 0 1683 elif numeric_t is float64_t or numeric_t is float32_t: 1684 na_possible = True 1685 elif is_datetimelike: 1686 na_possible = True 1687 else: 1688 # Will never be used, just to avoid uninitialized warning 1689 na_possible = False 1690 1691 if na_possible: 1692 seen_na = np.zeros((<object>accum).shape, dtype=np.uint8) 1693 1694 N, K = (<object>values).shape 1695 with nogil: 1696 for i in range(N): 1697 lab = labels[i] 1698 if lab < 0: 1699 continue 1700 for j in range(K): 1701 1702 if not skipna and na_possible and seen_na[lab, j]: 1703 if uses_mask: 1704 result_mask[i, j] = 1 1705 # Set to 0 ensures that we are deterministic and can 1706 # downcast if appropriate 1707 out[i, j] = 0 1708 1709 else: 1710 out[i, j] = na_val 1711 else: 1712 val = values[i, j] 1713 1714 if uses_mask: 1715 isna_entry = mask[i, j] 1716 else: 1717 isna_entry = _treat_as_na(val, is_datetimelike) 1718 1719 if not isna_entry: 1720 mval = accum[lab, j] 1721 if compute_max: 1722 if val > mval: 1723 accum[lab, j] = mval = val 1724 else: 1725 if val < mval: 1726 accum[lab, j] = mval = val 1727 out[i, j] = mval 1728 else: 1729 seen_na[lab, j] = 1 1730 out[i, j] = val 1731 1732 1733 @cython.boundscheck(False) 1734 @cython.wraparound(False) 1735 def group_cummin( 1736 numeric_t[:, ::1] out, 1737 ndarray[numeric_t, ndim=2] values, 1738 const intp_t[::1] labels, 1739 int ngroups, 1740 bint is_datetimelike, 1741 const 
uint8_t[:, ::1] mask=None, 1742 uint8_t[:, ::1] result_mask=None, 1743 bint skipna=True, 1744 ) -> None: 1745 """See group_cummin_max.__doc__""" 1746 group_cummin_max( 1747 out=out, 1748 values=values, 1749 mask=mask, 1750 result_mask=result_mask, 1751 labels=labels, 1752 ngroups=ngroups, 1753 is_datetimelike=is_datetimelike, 1754 skipna=skipna, 1755 compute_max=False, 1756 ) 1757 1758 1759 @cython.boundscheck(False) 1760 @cython.wraparound(False) 1761 def group_cummax( 1762 numeric_t[:, ::1] out, 1763 ndarray[numeric_t, ndim=2] values, 1764 const intp_t[::1] labels, 1765 int ngroups, 1766 bint is_datetimelike, 1767 const uint8_t[:, ::1] mask=None, 1768 uint8_t[:, ::1] result_mask=None, 1769 bint skipna=True, 1770 ) -> None: 1771 """See group_cummin_max.__doc__""" 1772 group_cummin_max( 1773 out=out, 1774 values=values, 1775 mask=mask, 1776 result_mask=result_mask, 1777 labels=labels, 1778 ngroups=ngroups, 1779 is_datetimelike=is_datetimelike, 1780 skipna=skipna, 1781 compute_max=True, 1782 )