// bcopy.s
/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination. Memmove and memcpy
 * return destination, whereas bcopy has no return value. Copying takes place
 * as if it were through a temporary buffer -- after return destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on MacOS).
 *
 * Register usage (all four entry points share one body):
 *
 *  In:     x0 = destination, x1 = source, x2 = length
 *          (bcopy/ovbcopy receive source in x0 and destination in x1 and
 *          swap them before falling into the common body).
 *  Out:    x0 = destination; the common body never writes x0.
 *  Writes: x1-x6, x8-x15 and the condition flags, plus whatever the frame
 *          macros touch.
 *
 * ARM64_STACK_PROLOG, PUSH_FRAME, POP_FRAME and ARM64_STACK_EPILOG are not
 * defined in this file (presumably they come from asm.h). Every exit path
 * ends with POP_FRAME / ARM64_STACK_EPILOG and is followed directly by
 * unrelated code, so ARM64_STACK_EPILOG is expected to perform the return.
 */

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

/* Lengths below kSmallCopy bytes are handled by the simple 8-byte/1-byte
 * loops; the large-copy engines require at least this many bytes. */
#define kSmallCopy 64

/*****************************************************************************
 *  Entrypoints                                                              *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
//  Translate bcopy into memcpy by swapping the first and second arguments.
//  Execution then falls through into _memcpy/_memmove below.
//  NOTE(review): the fall-through crosses the .align 4 directive, so it relies
//  on the assembler filling alignment padding in .text with no-ops -- confirm.
    mov     x3,      x0
    mov     x0,      x1
    mov     x1,      x3

.align 4
_memcpy:
_memmove:
//  Our preference is to copy the data in ascending address order, but if the
//  buffers overlap such that the beginning of the destination buffer aliases
//  the end of the source buffer, we need to copy in descending address order
//  instead to preserve the memmove semantics.  We detect this case with the
//  test:
//
//      destination - source < length    (unsigned compare)
//
//  If the address of the source buffer is higher than the address of the
//  destination buffer, this arithmetic can overflow, but the overflowed value
//  can only be smaller than length if the buffers do not overlap, so we don't
//  need to worry about false positives due to the overflow (they happen, but
//  only in cases where copying in either order is correct).
    ARM64_STACK_PROLOG
    PUSH_FRAME
    sub     x3,      x0, x1
    cmp     x3,      x2
    b.cc    L_reverse           // (dst - src) < length, unsigned
    mov     x3,      x0         // copy destination pointer
    cmp     x2,      #(kSmallCopy)
    b.cc    L_forwardSmallCopy

/*****************************************************************************
 *  Forward large copy                                                       *
 *****************************************************************************/

//  Load the first 32 bytes from src, and compute the number of bytes to the
//  first 32-byte aligned location in dst.  Even though we are going to copy
//  32 bytes, only those preceding that 32-byte location "count" towards
//  reducing the length of the buffer or advancing the pointers.  We will need
//  to issue the first load from the advanced src pointer BEFORE the store to
//  the unmodified dst pointer.
    add     x3,      x3, #32
    and     x3,      x3, #-32   // aligned dst
    ldp     x12,x13,[x1]
    ldp     x14,x15,[x1, #16]
    sub     x5,      x3, x0     // bytes between original dst and aligned dst
    add     x1,      x1, x5     // update src pointer

//  At this point, data in the following registers is in flight:
//
//      x0      original dst pointer
//      x1      corresponding location in src buffer.
//      x2      original length.  Once x5 has been subtracted below it is the
//              length from the aligned location in dst to the end of the
//              buffer, which is guaranteed to be >= (64 - 32) because
//              1 <= x5 <= 32 and the original length is >= 64.
//      x3      aligned location in dst buffer.
//      x5      bytes between original dst and aligned dst.
//      x12:x15 first 32 bytes of src buffer.
//
//  We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0.  The
//  store *may* overlap the first 32 bytes of the load, so in order to get
//  correct memmove semantics, the first 32 byte load must occur before the
//  store.
//
//  After loading these 32 bytes, we advance x1, and decrement the length by
//  64.  If the remaining length of the buffer was 64 or less, then we jump
//  directly to the cleanup path.
    ldp     x8, x9, [x1]
    ldp     x10,x11,[x1, #16]
    add     x1,      x1, #32
    sub     x2,      x2, x5     // update length
    stp     x12,x13,[x0]        // initial unaligned store
    stp     x14,x15,[x0, #16]   // initial unaligned store
    subs    x2,      x2, #64
    b.ls    L_forwardCleanup

L_forwardCopyLoop:
//  Main copy loop:
//
//      1. store the 32 bytes loaded in the previous loop iteration
//      2. advance the destination pointer
//      3. load the next 32 bytes
//      4. advance the source pointer
//      5. subtract 32 from the length
//
//  The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
//  trailing 1-32 bytes will be copied in the loop cleanup.
    stnp    x8, x9, [x3]
    stnp    x10,x11,[x3, #16]
    add     x3,      x3, #32
    ldnp    x8, x9, [x1]
    ldnp    x10,x11,[x1, #16]
    add     x1,      x1, #32
    subs    x2,      x2, #32
    b.hi    L_forwardCopyLoop

L_forwardCleanup:
//  There are 32 bytes in x8-x11 that were loaded in the previous loop
//  iteration, which need to be stored to [x3,x3+32).  In addition, between
//  0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
//  number of bytes to copy is x2 + 32 (x2 is zero or negative here).  Instead
//  of using smaller conditional copies, we simply copy 32 unaligned bytes
//  from x1+x2 to 32+x3+x2, so that the copy ends exactly at 64+x3+x2, the end
//  of the destination buffer.  This copy may overlap with the first store, so
//  the loads must come before the store of the data from the previous loop
//  iteration.
    add     x1,      x1, x2
    ldp     x12,x13,[x1]
    ldp     x14,x15,[x1, #16]
    stp     x8, x9, [x3]
    stp     x10,x11,[x3, #16]
    add     x3,      x3, x2
    stp     x12,x13,[x3, #32]
    stp     x14,x15,[x3, #48]
    POP_FRAME
    ARM64_STACK_EPILOG

/*****************************************************************************
 *  forward small copy                                                       *
 *****************************************************************************/

//  Copy 8 bytes at a time until less than 8 bytes remain to be copied, then
//  finish one byte at a time.  At the point of entry to L_forwardSmallCopy,
//  the "calling convention" is as follows:
//
//      x0      pointer to first byte of destination
//      x1      pointer to first byte of source
//      x2      length of buffers
//      x3      pointer to first byte of destination
0:  ldr     x6,     [x1],#8
    str     x6,     [x3],#8
L_forwardSmallCopy:
    subs    x2,      x2, #8
    b.cs    0b                  // no borrow: at least 8 bytes were left
    adds    x2,      x2, #8     // undo the final subtract; x2 = 0..7
    b.eq    2f
1:  ldrb    w6,     [x1],#1
    strb    w6,     [x3],#1
    subs    x2,      x2, #1
    b.ne    1b
2:  POP_FRAME
    ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse copy engines                                                     *
 *****************************************************************************/

//  The reverse copy engines are identical in every way to the forward copy
//  engines, except in that they do everything backwards.  For this reason, they
//  are somewhat more sparsely commented than the forward copy loops.  I have
//  tried to only comment things that might be somewhat surprising in how they
//  differ from the forward implementation.
//
//  The one important thing to note is that (almost without fail), x1 and x3
//  will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
//  throughout these copy loops.  They are initially advanced to that position
//  in the L_reverse jump island.  Because of this, whereas the forward copy
//  loops generally follow a "copy data, then advance pointers" scheme, in the
//  reverse copy loops, we advance the pointers, then copy the data.

L_reverse:
//  As a minor optimization, we early out if dst == src.  (x3 still holds
//  dst - src from the overlap test at the entry point.)
    cbz     x3,      L_return
//  advance both pointers to the ends of their respective buffers before
//  jumping into the appropriate reverse copy loop.
    add     x4,      x0, x2     // x4 = one past the end of dst
    add     x1,      x1, x2     // x1 = one past the end of src
    cmp     x2,      #(kSmallCopy)
    b.cc    L_reverseSmallCopy

/*****************************************************************************
 *  Reverse large copy                                                       *
 *****************************************************************************/

//  In the forward copy, we used (dst + 32) & -32 to find an aligned location
//  in the dest buffer.  Here we use (dst + length - 1) & -32 instead, because
//  we are going backwards; x5 is the distance from that aligned location to
//  the end of the dest buffer.
    ldp     x12,x13,[x1, #-16]
    ldp     x14,x15,[x1, #-32]
    sub     x3,      x4, #1
    and     x3,      x3, #-32   // aligned location in dst
    sub     x5,      x4, x3     // bytes between aligned dst and end of dst
    sub     x1,      x1, x5     // update src pointer
    sub     x2,      x2, x5     // update length
    ldp     x8, x9, [x1, #-16]
    ldp     x10,x11,[x1, #-32]
    stp     x12,x13,[x4, #-16]  // initial unaligned store
    stp     x14,x15,[x4, #-32]  // initial unaligned store
    sub     x1,      x1, #32
    subs    x2,      x2, #64
    b.ls    L_reverseCleanup

L_reverseCopyLoop:
    stnp    x8, x9, [x3, #-16]
    stnp    x10,x11,[x3, #-32]
    sub     x3,      x3, #32
    ldnp    x8, x9, [x1, #-16]
    ldnp    x10,x11,[x1, #-32]
    sub     x1,      x1, #32
    subs    x2,      x2, #32
    b.hi    L_reverseCopyLoop

L_reverseCleanup:
//  In the forward copy, we need to compute the address of the final two
//  stores, but here we already have a pointer to the start of the buffer (x0).
//  As in the forward cleanup, the loads of the last 32 bytes come before the
//  store of the data from the previous loop iteration.
    sub     x1,      x1, x2
    ldp     x12,x13,[x1, #-16]
    ldp     x14,x15,[x1, #-32]
    stp     x8, x9, [x3, #-16]
    stp     x10,x11,[x3, #-32]
    stp     x12,x13,[x0, #16]
    stp     x14,x15,[x0]
    POP_FRAME
    ARM64_STACK_EPILOG

/*****************************************************************************
 *  reverse small copy                                                       *
 *****************************************************************************/

//  Mirror image of the forward small copy: x1 and x4 point one past the end
//  of the source and destination, and are pre-decremented on every access.
0:  ldr     x6,     [x1,#-8]!
    str     x6,     [x4,#-8]!
L_reverseSmallCopy:
    subs    x2,      x2, #8
    b.cs    0b                  // no borrow: at least 8 bytes were left
    adds    x2,      x2, #8     // undo the final subtract; x2 = 0..7
    b.eq    2f
1:  ldrb    w6,     [x1,#-1]!
    strb    w6,     [x4,#-1]!
    subs    x2,      x2, #1
    b.ne    1b
2:  POP_FRAME
    ARM64_STACK_EPILOG


L_return:
//  Reached only from the dst == src early-out in L_reverse; nothing to copy.
    POP_FRAME
    ARM64_STACK_EPILOG