/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination.  Memmove and memcpy
 * return destination, whereas bcopy has no return value.  Copying takes place
 * as if it were through a temporary buffer -- after return destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on MacOS).
 */
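/*
 *  Illustrative usage (a hedged C sketch, not part of this file; the buffer
 *  and values below are made up).  The overlapping memmove call is always
 *  well-defined, and per the note above this implementation preserves the
 *  same behavior for memcpy:
 *
 *      char buf[8] = "abcdefg";
 *      memmove(buf + 1, buf, 6);   // buf is now "aabcdef"
 *      bcopy(buf, buf + 1, 6);     // the same copy expressed as bcopy: the
 *                                  // arguments are (source, destination,
 *                                  // length) and there is no return value
 */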

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

#define kSmallCopy 64

/*****************************************************************************
 *  Entrypoints                                                              *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
//  Translate bcopy into memcpy by swapping the first and second arguments.
	mov     x3,      x0
	mov     x0,      x1
	mov     x1,      x3

.align 4
_memcpy:
_memmove:
//	Our preference is to copy the data in ascending address order, but if the
//	buffers overlap such that the beginning of the destination buffer aliases
//	the end of the source buffer, we need to copy in descending address order
//	instead to preserve the memmove semantics.  We detect this case with the
//	test:
//
//	    destination - source < length    (unsigned compare)
//
//	If the address of the source buffer is higher than the address of the
//	destination buffer, this arithmetic can overflow, but the overflowed value
//	can only be smaller than length if the buffers do not overlap, so we don't
//	need to worry about false positives due to the overflow (they happen, but
//	only in cases where copying in either order is correct).
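//
//	In C terms, the dispatch below is roughly the following (a sketch only;
//	the names are illustrative and not part of this file):
//
//	    if ((uintptr_t)dst - (uintptr_t)src < (uintptr_t)len)
//	        copy_backwards(dst, src, len);  // dst aliases the tail of src
//	                                        // (or dst == src, on which the
//	                                        // reverse path early-outs)
//	    else
//	        copy_forwards(dst, src, len);
//
//	copy_backwards and copy_forwards stand in for the reverse and forward
//	engines below.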
	ARM64_STACK_PROLOG
	PUSH_FRAME
	sub     x3,      x0, x1
	cmp     x3,      x2
	b.cc    L_reverse
	mov     x3,      x0      // copy destination pointer
	cmp     x2,      #(kSmallCopy)
	b.cc    L_forwardSmallCopy

/*****************************************************************************
 *  Forward large copy                                                       *
 *****************************************************************************/

//	Load the first 32 bytes from src, and compute the number of bytes to the
//	first 32-byte aligned location in dst.  Even though we are going to copy
//	32 bytes, only those preceding that 32-byte location "count" towards
//	reducing the length of the buffer or advancing the pointers.  We will need
//	to issue the first load from the advanced src pointer BEFORE the store to
//	the unmodified dst pointer.
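//
//	Worked example (addresses are made up): if dst = 0x1007, the first
//	32-byte aligned location is (0x1007 + 32) & -32 = 0x1020, so only
//	0x1020 - 0x1007 = 25 of the 32 bytes stored below "count"; the pointers
//	advance by 25 and the length shrinks by 25.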
	add     x3,      x3, #32
	and     x3,      x3, #-32 // aligned dst
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	sub     x5,      x3, x0   // bytes between original dst and aligned dst
	add     x1,      x1, x5   // update src pointer

//	At this point, data in the following registers is in flight:
//
//		x0    original dst pointer
//		x1    corresponding location in src buffer.
//		x2    length from aligned location in dst to end of buffer.  This is
//		      guaranteed to be >= (64 - 32).
//		x3    aligned location in dst buffer.
//		x12:x15 first 32 bytes of src buffer.
//
//	We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x3.  The
//	store *may* overlap the first 32 bytes of the load, so in order to get
//	correct memmove semantics, the first 32 byte load must occur before the
//	store.
//
//	After loading these 32 bytes, we advance x1, and decrement the length by
//	64.  If the remaining length of the buffer was less than 64, then we jump
//	directly to the cleanup path.
	ldp     x8, x9, [x1]
	ldp     x10,x11,[x1, #16]
	add     x1,      x1, #32
	sub     x2,      x2, x5   // update length
	stp     x12,x13,[x0]      // initial unaligned store
	stp     x14,x15,[x0, #16] // initial unaligned store
	subs    x2,      x2, #64
	b.ls    L_forwardCleanup

L_forwardCopyLoop:
//	Main copy loop:
//
//		1. store the 32 bytes loaded in the previous loop iteration
//		2. advance the destination pointer
//		3. load the next 32 bytes
//		4. advance the source pointer
//		5. subtract 32 from the length
//
//	The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
//	trailing 1-32 bytes will be copied in the loop cleanup.
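//
//	A rough C sketch of the steady state (load32 and store32 are hypothetical
//	helpers standing in for the ldnp/stnp pairs; the length is treated as
//	signed here, whereas the assembly relies on the flags set by subs):
//
//	    do {
//	        store32(dst, block);  dst += 32;  // data loaded last iteration
//	        block = load32(src);  src += 32;  // data for the next iteration
//	        remaining -= 32;
//	    } while (remaining > 0);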
	stnp    x8, x9, [x3]
	stnp    x10,x11,[x3, #16]
	add     x3,      x3, #32
	ldnp    x8, x9, [x1]
	ldnp    x10,x11,[x1, #16]
	add     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_forwardCopyLoop

L_forwardCleanup:
//	There are 32 bytes in x8-x11 that were loaded in the previous loop
//	iteration, which need to be stored to [x3,x3+32).  In addition, between
//	0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
//	number of bytes to copy is x2 + 32.  Instead of using smaller conditional
//	copies, we simply copy 32 unaligned bytes from x1+x2 to 64+x3+x2.
//	This copy may overlap with the first store, so the loads must come before
//	the store of the data from the previous loop iteration.
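//
//	Worked example (made-up value): if x2 = -10 on entry, exactly 22 bytes
//	remain beyond the [x3,x3+32) store.  The final 32-byte copy reads
//	[x1-10,x1+22) and writes [x3+22,x3+54); its first 10 bytes re-cover data
//	already stored from x8-x11, which is harmless because the ldp loads are
//	issued before that store.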
	add     x1,      x1, x2
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	stp     x8, x9, [x3]
	stp     x10,x11,[x3, #16]
	add     x3,      x3, x2
	stp     x12,x13,[x3, #32]
	stp     x14,x15,[x3, #48]
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  forward small copy                                                       *
 *****************************************************************************/

//	Copy one quadword at a time until less than 8 bytes remain to be copied.
//	At the point of entry to L_forwardSmallCopy, the "calling convention"
//	is as follows:
//
//	  x0     pointer to first byte of destination
//	  x1     pointer to first byte of source
//	  x2     length of buffers
//	  x3     pointer to first byte of destination
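//
//	A rough C sketch of the two loops below (illustrative only; it glosses
//	over the alignment and aliasing caveats that do not apply in assembly):
//
//	    while (n >= 8) { *(uint64_t *)d = *(const uint64_t *)s;
//	                     d += 8;  s += 8;  n -= 8; }
//	    while (n-- > 0) *d++ = *s++;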
0:	ldr     x6,     [x1],#8
	str     x6,     [x3],#8
L_forwardSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b
	adds    x2,      x2, #8
	b.eq    2f
1:	ldrb    w6,     [x1],#1
	strb    w6,     [x3],#1
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse copy engines                                                     *
 *****************************************************************************/

//	The reverse copy engines are identical in every way to the forward copy
//	engines, except in that they do everything backwards.  For this reason, they
//	are somewhat more sparsely commented than the forward copy loops.  I have
//	tried to only comment things that might be somewhat surprising in how they
//	differ from the forward implementation.
//
//	The one important thing to note is that (almost without fail), x1 and x3
//	will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
//	throughout these copy loops.  They are initially advanced to that position
//	in the L_reverse jump island.  Because of this, whereas the forward copy
//	loops generally follow a "copy data, then advance pointers" scheme, in the
//	reverse copy loops, we advance the pointers, then copy the data.
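//
//	In C terms, the reverse byte loop is the classic idiom (sketch only):
//
//	    d = dst + n;  s = src + n;    // one past the right-hand edge
//	    while (n-- > 0) *--d = *--s;  // advance the pointers, then copy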

L_reverse:
//	As a minor optimization, we early out if dst == src.
	cbz     x3,      L_return
//	advance both pointers to the ends of their respective buffers before
//	jumping into the appropriate reverse copy loop.
	add     x4,      x0, x2
	add     x1,      x1, x2
	cmp     x2,      #(kSmallCopy)
	b.cc    L_reverseSmallCopy

/*****************************************************************************
 *  Reverse large copy                                                       *
 *****************************************************************************/

	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	sub     x3,      x4, #1   // In the forward copy, we used dst+32 & -32
	and     x3,      x3, #-32 // to find an aligned location in the dest
	sub     x5,      x4, x3   // buffer.  Here we use dst-1 & -32 instead,
	sub     x1,      x1, x5   // because we are going backwards.
	sub     x2,      x2, x5
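//	Worked example (addresses are made up): if the end of the dst buffer is
//	x4 = 0x1039, then (x4 - 1) & -32 = 0x1020, so x5 = 25 of the 32 bytes
//	stored below "count"; the pointers move back by 25 and the length shrinks
//	by 25, mirroring the forward setup.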
	ldp     x8, x9, [x1, #-16]
	ldp     x10,x11,[x1, #-32]
	stp     x12,x13,[x4, #-16]
	stp     x14,x15,[x4, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #64
	b.ls    L_reverseCleanup

L_reverseCopyLoop:
	stnp    x8, x9, [x3, #-16]
	stnp    x10,x11,[x3, #-32]
	sub     x3,      x3, #32
	ldnp    x8, x9, [x1, #-16]
	ldnp    x10,x11,[x1, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_reverseCopyLoop

L_reverseCleanup:
	sub     x1,      x1, x2
	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	stp     x8, x9, [x3, #-16]
	stp     x10,x11,[x3, #-32]
	stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
	stp     x14,x15,[x0]      // address of these stores, but here we already
	POP_FRAME       // have a pointer to the start of the buffer.
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  reverse small copy                                                       *
 *****************************************************************************/

0:	ldr     x6,     [x1,#-8]!
	str     x6,     [x4,#-8]!
L_reverseSmallCopy:
	subs    x2,      x2, #8
	b.cs    0b
	adds    x2,      x2, #8
	b.eq    2f
1:	ldrb    w6,     [x1,#-1]!
	strb    w6,     [x4,#-1]!
	subs    x2,      x2, #1
	b.ne    1b
2:	POP_FRAME
	ARM64_STACK_EPILOG


L_return:
	POP_FRAME
	ARM64_STACK_EPILOG