/ software / libsprite / sprite.S
sprite.S
  1  // Functions for doing simple 2D graphics operations on a RGB scanline buffer.
  2  
  3  #include "hardware/regs/addressmap.h"
  4  #include "hardware/regs/sio.h"
  5  
  6  #include "sprite_asm_const.h"
  7  
  8  #define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
  9  #define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 10  #define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 11  
 12  .syntax unified
 13  .cpu cortex-m0plus
 14  .thumb
 15  
 16  // ----------------------------------------------------------------------------
 17  // Colour fill
 18  
 19  // r0: dst
 20  // r1: value
 21  // r2: count
 22  
 23  decl_func sprite_fill8
 24  	// Slide for short fills
 25  	cmp r2, #18
 26  	bhi 2f
 27  	adr r3, 1f
 28  	lsls r2, #1
 29  	subs r3, r2
 30  	adds r3, #1 // thumb bit
 31  	bx r3
 32  .align 2
 33  	strb r1, [r0, #17]
 34  	strb r1, [r0, #16]
 35  	strb r1, [r0, #15]
 36  	strb r1, [r0, #14]
 37  	strb r1, [r0, #13]
 38  	strb r1, [r0, #12]
 39  	strb r1, [r0, #11]
 40  	strb r1, [r0, #10]
 41  	strb r1, [r0, #9]
 42  	strb r1, [r0, #8]
 43  	strb r1, [r0, #7]
 44  	strb r1, [r0, #6]
 45  	strb r1, [r0, #5]
 46  	strb r1, [r0, #4]
 47  	strb r1, [r0, #3]
 48  	strb r1, [r0, #2]
 49  	strb r1, [r0, #1]
 50  	strb r1, [r0, #0]
 51  1:
 52  	bx lr
 53  2:
 54  	lsls r3, r1, #8
 55  	orrs r1, r3
 56  	lsls r3, r1, #16
 57  	orrs r1, r3
 58  	// Get r0 word-aligned:
 59  	lsrs r3, r0, #1
 60  	bcc 1f
 61  	strb r1, [r0]
 62  	adds r0, #1
 63  	subs r2, #1
 64  1:
 65  	lsrs r3, r0, #2
 66  	bcc 1f
 67  	strh r1, [r0]
 68  	adds r0, #2
 69  	subs r2, #2
 70  1:
 71  	// Set up for main loop. Limit pointer at end - (loop body size - 1)
 72  	push {r4}
 73  	adds r2, r0
 74  	subs r2, #15
 75  	mov ip, r2
 76  	mov r2, r1
 77  	mov r3, r1
 78  	mov r4, r1
 79  
 80  	// Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
 81  1:
 82  	stmia r0!, {r1, r2, r3, r4}
 83  	cmp r0, ip
 84  	blo 1b
 85  
 86  	// Main loop done, now tidy up the odds and ends
 87  	mov r4, ip
 88  	subs r4, r0
 89  	adds r4, #15
 90  	// No more than 15 bytes remaining -- first test bit 3
 91  	lsls r4, #29
 92  	bcc 1f
 93  	stmia r0!, {r1, r2}
 94  1:
 95  	lsls r4, #1
 96  	bcc 1f
 97  	stmia r0!, {r1}
 98  1:
 99  	lsls r4, #1
100  	bcc 1f
101  	strh r1, [r0]
102  	adds r0, #2
103  1:
104  	lsls r4, #1
105  	bcc 1f
106  	strb r1, [r0]
107  1:
108  	pop {r4}
109  	bx lr
110  
111  
// sprite_fill16: write the same 16-bit value to a run of pixels.
//   r0: dst (accessed with strh, so it must be at least halfword-aligned)
//   r1: value (the upper 16 bits are ORed with the replicated value, so they
//       are expected to be zero)
//   r2: count, in pixels
decl_func sprite_fill16
	// Slide for short fills: each strh is 2 bytes of code, so jumping to
	// (slide end - 2 * count) executes exactly count stores.
	cmp r2, #15
	bhi 2f
	adr r3, 1f
	lsls r2, #1
	subs r3, r2
	adds r3, #1 // thumb bit
	bx r3
.align 2
	strh r1, [r0, #30]
	strh r1, [r0, #28]
	strh r1, [r0, #26]
	strh r1, [r0, #24]
	strh r1, [r0, #22]
	strh r1, [r0, #20]
	strh r1, [r0, #18]
	strh r1, [r0, #16]
	strh r1, [r0, #14]
	strh r1, [r0, #12]
	strh r1, [r0, #10]
	strh r1, [r0, #8]
	strh r1, [r0, #6]
	strh r1, [r0, #4]
	strh r1, [r0, #2]
	strh r1, [r0, #0]
1:
	bx lr
2:
	push {r4, r5, r6, r7, lr}
	// Get dst word-aligned before the main fill loop, because stmia needs a
	// word-aligned address. The bit to test is bit 1 of the destination
	// pointer (r0), not of the pixel count.
	lsrs r3, r0, #2
	bcc 1f
	strh r1, [r0]
	adds r0, #2
	subs r2, #1
1:
	// Set limit pointer at end - 26. Pointers here are even, so "r0 < limit"
	// is the same as "a full 28-byte loop body still fits".
	lsls r2, #1
	adds r2, r0
	subs r2, #26
	mov ip, r2

	// Replicate the pixel into both halves of seven registers.
	lsls r2, r1, #16
	orrs r1, r2
	mov r2, r1
	mov r3, r1
	mov r4, r1
	mov r5, r1
	mov r6, r1
	mov r7, r1
	// We can fall through because cases < 1 loop are handled by the slide:
	// at least 15 pixels (30 bytes) remain after the alignment step.
1:
	stmia r0!, {r1, r2, r3, r4, r5, r6, r7}
	cmp r0, ip
	blo 1b

	// r2 = end - r0 = bytes still to write (even, 0 to 26). Shift bits 4, 3,
	// 2, 1 of that count into carry in turn and emit the matching store.
	movs r2, #26
	add r2, ip
	subs r2, r0

	lsls r2, #28
	bcc 1f
	stmia r0!, {r4, r5, r6, r7}
1:
	lsls r2, #1
	bcc 1f
	stmia r0!, {r4, r5}
1:
	lsls r2, #1
	bcc 1f
	stmia r0!, {r4}
1:
	lsls r2, #1
	bcc 1f
	strh r4, [r0]
1:
	pop {r4, r5, r6, r7, pc}
191  
192  // ----------------------------------------------------------------------------
193  // Non-AT sprite
194  
195  // r0: dst
196  // r1: src
197  // r2: pixel count
198  //
199  
200  // Unrolled loop body with an initial computed branch.
201  
decl_func sprite_blit8
	// Split the count into whole groups of 8 and a remainder. Both pointers
	// are moved past the whole groups; the copy then walks back towards the
	// start of dst, which is kept in ip as the stop address.
	mov ip, r0
	lsrs r3, r2, #3
	lsls r3, r3, #3   // r3 = count rounded down to a multiple of 8
	eors r2, r2, r3   // r2 = count % 8

	add r0, r3
	add r1, r3

	// Enter the unrolled body so that only the last (count % 8) load/store
	// pairs run first. Each pair is 4 bytes of code.
	adr r3, .Lblit8_check
	lsls r2, r2, #2
	subs r3, r3, r2
	adds r3, r3, #1 // bx needs bit 0 set to stay in Thumb state
	bx r3

.p2align 2
.Lblit8_group:
	subs r0, r0, #8
	subs r1, r1, #8
	.irp idx,7,6,5,4,3,2,1,0
	ldrb r3, [r1, #\idx]
	strb r3, [r0, #\idx]
	.endr
.Lblit8_check:
	cmp r0, ip
	bhi .Lblit8_group
	bx lr
241  
// Copy the byte at [r1 + idx] to [r0 + idx], but only when the shift by
// ALPHA_SHIFT_8BPP moves a set bit into carry (the alpha bit of the pixel).
// Clobbers r2, r3 and the flags. Expands to four 2-byte instructions;
// sprite_blit8_alpha multiplies by that 8-byte size for its computed branch.
.macro sprite_blit8_alpha_body idx
	ldrb r3, [r1, #\idx]
	lsrs r2, r3, #ALPHA_SHIFT_8BPP
	bcc 9f
	strb r3, [r0, #\idx]
9:
.endm
249  
decl_func sprite_blit8_alpha
	// Same register interface as sprite_blit8 (r0 dst, r1 src, r2 pixel
	// count). Pixels are copied only where the body macro finds the alpha
	// bit set. ip holds the start of dst as the stop address.
	mov ip, r0
	lsrs r3, r2, #3
	lsls r3, r3, #3   // r3 = count rounded down to a multiple of 8
	eors r2, r2, r3   // r2 = count % 8

	add r0, r3
	add r1, r3

	// Each per-pixel body is 8 bytes of code, so back up 8 * (count % 8)
	// bytes from the loop test to run just the remainder first.
	adr r3, .Lblit8a_check
	lsls r2, r2, #3
	subs r3, r3, r2
	adds r3, r3, #1 // bx needs bit 0 set to stay in Thumb state
	bx r3

.p2align 2
.Lblit8a_group:
	subs r0, r0, #8
	subs r1, r1, #8
	.irp idx,7,6,5,4,3,2,1,0
	sprite_blit8_alpha_body \idx
	.endr
.Lblit8a_check:
	cmp r0, ip
	bhi .Lblit8a_group
	bx lr
281  
282  
// Store the 32-bit value in register val at [base + off] as two halfword
// stores, low half first, so the address only has to be halfword-aligned.
// val is shifted right by 16 in the process and does not keep its value.
.macro storew_alignh val base off
	strh \val, [\base, #\off]
	lsrs \val, \val, #16
	strh \val, [\base, #(\off + 2)]
.endm
288  
decl_func sprite_blit16
	// r0 dst, r1 src, r2 pixel count. The source is read with word loads, so
	// if bit 1 of src is set, copy one pixel first to word-align it. The
	// destination is always written with halfword stores.
	lsrs r3, r1, #2
	bcc .Lblit16_src_aligned
	ldrh r3, [r1]
	strh r3, [r0]
	adds r1, r1, #2
	adds r0, r0, #2
	subs r2, r2, #1
.Lblit16_src_aligned:
	// ip = dst end - 16. One pass of the loop moves 8 pixels (16 bytes) and
	// runs while r0 <= ip, which leaves 0 to 7 pixels for the tail.
	lsls r2, r2, #1
	adds r2, r2, r0
	subs r2, r2, #16
	mov ip, r2
	b .Lblit16_test
.Lblit16_chunk:
	ldmia r1!, {r2, r3}
	storew_alignh r2, r0, 0
	storew_alignh r3, r0, 4
	ldmia r1!, {r2, r3}
	storew_alignh r2, r0, 8
	storew_alignh r3, r0, 12
	adds r0, r0, #16
.Lblit16_test:
	cmp r0, ip
	bls .Lblit16_chunk

	// r2 = (end - 16) - r0. Its low four bits equal those of the remaining
	// byte count, so bits 3, 2 and 1 select 4, 2 and 1 more pixels.
	mov r2, ip
	subs r2, r2, r0
	lsls r2, r2, #29
	bcc .Lblit16_tail2
	ldmia r1!, {r3}
	storew_alignh r3, r0, 0
	ldmia r1!, {r3}
	storew_alignh r3, r0, 4
	adds r0, r0, #8
.Lblit16_tail2:
	lsls r2, r2, #1
	bcc .Lblit16_tail1
	ldmia r1!, {r3}
	storew_alignh r3, r0, 0
	adds r0, r0, #4
.Lblit16_tail1:
	lsls r2, r2, #1
	bcc .Lblit16_done
	ldrh r3, [r1]
	strh r3, [r0]
.Lblit16_done:
	bx lr
343  
// Copy the halfword at [r1 + 2*idx] to [r0 + 2*idx], but only when the shift
// by ALPHA_SHIFT_16BPP moves a set bit into carry (the alpha bit of the
// pixel). Clobbers r2, r3 and the flags. Expands to four 2-byte instructions;
// sprite_blit16_alpha multiplies by that 8-byte size for its computed branch.
.macro sprite_blit16_alpha_body idx
	ldrh r3, [r1, #2*\idx]
	lsrs r2, r3, #ALPHA_SHIFT_16BPP
	bcc 9f
	strh r3, [r0, #2*\idx]
9:
.endm
351  
decl_func sprite_blit16_alpha
	// r0 dst, r1 src, r2 pixel count. Pixels are copied only where the body
	// macro finds the alpha bit set. ip holds the start of dst as the stop
	// address for the backwards walk.
	mov ip, r0
	lsrs r3, r2, #3
	lsls r3, r3, #3   // r3 = count rounded down to a multiple of 8
	eors r2, r2, r3   // r2 = count % 8

	lsls r3, r3, #1   // pixels to bytes
	add r0, r3
	add r1, r3

	// Each per-pixel body is 8 bytes of code, so back up 8 * (count % 8)
	// bytes from the loop test to run just the remainder first.
	adr r3, .Lblit16a_check
	lsls r2, r2, #3
	subs r3, r3, r2
	adds r3, r3, #1 // bx needs bit 0 set to stay in Thumb state
	bx r3

.p2align 2
.Lblit16a_group:
	subs r0, r0, #16
	subs r1, r1, #16
	.irp idx,7,6,5,4,3,2,1,0
	sprite_blit16_alpha_body \idx
	.endr
.Lblit16a_check:
	cmp r0, ip
	bhi .Lblit16a_group
	bx lr
384  
385  
386  // ----------------------------------------------------------------------------
387  // Affine-transformed sprite (note these are just the inner loops -- INTERP0
388  // must be configured by the caller, which is presumably not written in asm)
389  
390  // r0: raster start pointer
391  // r1: raster span size (pixels)
392  
// One affine-sampled 8bpp pixel. Expects r3 = address of INTERP0 ACCUM0.
// Reads CTRL_LANE0 and then POP_FULL; the popped value is used as the address
// of the source byte. If the shift moves a set OVERF bit of CTRL_LANE0 into
// carry, the pixel is skipped, otherwise the byte is stored to [r0 + idx].
// POP_FULL is read in either case. Clobbers r1, r2 and the flags. Expands to
// six 2-byte instructions (12 bytes), which sprite_ablit8_loop relies on.
.macro sprite_ablit8_loop_body idx
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, r1, #(SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1)
	bcs 9f
	ldrb r2, [r2]
	strb r2, [r0, #\idx]
9:
.endm
402  
decl_func sprite_ablit8_loop
	// r0 raster start pointer, r1 span size in pixels. ip keeps the start
	// pointer as the stop address; r0 is moved past the whole groups of 8
	// and the span is then written walking back towards ip.
	mov ip, r0

	lsrs r2, r1, #3
	lsls r2, r2, #3   // r2 = span rounded down to a multiple of 8
	eors r1, r1, r2   // r1 = span % 8
	add r0, r2

	// Back up 12 bytes of code per remainder pixel from the loop test.
	adr r2, .Lablit8_check
	movs r3, #12
	muls r1, r3
	subs r2, r2, r1
	adds r2, r2, #1 // bx needs bit 0 set to stay in Thumb state

	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	bx r2

.p2align 2
	// Padding: nop + subs + eight 12-byte bodies = 100 bytes, which puts the
	// adr target below on a word boundary.
	nop
.Lablit8_group:
	subs r0, r0, #8
	.irp idx,7,6,5,4,3,2,1,0
	sprite_ablit8_loop_body \idx
	.endr
.Lablit8_check:
	cmp r0, ip
	bne .Lablit8_group
	bx lr
436  
437  
438  
439  // As above but bit 5 is assumed to be an alpha bit (RAGB2132)
440  
// One affine-sampled 8bpp pixel with an alpha test. Expects r3 = address of
// INTERP0 ACCUM0. Reads CTRL_LANE0 and then POP_FULL; the popped value is used
// as the address of the source byte. The pixel is skipped if the OVERF bit
// lands in carry, or if the shift by ALPHA_SHIFT_8BPP leaves carry clear;
// otherwise the byte is stored to [r0 + idx]. Clobbers r1, r2 and the flags.
// Expands to eight 2-byte instructions (16 bytes), which
// sprite_ablit8_alpha_loop relies on.
.macro sprite_ablit8_alpha_loop_body idx
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, r1, #(SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1)
	bcs 9f
	ldrb r2, [r2]
	lsrs r1, r2, #ALPHA_SHIFT_8BPP
	bcc 9f
	strb r2, [r0, #\idx]
9:
.endm
452  
decl_func sprite_ablit8_alpha_loop
	// r0 raster start pointer, r1 span size in pixels. ip keeps the start
	// pointer as the stop address and r3 holds the INTERP0 ACCUM0 address
	// for the body macro.
	mov ip, r0
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)

	lsrs r2, r1, #3
	lsls r2, r2, #3   // r2 = span rounded down to a multiple of 8
	eors r1, r1, r2   // r1 = span % 8
	add r0, r2

	// Back up 16 bytes of code per remainder pixel from the loop test.
	adr r2, .Lablit8a_check
	lsls r1, r1, #4
	subs r2, r2, r1
	adds r2, r2, #1 // bx needs bit 0 set to stay in Thumb state
	bx r2

.p2align 2
	// Padding: nop + subs + eight 16-byte bodies = 132 bytes, which puts the
	// adr target below on a word boundary.
	nop
.Lablit8a_group:
	subs r0, r0, #8
	.irp idx,7,6,5,4,3,2,1,0
	sprite_ablit8_alpha_loop_body \idx
	.endr
.Lablit8a_check:
	cmp r0, ip
	bhi .Lablit8a_group
	bx lr
484  
485  
486  
// One affine-sampled 16bpp pixel. Expects r3 = address of INTERP0 ACCUM0.
// Reads CTRL_LANE0 and then POP_FULL; the popped value is used as the address
// of the source halfword. If the shift moves a set OVERF bit of CTRL_LANE0
// into carry, the pixel is skipped, otherwise the halfword is stored to
// [r0 + 2*idx]. Clobbers r1, r2 and the flags. Expands to six 2-byte
// instructions (12 bytes), which sprite_ablit16_loop relies on.
.macro sprite_ablit16_loop_body idx
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, r1, #(SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1)
	bcs 9f
	ldrh r2, [r2]
	strh r2, [r0, #2*\idx]
9:
.endm
496  
decl_func sprite_ablit16_loop
	// r0 raster start pointer, r1 span size in pixels. ip keeps the start
	// pointer as the stop address; r0 is moved past the whole groups of 8
	// pixels and the span is then written walking back towards ip.
	mov ip, r0

	lsrs r2, r1, #3
	lsls r2, r2, #3   // r2 = span rounded down to a multiple of 8
	eors r1, r1, r2   // r1 = span % 8
	lsls r2, r2, #1   // pixels to bytes
	add r0, r2

	// Back up 12 bytes of code per remainder pixel from the loop test.
	adr r2, .Lablit16_check
	movs r3, #12
	muls r1, r3
	subs r2, r2, r1
	adds r2, r2, #1 // bx needs bit 0 set to stay in Thumb state

	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	bx r2

.p2align 2
	// Padding: nop + subs + eight 12-byte bodies = 100 bytes, which puts the
	// adr target below on a word boundary.
	nop
.Lablit16_group:
	subs r0, r0, #16
	.irp idx,7,6,5,4,3,2,1,0
	sprite_ablit16_loop_body \idx
	.endr
.Lablit16_check:
	cmp r0, ip
	bne .Lablit16_group
	bx lr
531  
532  
533  
// One affine-sampled 16bpp pixel with an alpha test. Expects r3 = address of
// INTERP0 ACCUM0. Reads CTRL_LANE0 and then POP_FULL; the popped value is used
// as the address of the source halfword. The pixel is skipped if the OVERF
// bit lands in carry, or if the shift by ALPHA_SHIFT_16BPP leaves carry clear;
// otherwise the halfword is stored to [r0 + 2*idx]. Clobbers r1, r2 and the
// flags. Expands to eight 2-byte instructions (16 bytes), which
// sprite_ablit16_alpha_loop relies on.
.macro sprite_ablit16_alpha_loop_body idx
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, r1, #(SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1)
	bcs 9f
	ldrh r2, [r2]
	lsrs r1, r2, #ALPHA_SHIFT_16BPP
	bcc 9f
	strh r2, [r0, #2*\idx]
9:
.endm
545  
decl_func sprite_ablit16_alpha_loop
	// r0 raster start pointer, r1 span size in pixels. ip keeps the start
	// pointer as the stop address and r3 holds the INTERP0 ACCUM0 address
	// for the body macro.
	mov ip, r0
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)

	lsrs r2, r1, #3
	lsls r2, r2, #3   // r2 = span rounded down to a multiple of 8
	eors r1, r1, r2   // r1 = span % 8
	lsls r2, r2, #1   // pixels to bytes
	add r0, r2

	// Back up 16 bytes of code per remainder pixel from the loop test.
	adr r2, .Lablit16a_check
	lsls r1, r1, #4
	subs r2, r2, r1
	adds r2, r2, #1 // bx needs bit 0 set to stay in Thumb state
	bx r2

.p2align 2
	// Padding: nop + subs + eight 16-byte bodies = 132 bytes, which puts the
	// adr target below on a word boundary.
	nop
.Lablit16a_group:
	subs r0, r0, #16
	.irp idx,7,6,5,4,3,2,1,0
	sprite_ablit16_alpha_loop_body \idx
	.endr
.Lablit16a_check:
	cmp r0, ip
	bhi .Lablit16a_group
	bx lr