// sprite.S
// Functions for doing simple 2D graphics operations on a RGB scanline buffer.
//
// Target: Cortex-M0+ (Thumb, unified syntax). Arguments arrive in r0-r2 as
// described above each function. Only r0-r3 and ip are used as scratch, except
// where a function explicitly pushes and pops further registers.
//
// NOTE(review): decl_func and the ALPHA_SHIFT_* constants are not defined in
// this file -- presumably they come from sprite_asm_const.h; confirm there.
//
// Many routines below use a "slide": an unrolled run of fixed-size instruction
// groups which is entered part-way through via a computed branch, so that
// exactly `count` groups execute. `adr` yields the address of the label at the
// end of the run; group_size * count is subtracted from it, and bit 0 is set
// because `bx` requires the Thumb bit. The `.align 2` (and, where present, the
// single `nop`) before each run keeps the label targeted by `adr` on a word
// boundary.

#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"

#include "sprite_asm_const.h"

// Register offsets relative to INTERP0's ACCUM0, so that one base register
// (SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET) reaches them with an immediate offset.
#define POP2_OFFS (SIO_INTERP0_POP_FULL_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define CTRL0_OFFS (SIO_INTERP0_CTRL_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define INTERP1 (SIO_INTERP1_ACCUM0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)

.syntax unified
.cpu cortex-m0plus
.thumb

// ----------------------------------------------------------------------------
// Colour fill

// r0: dst
// r1: value
// r2: count

// Fill `count` bytes at dst with the low byte of `value`.
// The value is replicated across a word with shift/or, so bits 8 and up of r1
// are assumed to be zero on entry -- TODO confirm callers pass a zero-extended
// 8-bit value.
decl_func sprite_fill8
	// Slide for short fills
	cmp r2, #18
	bhi 2f
	adr r3, 1f
	lsls r2, #1        // each strb below is 2 bytes
	subs r3, r2
	adds r3, #1 // thumb bit
	bx r3
.align 2
	strb r1, [r0, #17]
	strb r1, [r0, #16]
	strb r1, [r0, #15]
	strb r1, [r0, #14]
	strb r1, [r0, #13]
	strb r1, [r0, #12]
	strb r1, [r0, #11]
	strb r1, [r0, #10]
	strb r1, [r0, #9]
	strb r1, [r0, #8]
	strb r1, [r0, #7]
	strb r1, [r0, #6]
	strb r1, [r0, #5]
	strb r1, [r0, #4]
	strb r1, [r0, #3]
	strb r1, [r0, #2]
	strb r1, [r0, #1]
	strb r1, [r0, #0]
1:
	bx lr
2:
	// Long fill (count >= 19): replicate the byte into all four byte lanes of r1
	lsls r3, r1, #8
	orrs r1, r3
	lsls r3, r1, #16
	orrs r1, r3
	// Get r0 word-aligned:
	lsrs r3, r0, #1    // carry = dst bit 0
	bcc 1f
	strb r1, [r0]
	adds r0, #1
	subs r2, #1
1:
	lsrs r3, r0, #2    // carry = dst bit 1
	bcc 1f
	strh r1, [r0]
	adds r0, #2
	subs r2, #2
1:
	// Set up for main loop. Limit pointer at end - (loop body size - 1)
	push {r4}
	adds r2, r0
	subs r2, #15
	mov ip, r2
	mov r2, r1
	mov r3, r1
	mov r4, r1

	// Fall straight into loop, because cases less than (loop body + max misalignment) are handled by slide
1:
	stmia r0!, {r1, r2, r3, r4}
	cmp r0, ip
	blo 1b

	// Main loop done, now tidy up the odds and ends
	// r4 = (end - 15) - r0 + 15 = bytes remaining
	mov r4, ip
	subs r4, r0
	adds r4, #15
	// No more than 15 bytes remaining -- first test bit 3
	lsls r4, #29
	bcc 1f
	stmia r0!, {r1, r2}
1:
	lsls r4, #1        // bit 2: 4 bytes
	bcc 1f
	stmia r0!, {r1}
1:
	lsls r4, #1        // bit 1: 2 bytes
	bcc 1f
	strh r1, [r0]
	adds r0, #2
1:
	lsls r4, #1        // bit 0: 1 byte
	bcc 1f
	strb r1, [r0]
1:
	pop {r4}
	bx lr


// Fill `count` 16-bit pixels at dst with `value`.
// dst is assumed to be halfword-aligned (it is written with strh). The value
// is replicated into both halves of a word with shift/or, so bits 16 and up of
// r1 are assumed to be zero on entry -- TODO confirm against callers.
decl_func sprite_fill16
	// Slide for short fills
	cmp r2, #15
	bhi 2f
	adr r3, 1f
	lsls r2, #1        // each strh below is 2 bytes
	subs r3, r2
	adds r3, #1        // thumb bit
	bx r3
.align 2
	strh r1, [r0, #30]
	strh r1, [r0, #28]
	strh r1, [r0, #26]
	strh r1, [r0, #24]
	strh r1, [r0, #22]
	strh r1, [r0, #20]
	strh r1, [r0, #18]
	strh r1, [r0, #16]
	strh r1, [r0, #14]
	strh r1, [r0, #12]
	strh r1, [r0, #10]
	strh r1, [r0, #8]
	strh r1, [r0, #6]
	strh r1, [r0, #4]
	strh r1, [r0, #2]
	strh r1, [r0, #0]
1:
	bx lr
2:
	push {r4, r5, r6, r7, lr}
	// Get word-aligned before main fill loop.
	// The test must be on the destination pointer (bit 1 of r0), not on the
	// pixel count: the stmia stores below need a word-aligned address.
	lsrs r3, r0, #2    // carry = dst bit 1
	bcc 1f
	strh r1, [r0]
	adds r0, #2
	subs r2, #1
1:
	// Set limit pointer at end - (loop body size - 1)
	lsls r2, #1        // pixels -> bytes
	adds r2, r0
	subs r2, #26
	mov ip, r2

	// Replicate the pixel into both halfwords of every store register
	lsls r2, r1, #16
	orrs r1, r2
	mov r2, r1
	mov r3, r1
	mov r4, r1
	mov r5, r1
	mov r6, r1
	mov r7, r1
	// We can fall through because cases < 1 loop are handled by slide
1:
	stmia r0!, {r1, r2, r3, r4, r5, r6, r7} // wheeeeeeeeeee
	cmp r0, ip
	blo 1b

	// Most of the work done, we have a few more to tidy up
	// r2 = (end - 26) + 26 - r0 = bytes remaining (at most 26)
	movs r2, #26
	add r2, ip
	subs r2, r0

	lsls r2, #28       // bit 4: 16 bytes
	bcc 1f
	stmia r0!, {r4, r5, r6, r7}
1:
	lsls r2, #1        // bit 3: 8 bytes
	bcc 1f
	stmia r0!, {r4, r5}
1:
	lsls r2, #1        // bit 2: 4 bytes
	bcc 1f
	stmia r0!, {r4}
1:
	lsls r2, #1        // bit 1: one last pixel
	bcc 1f
	strh r4, [r0]
1:
	pop {r4, r5, r6, r7, pc}

// ----------------------------------------------------------------------------
// Non-AT sprite

// r0: dst
// r1: src
// r2: pixel count
//

// Unrolled loop body with an initial computed branch.

// Copy `count` bytes from src to dst, 8 per loop iteration, working from the
// end of the span back towards the start.
decl_func sprite_blit8
	mov ip, r0         // ip = dst start, used as the loop limit
	lsrs r3, r2, #3
	lsls r3, #3
	eors r2, r3 // r2 = pixels % 8, r3 = pixels - pixels % 8

	add r0, r3
	add r1, r3

	adr r3, 2f
	lsls r2, #2        // each ldrb/strb pair is 4 bytes
	subs r3, r2
	adds r3, #1 // thumb bit >:(
	bx r3

.align 2
1:
	subs r0, #8
	subs r1, #8
	ldrb r3, [r1, #7]
	strb r3, [r0, #7]
	ldrb r3, [r1, #6]
	strb r3, [r0, #6]
	ldrb r3, [r1, #5]
	strb r3, [r0, #5]
	ldrb r3, [r1, #4]
	strb r3, [r0, #4]
	ldrb r3, [r1, #3]
	strb r3, [r0, #3]
	ldrb r3, [r1, #2]
	strb r3, [r0, #2]
	ldrb r3, [r1, #1]
	strb r3, [r0, #1]
	ldrb r3, [r1, #0]
	strb r3, [r0, #0]
2:
	cmp r0, ip
	bhi 1b
	bx lr

// One pixel of sprite_blit8_alpha: load src byte \n and store it to dst only
// if bit (ALPHA_SHIFT_8BPP - 1) of the pixel is set. Clobbers r2, r3 and flags.
// Expands to 4 instructions (8 bytes); the slide arithmetic relies on this.
.macro sprite_blit8_alpha_body n
	ldrb r3, [r1, #\n]
	lsrs r2, r3, #ALPHA_SHIFT_8BPP
	bcc 2f
	strb r3, [r0, #\n]
2:
.endm

// As sprite_blit8, but pixels whose alpha bit is clear are skipped, leaving
// the destination byte untouched.
decl_func sprite_blit8_alpha
	mov ip, r0         // ip = dst start, used as the loop limit
	lsrs r3, r2, #3
	lsls r3, #3
	eors r2, r3        // r2 = pixels % 8, r3 = pixels - pixels % 8

	add r0, r3
	add r1, r3

	adr r3, 3f
	lsls r2, #3        // each macro expansion is 8 bytes
	subs r3, r2
	adds r3, #1        // thumb bit
	bx r3

.align 2
1:
	subs r0, #8
	subs r1, #8
	sprite_blit8_alpha_body 7
	sprite_blit8_alpha_body 6
	sprite_blit8_alpha_body 5
	sprite_blit8_alpha_body 4
	sprite_blit8_alpha_body 3
	sprite_blit8_alpha_body 2
	sprite_blit8_alpha_body 1
	sprite_blit8_alpha_body 0
3:
	cmp r0, ip
	bhi 1b
	bx lr


// Store the 32-bit value in \rd to [\ra + \offs] as two halfword stores (low
// half first), so that \ra only needs halfword alignment. Destroys \rd.
.macro storew_alignh rd ra offs
	strh \rd, [\ra, #\offs]
	lsrs \rd, #16
	strh \rd, [\ra, #\offs + 2]
.endm

// Copy `count` 16-bit pixels from src to dst. The source is read a word at a
// time once aligned; the destination is always written with halfword stores.
// Both pointers are assumed to be at least halfword-aligned.
decl_func sprite_blit16
	// Force source pointer to be word-aligned
	lsrs r3, r1, #2    // carry = src bit 1
	bcc 1f
	// Nothing to copy for an empty span. Without this check the alignment
	// pixel below would still be written and the count would wrap below zero.
	cmp r2, #0
	beq 3f
	ldrh r3, [r1]
	strh r3, [r0]
	adds r0, #2
	adds r1, #2
	subs r2, #1
1:
	// Each loop is 8 pixels. Place limit pointer at 16 bytes before
	// end, loop until past it. There will be 0 to 7 pixels remaining.
	lsls r2, #1
	adds r2, r0
	subs r2, #16
	mov ip, r2
	b 2f
1:
	ldmia r1!, {r2, r3}
	storew_alignh r2, r0, 0
	storew_alignh r3, r0, 4
	ldmia r1!, {r2, r3}
	storew_alignh r2, r0, 8
	storew_alignh r3, r0, 12
	adds r0, #16
2:
	cmp r0, ip
	bls 1b

	// r2 = (end - 16) - r0. Its low four bits equal those of the number of
	// bytes remaining, which is all the bit tests below look at.
	mov r2, ip
	subs r2, r0
	// At least 4 pixels?
	lsls r2, #29
	bcc 1f
	ldmia r1!, {r3}
	storew_alignh r3, r0, 0
	ldmia r1!, {r3}
	storew_alignh r3, r0, 4
	adds r0, #8
1:
	// At least 2 pixels?
	lsls r2, #1
	bcc 1f
	ldmia r1!, {r3}
	storew_alignh r3, r0, 0
	adds r0, #4
1:
	// One more pixel?
	lsls r2, #1
	bcc 1f
	ldrh r3, [r1]
	strh r3, [r0]
1:
3:
	bx lr

// One pixel of sprite_blit16_alpha: load src halfword \n and store it to dst
// only if bit (ALPHA_SHIFT_16BPP - 1) of the pixel is set. Clobbers r2, r3 and
// flags. Expands to 4 instructions (8 bytes); the slide arithmetic relies on
// this.
.macro sprite_blit16_alpha_body n
	ldrh r3, [r1, #2*\n]
	lsrs r2, r3, #ALPHA_SHIFT_16BPP
	bcc 2f
	strh r3, [r0, #2*\n]
2:
.endm

// As sprite_blit16, but pixels whose alpha bit is clear are skipped. Uses the
// same backwards 8-pixel slide as the 8bpp blits.
decl_func sprite_blit16_alpha
	mov ip, r0         // ip = dst start, used as the loop limit
	lsrs r3, r2, #3
	lsls r3, #3
	eors r2, r3        // r2 = pixels % 8, r3 = pixels - pixels % 8

	lsls r3, #1        // pixels -> bytes
	add r0, r3
	add r1, r3

	adr r3, 3f
	lsls r2, #3        // each macro expansion is 8 bytes
	subs r3, r2
	adds r3, #1        // thumb bit
	bx r3

.align 2
1:
	subs r0, #16
	subs r1, #16
	sprite_blit16_alpha_body 7
	sprite_blit16_alpha_body 6
	sprite_blit16_alpha_body 5
	sprite_blit16_alpha_body 4
	sprite_blit16_alpha_body 3
	sprite_blit16_alpha_body 2
	sprite_blit16_alpha_body 1
	sprite_blit16_alpha_body 0
3:
	cmp r0, ip
	bhi 1b
	bx lr


// ----------------------------------------------------------------------------
// Affine-transformed sprite (note these are just the inner loops -- INTERP0
// must be configured by the caller, which is presumably not written in asm)

// r0: raster start pointer
// r1: raster span size (pixels)

// In all four loops below r3 holds SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET, and
// each pixel reads CTRL_LANE0 followed by POP_FULL. The value read from
// POP_FULL is used as the address of the source pixel; the pixel is skipped if
// the OVERF bit of CTRL_LANE0 is set.
// NOTE(review): CTRL_LANE0 is read before POP_FULL, presumably so that the
// overflow flag describes the coordinate about to be popped -- confirm against
// the interpolator documentation.

// One pixel of sprite_ablit8_loop. Clobbers r1, r2 and flags.
// Expands to 6 instructions (12 bytes); the slide arithmetic relies on this.
.macro sprite_ablit8_loop_body n
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1   // carry = OVERF
	bcs 2f
	ldrb r2, [r2]
	strb r2, [r0, #\n]
2:
.endm

// Write an 8bpp span whose source pixels are fetched through INTERP0.
// Unlike the plain blits, within each group the stores go to descending
// offsets while successive POP_FULL reads supply the source addresses.
decl_func sprite_ablit8_loop
	mov ip, r0         // ip = dst start, used as the loop limit

	lsrs r2, r1, #3
	lsls r2, #3
	eors r1, r2        // r1 = pixels % 8, r2 = pixels - pixels % 8
	add r0, r2

	adr r2, 3f
	movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
	muls r1, r3
	subs r2, r1
	adds r2, #1        // thumb bit

	// r3 was needed as a temporary above, so load the base address last
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	bx r2

.align 2
	nop                // padding so that label 3 lands on a word boundary
1:
	subs r0, #8
	sprite_ablit8_loop_body 7
	sprite_ablit8_loop_body 6
	sprite_ablit8_loop_body 5
	sprite_ablit8_loop_body 4
	sprite_ablit8_loop_body 3
	sprite_ablit8_loop_body 2
	sprite_ablit8_loop_body 1
	sprite_ablit8_loop_body 0
3:
	// r0 is always a whole number of 8-byte steps above ip, so `bhi` stops
	// exactly when r0 reaches ip (same condition as the other unrolled loops).
	cmp r0, ip
	bhi 1b
	bx lr



// As above but bit 5 is assumed to be an alpha bit (RAGB2132)

// One pixel of sprite_ablit8_alpha_loop. Clobbers r1, r2 and flags.
// Expands to 8 instructions (16 bytes); the slide arithmetic relies on this.
.macro sprite_ablit8_alpha_loop_body n
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1   // carry = OVERF
	bcs 2f
	ldrb r2, [r2]
	lsrs r1, r2, #ALPHA_SHIFT_8BPP                   // carry = alpha bit
	bcc 2f
	strb r2, [r0, #\n]
2:
.endm

decl_func sprite_ablit8_alpha_loop
	mov ip, r0         // ip = dst start, used as the loop limit
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)

	lsrs r2, r1, #3
	lsls r2, #3
	eors r1, r2        // r1 = pixels % 8, r2 = pixels - pixels % 8
	add r0, r2

	adr r2, 3f
	lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
	subs r2, r1
	adds r2, #1        // thumb bit
	bx r2

.align 2
	nop                // padding so that label 3 lands on a word boundary
1:
	subs r0, #8
	sprite_ablit8_alpha_loop_body 7
	sprite_ablit8_alpha_loop_body 6
	sprite_ablit8_alpha_loop_body 5
	sprite_ablit8_alpha_loop_body 4
	sprite_ablit8_alpha_loop_body 3
	sprite_ablit8_alpha_loop_body 2
	sprite_ablit8_alpha_loop_body 1
	sprite_ablit8_alpha_loop_body 0
3:
	cmp r0, ip
	bhi 1b
	bx lr



// One pixel of sprite_ablit16_loop. Clobbers r1, r2 and flags.
// Expands to 6 instructions (12 bytes); the slide arithmetic relies on this.
.macro sprite_ablit16_loop_body n
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1   // carry = OVERF
	bcs 2f
	ldrh r2, [r2]
	strh r2, [r0, #2*\n]
2:
.endm

// 16bpp version of sprite_ablit8_loop.
decl_func sprite_ablit16_loop
	mov ip, r0         // ip = dst start, used as the loop limit

	lsrs r2, r1, #3
	lsls r2, #3
	eors r1, r2        // r1 = pixels % 8, r2 = pixels - pixels % 8
	lsls r2, #1 // Each pixel is 2 bytes
	add r0, r2

	adr r2, 3f
	movs r3, #12 // Each (non-unrolled) loop body is 12 bytes
	muls r1, r3
	subs r2, r1
	adds r2, #1        // thumb bit

	// r3 was needed as a temporary above, so load the base address last
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	bx r2

.align 2
	nop                // padding so that label 3 lands on a word boundary
1:
	subs r0, #16
	sprite_ablit16_loop_body 7
	sprite_ablit16_loop_body 6
	sprite_ablit16_loop_body 5
	sprite_ablit16_loop_body 4
	sprite_ablit16_loop_body 3
	sprite_ablit16_loop_body 2
	sprite_ablit16_loop_body 1
	sprite_ablit16_loop_body 0
3:
	// r0 is always a whole number of 16-byte steps above ip, so `bhi` stops
	// exactly when r0 reaches ip (same condition as the other unrolled loops).
	cmp r0, ip
	bhi 1b
	bx lr



// One pixel of sprite_ablit16_alpha_loop. Clobbers r1, r2 and flags.
// Expands to 8 instructions (16 bytes); the slide arithmetic relies on this.
.macro sprite_ablit16_alpha_loop_body n
	ldr r1, [r3, #CTRL0_OFFS]
	ldr r2, [r3, #POP2_OFFS]
	lsrs r1, #SIO_INTERP0_CTRL_LANE0_OVERF_LSB + 1   // carry = OVERF
	bcs 2f
	ldrh r2, [r2]
	lsrs r1, r2, #ALPHA_SHIFT_16BPP                  // carry = alpha bit
	bcc 2f
	strh r2, [r0, #2*\n]
2:
.endm

// 16bpp version of sprite_ablit8_alpha_loop.
decl_func sprite_ablit16_alpha_loop
	mov ip, r0         // ip = dst start, used as the loop limit
	ldr r3, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)

	lsrs r2, r1, #3
	lsls r2, #3
	eors r1, r2        // r1 = pixels % 8, r2 = pixels - pixels % 8
	lsls r2, #1 // Each pixel is 2 bytes
	add r0, r2

	adr r2, 3f
	lsls r1, #4 // Each (non-unrolled) loop body is 16 bytes
	subs r2, r1
	adds r2, #1        // thumb bit
	bx r2

.align 2
	nop                // padding so that label 3 lands on a word boundary
1:
	subs r0, #16
	sprite_ablit16_alpha_loop_body 7
	sprite_ablit16_alpha_loop_body 6
	sprite_ablit16_alpha_loop_body 5
	sprite_ablit16_alpha_loop_body 4
	sprite_ablit16_alpha_loop_body 3
	sprite_ablit16_alpha_loop_body 2
	sprite_ablit16_alpha_loop_body 1
	sprite_ablit16_alpha_loop_body 0
3:
	cmp r0, ip
	bhi 1b
	bx lr