tmds_encode.S
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sio.h"
#include "dvi_config_defs.h"

// Offsets suitable for ldr/str (must be <= 0x7c):
// All offsets are relative to INTERP0's ACCUM0 register, so one base register
// can address every interpolator register used in this file.
#define ACCUM0_OFFS     (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_OFFS     (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
#define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK0_OFFS      (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK1_OFFS      (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
#define PEEK2_OFFS      (SIO_INTERP0_PEEK_FULL_OFFSET  - SIO_INTERP0_ACCUM0_OFFSET)
// Add INTERP1 to any of the offsets above to address the same register in
// the second interpolator.
#define INTERP1         (SIO_INTERP1_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
// Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
// word-addressed space... almost as though it were intentional! :)

.syntax unified
.cpu cortex-m0plus
.thumb

// Open a global Thumb function called \name in its own .scratch_x.* section.
.macro decl_func_x name
.section .scratch_x.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm

// Open a global Thumb function called \name in its own .scratch_y.* section.
.macro decl_func_y name
.section .scratch_y.\name, "ax"
.global \name
.type \name,%function
.thumb_func
\name:
.endm

// Functions declared with plain decl_func go into the scratch_x sections.
#define decl_func decl_func_x

// ----------------------------------------------------------------------------
// Pixel-doubling encoders for RGB

// Pass one input word through the interpolator at \r_ibase and fetch both
// lane results.
//
// r_ibase:  base address of the interpolator registers (ACCUM0)
// r_inout0: in:  word written to ACCUM0
//           out: word loaded from the address read back from PEEK_LANE0
// r_out1:   out: word loaded from the address read back from PEEK_LANE1
//
// The PEEK results are used directly as load addresses, so the interpolator
// must already be set up by the caller to produce valid pointers (presumably
// into a TMDS symbol lookup table -- confirm against the C-side setup).
.macro do_channel_16bpp r_ibase r_inout0 r_out1
	str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
	ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
	ldr \r_inout0, [\r_inout0]
	ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
	ldr \r_out1, [\r_out1]
.endm

// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
//
// Each unrolled step reads two input words and writes four output words.
// The loop exits only when the output pointer is exactly equal to the limit
// (r1 + 4 * r2), so r2 must be a multiple of 4 * TMDS_ENCODE_UNROLL.
// A size of zero returns immediately.

decl_func tmds_encode_loop_16bpp
	push {r4, r5, r6, r7, lr}
	// Limit pointer: 4 output bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4, r6}
	do_channel_16bpp r2, r4, r5
	do_channel_16bpp r2, r6, r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bne 1b
	pop {r4, r5, r6, r7, pc}

// Same as above, but scale data to make up for lack of left shift
// in interpolator (costs 1 cycle per 2 pixels)
//
// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Every input word is shifted left by r3 before it is written to the
// interpolator. The same size constraint applies: r2 must be a multiple of
// 4 * TMDS_ENCODE_UNROLL.

decl_func tmds_encode_loop_16bpp_leftshift
	push {r4, r5, r6, r7, lr}
	// Limit pointer: 4 output bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4, r6}
	lsls r4, r3
	do_channel_16bpp r2, r4, r5
	lsls r6, r3
	do_channel_16bpp r2, r6, r7
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bne 1b
	pop {r4, r5, r6, r7, pc}

// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
//
// Each unrolled step reads one input word, writes it to ACCUM0 of both
// interpolators, and stores four words fetched through PEEK_LANE0/1 of
// INTERP0 followed by PEEK_LANE0/1 of INTERP1.
// The loop exits only on exact equality with the limit (r1 + 4 * r2), so r2
// must be a multiple of 4 * TMDS_ENCODE_UNROLL.

decl_func tmds_encode_loop_8bpp
	push {r4, r5, r6, r7, lr}
	// Limit pointer: 4 output bytes per input pixel
	lsls r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	str r4, [r2, #ACCUM0_OFFS + INTERP1]
	str r4, [r2, #ACCUM0_OFFS]
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bne 1b
	pop {r4, r5, r6, r7, pc}

// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Input size (pixels)
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
//
// As with tmds_encode_loop_8bpp, r2 must be a multiple of
// 4 * TMDS_ENCODE_UNROLL because the loop exits only on exact equality.

decl_func tmds_encode_loop_8bpp_leftshift
	push {r4, r5, r6, r7, lr}
	// Limit pointer: 4 output bytes per input pixel. Each step below reads
	// one input word and stores four output words, exactly like
	// tmds_encode_loop_8bpp, so the scale factor must match it. This was
	// previously a shift by 3, which put the limit at twice the documented
	// size and made the loop read and write twice as much as requested.
	// NOTE(review): confirm no caller was compensating by passing half the
	// pixel count.
	lsls r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	// INTERP1 gets the unshifted word; INTERP0 gets the shifted one
	str r4, [r2, #ACCUM0_OFFS + INTERP1]
	lsls r4, r3
	str r4, [r2, #ACCUM0_OFFS]
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4, r5, r6, r7}
.endr
2:
	cmp r1, ip
	bne 1b
	pop {r4, r5, r6, r7, pc}

// ----------------------------------------------------------------------------
// Fast 1bpp black/white encoder (full res)

// Taking the encoder from DVI spec, with initial balance 0:
//
// - Encoding either 0x00 or 0xff will produce a running balance of -8, with
//   output symbol of 0x100 or 0x200
//
// - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
//   output symbol of 0x1ff or 0x2ff
//
// So we can do 1bpp encode with a lookup of x coordinate LSB, and input
// colour bit. If we process pixels in even-sized blocks, only the colour
// lookup is needed.
// Encode 8 pixels @ 1bpp (using two table lookups)
// r2 holds the current input word
// r3 contains lookup mask (preshifted)
// r8 contains pointer to encode table
// 2.125 cyc/pix
//
// Each lookup extracts a masked, pre-scaled table offset from r2 using the
// given shift instruction and amount, adds the table base, and loads the two
// words of that entry. The four loaded words are then stored through r1.
// Clobbers r4-r7 and the flags.
.macro tmds_encode_1bpp_body op_a sh_a op_b sh_b
	// First lookup -> r4, r5
	\op_a r4, r2, #\sh_a
	ands r4, r4, r3
	add r4, r8
	ldm r4, {r4, r5}
	// Second lookup -> r6, r7
	\op_b r6, r2, #\sh_b
	ands r6, r6, r3
	add r6, r8
	ldm r6, {r6, r7}
	stm r1!, {r4, r5, r6, r7}
.endm

// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
//
// One input word (32 pixels) is consumed per loop iteration. The loop runs
// while the output pointer is below the limit r1 + 2 * r2.
decl_func tmds_encode_1bpp
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved; stash it via a low register
	mov r7, r8
	push {r7}
	// Limit pointer: 2 output bytes per pixel
	lsls r2, r2, #1
	add r2, r1
	mov ip, r2
	adr r4, tmds_1bpp_table
	mov r8, r4
	// Mask: 4 bit index, 8 bytes per entry
	movs r3, #0x78
	b .Ltmds_encode_1bpp_check
.Ltmds_encode_1bpp_loop:
	ldm r0!, {r2}
#if !DVI_1BPP_BIT_REVERSE
	tmds_encode_1bpp_body lsls 3  lsrs 1
	tmds_encode_1bpp_body lsrs 5  lsrs 9
	tmds_encode_1bpp_body lsrs 13 lsrs 17
	tmds_encode_1bpp_body lsrs 21 lsrs 25
#else
	// Same nibbles, but the two nibbles of each byte are taken in the
	// opposite order (and the table below is permuted to match)
	tmds_encode_1bpp_body lsrs 1  lsls 3
	tmds_encode_1bpp_body lsrs 9  lsrs 5
	tmds_encode_1bpp_body lsrs 17 lsrs 13
	tmds_encode_1bpp_body lsrs 25 lsrs 21
#endif
.Ltmds_encode_1bpp_check:
	cmp r1, ip
	blo .Ltmds_encode_1bpp_loop

	pop {r7}
	mov r8, r7
	pop {r4, r5, r6, r7, pc}

// 16 entries of two words each, indexed by a 4-bit pixel group.
.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
	.word 0x7fd00, 0x7fd00 // 0000
	.word 0x7fe00, 0x7fd00 // 0001
	.word 0xbfd00, 0x7fd00 // 0010
	.word 0xbfe00, 0x7fd00 // 0011
	.word 0x7fd00, 0x7fe00 // 0100
	.word 0x7fe00, 0x7fe00 // 0101
	.word 0xbfd00, 0x7fe00 // 0110
	.word 0xbfe00, 0x7fe00 // 0111
	.word 0x7fd00, 0xbfd00 // 1000
	.word 0x7fe00, 0xbfd00 // 1001
	.word 0xbfd00, 0xbfd00 // 1010
	.word 0xbfe00, 0xbfd00 // 1011
	.word 0x7fd00, 0xbfe00 // 1100
	.word 0x7fe00, 0xbfe00 // 1101
	.word 0xbfd00, 0xbfe00 // 1110
	.word 0xbfe00, 0xbfe00 // 1111
#else
	.word 0x7fd00, 0x7fd00 // 0000
	.word 0x7fd00, 0xbfd00 // 1000
	.word 0x7fd00, 0x7fe00 // 0100
	.word 0x7fd00, 0xbfe00 // 1100
	.word 0xbfd00, 0x7fd00 // 0010
	.word 0xbfd00, 0xbfd00 // 1010
	.word 0xbfd00, 0x7fe00 // 0110
	.word 0xbfd00, 0xbfe00 // 1110
	.word 0x7fe00, 0x7fd00 // 0001
	.word 0x7fe00, 0xbfd00 // 1001
	.word 0x7fe00, 0x7fe00 // 0101
	.word 0x7fe00, 0xbfe00 // 1101
	.word 0xbfe00, 0x7fd00 // 0011
	.word 0xbfe00, 0xbfd00 // 1011
	.word 0xbfe00, 0x7fe00 // 0111
	.word 0xbfe00, 0xbfe00 // 1111
#endif


// ----------------------------------------------------------------------------
// Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)

// Even-x-position pixels are encoded as symbols with imbalance -4, and odd
// pixels with +4, so that we can mix-and-match our even/odd codewords and
// always get a properly balanced sequence:
//
// level 0: (05 -> 103), then (04 -> 1fc)  (decimal 5, 4)
// level 1: (50 -> 130), then (51 -> 1cf)  (decimal 80, 81)
// level 2: (af -> 230), then (ae -> 2cf)  (decimal 175, 174)
// level 3: (fa -> 203), then (fb -> 2fc)  (decimal 250, 251)
//
// These correspond to roughly 255 times (0, 1/3, 2/3, 1).
//
// Alternatively we could use symbols with 0 balance, which results in lower
// contrast but avoids the LSB bobble:
//
// level 0: (10 -> 1f0) always
// level 1: (5a -> 263) always
// level 2: (a5 -> 163) always
// level 3: (ef -> 2f0) always

// Table base pointer in r0. Input pixels in r2. Lookup mask in r3.
// Shifts r2 with the given instruction and amount, masks the result into a
// byte offset, and loads the table word at that offset into \dst.
.macro encode_2bpp_body op amount dst
	\op \dst, r2, #\amount
	ands \dst, \dst, r3
	ldr \dst, [r0, \dst]
.endm

// r0: input buffer (word-aligned)
// r1: output buffer (word-aligned)
// r2: output pixel count
//
// One input word (16 pixels) is consumed and eight output words are written
// per loop iteration. The loop runs while the output pointer is below the
// limit r1 + 2 * r2.
decl_func tmds_encode_2bpp
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved; stash it via a low register
	mov r7, r8
	push {r7}
	// The input pointer lives in r8, because r0 is needed as the table base
	// for the register-offset loads in encode_2bpp_body
	mov r8, r0
	adr r0, tmds_2bpp_table
	// Mask: 4-bit index into 4-byte entries.
	movs r3, #0x3c
	// Limit pointer: 1 word per 2 pixels
	lsls r2, r2, #1
	add r2, r1
	mov ip, r2
	b .Ltmds_encode_2bpp_check
.Ltmds_encode_2bpp_loop:
	// Fetch the next input word through r8 (post-incremented by 4)
	mov r4, r8
	ldm r4!, {r2}
	mov r8, r4
	// Low half of the input word
	encode_2bpp_body lsls 2  r4
	encode_2bpp_body lsrs 2  r5
	encode_2bpp_body lsrs 6  r6
	encode_2bpp_body lsrs 10 r7
	stm r1!, {r4, r5, r6, r7}
	// High half of the input word
	encode_2bpp_body lsrs 14 r4
	encode_2bpp_body lsrs 18 r5
	encode_2bpp_body lsrs 22 r6
	encode_2bpp_body lsrs 26 r7
	stm r1!, {r4, r5, r6, r7}
.Ltmds_encode_2bpp_check:
	cmp r1, ip
	blo .Ltmds_encode_2bpp_loop
	pop {r7}
	mov r8, r7
	pop {r4, r5, r6, r7, pc}

// 16 single-word entries, indexed by a pair of 2-bit pixels.
.align 2
tmds_2bpp_table:
	.word 0x7f103 // 00, 00
	.word 0x7f130 // 01, 00
	.word 0x7f230 // 10, 00
	.word 0x7f203 // 11, 00
	.word 0x73d03 // 00, 01
	.word 0x73d30 // 01, 01
	.word 0x73e30 // 10, 01
	.word 0x73e03 // 11, 01
	.word 0xb3d03 // 00, 10
	.word 0xb3d30 // 01, 10
	.word 0xb3e30 // 10, 10
	.word 0xb3e03 // 11, 10
	.word 0xbf103 // 00, 11
	.word 0xbf130 // 01, 11
	.word 0xbf230 // 10, 11
	.word 0xbf203 // 11, 11

// ----------------------------------------------------------------------------
// Full-resolution RGB encode (not very practical)

// Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
// taking horizontal blanking (at VGA) and dual core into account, and
// assuming the 3 channels are encoded individually.)
//
// Here is an idea
// Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
// ACCUM0), concatenated with the sign bit of our running disparity (from
// ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
// with the symbol's disparity stored left-justified in the upper 12 bits, as
// e.g. a 6 bit signed integer.
//
// - Load pixel data.                    cyc: 0.75 (ldmia 2 words, every 4 pixels)
// - Write pixel to ACCUM0.              cyc: 1
// - Read address from PEEK2.            cyc: 1
// - Load encoded pixel from address.    cyc: 2
// - Write disparity data to ACCUM1_ADD  cyc: 1
// - Write encoded data to output buffer. cyc: 1.25 (stmia 4 words, every 4 pixels)
//
// With decent register allocation we may be able to load 4 pixels at
// once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
//
// One issue is that the TMDS data in the bottom of ACCUM1 will eventually
// overflow and affect the running disparity, but with 16 zeroes in between,
// this would take much longer than one scanline, so everything is fine if
// we clear the accumulator at the start of the scanline.
//
// Note that we need to use two interpolators to get the bits from both pixels
// -- we are not outputting a single DC-balanced stream, but rather two
// interleaved streams which are each DC-balanced. This is fine electrically,
// but our output here will *NOT* match the TMDS encoder given in the DVI
// spec.

// You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
// feedback. With the feedback enabled (default), the output is DC balanced,
// but there are just barely enough CPU cycles to do all the encode, so it's
// essentially a party trick. If you disable DC balancing, the performance is
// much better, and many monitors will still accept the signals as long as you
// DC couple your DVI signals.
// Encode one input word into two output words.
//
// in_out: in:  input word, written to ACCUM0 of both interpolators
//         out: word loaded from the address given by INTERP0 PEEK_FULL
// out1:   out: word loaded from the address given by INTERP1 PEEK_FULL
// r2:     interpolator base pointer
//
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each loaded word is also written
// to the ACCUM1_ADD register of the interpolator it came from.
.macro tmds_fullres_encode_loop_body in_out out1
	str \in_out, [r2, #ACCUM0_OFFS + INTERP1]
	str \in_out, [r2, #ACCUM0_OFFS]
	// INTERP0 result
	ldr \in_out, [r2, #PEEK2_OFFS]
	ldr \in_out, [\in_out]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \in_out, [r2, #ACCUM1_ADD_OFFS]
#endif
	// INTERP1 result
	ldr \out1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \out1, [\out1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \out1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm

// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
//
// The loop body is unrolled 16 times; each step reads two input words and
// writes four output words. The loop exits only when the output pointer is
// exactly equal to the limit (r1 + 4 * r2), so the pixel count must be a
// multiple of 64. A count of zero returns immediately.

.macro tmds_fullres_encode_loop_16bpp
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved; stash it via a low register
	mov r4, r8
	push {r4}

	// Limit pointer: 4 output bytes per pixel
	lsls r2, r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if no feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	// Set bit 0 of the target so that bx stays in Thumb state
	adds r4, #1
	mov r8, r4
	b 2f
.align 2
1:	// Loop start
.rept 16
	ldm r0!, {r4, r6}
	tmds_fullres_encode_loop_body r4 r5
	tmds_fullres_encode_loop_body r6 r7
	stm r1!, {r4, r5, r6, r7}
.endr
2:	// Loop condition
	cmp r1, ip
	beq 3f
	bx r8
3:	// Done
	pop {r4}
	mov r8, r4
	pop {r4, r5, r6, r7, pc}
.endm

// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
	tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
	tmds_fullres_encode_loop_16bpp


// Same as tmds_fullres_encode_loop_body, with the input word shifted left by
// r3 before it is written to INTERP0.
//
// in_out: in:  input word
//         out: word loaded from the address given by INTERP0 PEEK_FULL
// out1:   out: word loaded from the address given by INTERP1 PEEK_FULL
// r2:     interpolator base pointer
// r3:     left shift amount
.macro tmds_fullres_encode_loop_body_leftshift in_out out1
	// Note we apply the leftshift for INTERP0 only
	str \in_out, [r2, #ACCUM0_OFFS + INTERP1]
	lsls \in_out, \in_out, r3
	str \in_out, [r2, #ACCUM0_OFFS]
	// INTERP0 result
	ldr \in_out, [r2, #PEEK2_OFFS]
	ldr \in_out, [\in_out]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \in_out, [r2, #ACCUM1_ADD_OFFS]
#endif
	// INTERP1 result
	ldr \out1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \out1, [\out1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \out1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm

// r0: Input buffer (word-aligned)
// r1: Output buffer (word-aligned)
// r2: Pixel count
// r3: Left shift amount
//
// Same loop structure and size constraint as tmds_fullres_encode_loop_16bpp:
// the pixel count must be a multiple of 64.

.macro tmds_fullres_encode_loop_16bpp_leftshift
	push {r4, r5, r6, r7, lr}
	// Save r8 and r9 via low registers
	mov r4, r8
	mov r5, r9
	push {r4, r5}

	// Limit pointer: 4 output bytes per pixel
	lsls r2, r2, #2
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer (with bit 0 set for bx) in r8
	adr r4, 1f
	adds r4, #1
	mov r8, r4
	b 2f
.align 2
1:	// Loop start
.rept 16 // 64 pixels per iteration
	ldm r0!, {r4, r6}
	tmds_fullres_encode_loop_body_leftshift r4 r5
	tmds_fullres_encode_loop_body_leftshift r6 r7
	stm r1!, {r4, r5, r6, r7}
.endr
2:	// Loop condition
	cmp r1, ip
	beq 3f
	bx r8
3:	// Done
	pop {r4, r5}
	mov r8, r4
	mov r9, r5
	pop {r4, r5, r6, r7, pc}
.endm

// One copy each in X and Y, as for the non-leftshift variant
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
	tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
	tmds_fullres_encode_loop_16bpp_leftshift


// ----------------------------------------------------------------------------
// Full-resolution 8bpp paletted encode

// Variant of tmds_fullres_encode_loop_16bpp that reads
// 8-bit wide pixels packed 4 per word. The interpolator
// base is set to a reordered list of TMDS symbols based
// on a user colour palette.

// Two pixels input in rd[17:2]. Two symbols output in rd[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
.macro tmds_palette_encode_loop_body px
	// Feed the same word to both interpolators
	str \px, [r2, #ACCUM0_OFFS]
	str \px, [r2, #ACCUM0_OFFS + INTERP1]
	// INTERP0 result -> \px
	ldr \px, [r2, #PEEK2_OFFS]
	ldr \px, [\px]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \px, [r2, #ACCUM1_ADD_OFFS]
#endif
	// INTERP1 result -> r7
	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
	ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
	// Merge: INTERP1 result shifted up by 10 bits, ORed onto INTERP0 result
	lsls r7, r7, #10
	orrs \px, \px, r7
.endm

// r0: input pointer, advanced two words per unrolled step
// r1: output pointer, advanced four words per unrolled step
// r2: count; the output limit is r1 + 2 * r2 bytes (presumably a pixel
//     count, as for the other encoders -- confirm against the caller)
//
// The body is unrolled 10 times and the loop exits only when the output
// pointer is exactly equal to the limit, so r2 must be a multiple of 80.
// A count of zero returns immediately. r3 is used as scratch.
.macro tmds_palette_encode_loop
	push {r4, r5, r6, r7, lr}
	// r8 is callee-saved; stash it via a low register
	mov r4, r8
	push {r4}

	// Limit pointer: 2 output bytes per unit of r2
	lsls r2, r2, #1
	add r2, r1
	mov ip, r2
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	// Set bit 0 of the target so that bx stays in Thumb state
	adds r4, #1
	mov r8, r4
	b 2f
.align 2
1:	// Loop start
.rept 10
	ldm r0!, {r3, r5}
	// Split each input word into two halves, each positioned at bits [17:2]:
	// the upper half goes to r4/r6, the lower half stays in r3/r5
	lsrs r4, r3, #14
	lsls r3, r3, #2
	lsrs r6, r5, #14
	lsls r5, r5, #2
	tmds_palette_encode_loop_body r3
	tmds_palette_encode_loop_body r4
	tmds_palette_encode_loop_body r5
	tmds_palette_encode_loop_body r6
	stm r1!, {r3, r4, r5, r6}
.endr
2:	// Loop condition
	cmp r1, ip
	beq 3f
	bx r8
3:	// Done
	pop {r4}
	mov r8, r4
	pop {r4, r5, r6, r7, pc}
.endm

// One copy each in the scratch_x and scratch_y sections
decl_func_x tmds_palette_encode_loop_x
	tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
	tmds_palette_encode_loop