Cradicle Explorer

tmds_encode.S
  1  #include "hardware/regs/addressmap.h"
  2  #include "hardware/regs/sio.h"
  3  #include "dvi_config_defs.h"
  4  
  5  // Offsets suitable for ldr/str (must be <= 0x7c):
  6  #define ACCUM0_OFFS     (SIO_INTERP0_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
  7  #define ACCUM1_OFFS     (SIO_INTERP0_ACCUM1_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
  8  #define ACCUM1_ADD_OFFS (SIO_INTERP0_ACCUM1_ADD_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
  9  #define PEEK0_OFFS      (SIO_INTERP0_PEEK_LANE0_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 10  #define PEEK1_OFFS      (SIO_INTERP0_PEEK_LANE1_OFFSET - SIO_INTERP0_ACCUM0_OFFSET)
 11  #define PEEK2_OFFS      (SIO_INTERP0_PEEK_FULL_OFFSET  - SIO_INTERP0_ACCUM0_OFFSET)
 12  #define INTERP1         (SIO_INTERP1_ACCUM0_OFFSET     - SIO_INTERP0_ACCUM0_OFFSET)
 13  // Note the entirety of INTERP0 and INTERP1 fits inside this 5-bit
 14  // word-addressed space... almost as though it were intentional! :)
 15  
 16  .syntax unified
 17  .cpu cortex-m0plus
 18  .thumb
 19  
 20  .macro decl_func_x name
 21  .section .scratch_x.\name, "ax"
 22  .global \name
 23  .type \name,%function
 24  .thumb_func
 25  \name:
 26  .endm
 27  
 28  .macro decl_func_y name
 29  .section .scratch_y.\name, "ax"
 30  .global \name
 31  .type \name,%function
 32  .thumb_func
 33  \name:
 34  .endm
 35  
 36  #define decl_func decl_func_x
 37  
 38  // ----------------------------------------------------------------------------
 39  // Pixel-doubling encoders for RGB
 40  
 41  // r0: Input buffer (word-aligned)
 42  // r1: Output buffer (word-aligned)
 43  // r2: Input size (pixels)
 44  
 45  .macro do_channel_16bpp r_ibase r_inout0 r_out1
 46  	str \r_inout0, [\r_ibase, #ACCUM0_OFFS]
 47  	ldr \r_inout0, [\r_ibase, #PEEK0_OFFS]
 48  	ldr \r_inout0, [\r_inout0]
 49  	ldr \r_out1, [\r_ibase, #PEEK1_OFFS]
 50  	ldr \r_out1, [\r_out1]
 51  .endm
 52  
 53  decl_func tmds_encode_loop_16bpp
 54  	push {r4, r5, r6, r7, lr}
 55  	lsls r2, #2
 56  	add r2, r1
 57  	mov ip, r2
 58  	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 59  	b 2f
 60  .align 2
 61  1:
 62  .rept TMDS_ENCODE_UNROLL
 63  	ldmia r0!, {r4, r6}
 64  	do_channel_16bpp r2, r4, r5
 65  	do_channel_16bpp r2, r6, r7
 66  	stmia r1!, {r4, r5, r6, r7}
 67  .endr
 68  2:
 69  	cmp r1, ip
 70  	bne 1b
 71  	pop {r4, r5, r6, r7, pc}
 72  
 73  // Same as above, but scale data to make up for lack of left shift
 74  // in interpolator (costs 1 cycle per 2 pixels)
 75  //
 76  // r0: Input buffer (word-aligned)
 77  // r1: Output buffer (word-aligned)
 78  // r2: Input size (pixels)
 79  // r3: Left shift amount
 80  
 81  decl_func tmds_encode_loop_16bpp_leftshift
 82  	push {r4, r5, r6, r7, lr}
 83  	lsls r2, #2
 84  	add r2, r1
 85  	mov ip, r2
 86  	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
 87  	b 2f
 88  .align 2
 89  1:
 90  .rept TMDS_ENCODE_UNROLL
 91  	ldmia r0!, {r4, r6}
 92  	lsls r4, r3
 93  	do_channel_16bpp r2, r4, r5
 94  	lsls r6, r3
 95  	do_channel_16bpp r2, r6, r7
 96  	stmia r1!, {r4, r5, r6, r7}
 97  .endr
 98  2:
 99  	cmp r1, ip
100  	bne 1b
101  	pop {r4, r5, r6, r7, pc}
102  
// Pixel-doubling encode of one channel from 8bpp input.
//
// r0: Input buffer (word-aligned); one word is consumed per step
// r1: Output buffer (word-aligned); four words are produced per step
// r2: Input size (pixels); encoding stops once r1 reaches r1 + 4 * r2
//
// Every input word is written to ACCUM0 of both interpolators. The four
// output words are loaded from the addresses returned by INTERP0 PEEK0/PEEK1
// and then INTERP1 PEEK0/PEEK1, in that order.
//
// r0 and r1 are left pointing past the data processed; r2, ip and the flags
// are clobbered. The output size should be a multiple of
// 16 * TMDS_ENCODE_UNROLL bytes; the unsigned exit test bounds the overshoot
// to one unrolled block if it is not.

decl_func tmds_encode_loop_8bpp
	push {r4-r7, lr}
	lsls r2, #2             // 4 output bytes per input pixel
	add r2, r1
	mov ip, r2              // ip = output end pointer
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	str r4, [r2, #ACCUM0_OFFS + INTERP1]
	str r4, [r2, #ACCUM0_OFFS]
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4-r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
134  
// Pixel-doubling encode of one channel from 8bpp input, with a left shift
// applied to the data sent to INTERP0.
//
// r0: Input buffer (word-aligned); one word is consumed per step
// r1: Output buffer (word-aligned); four words are produced per step
// r2: Input size (pixels); encoding stops once r1 reaches r1 + 4 * r2
// r3: Left shift amount
//
// Note that only the data written to interp0 (pixel 0, 1) is leftshifted, not
// the data written to interp1 (pixel 2, 3). Otherwise we always lose MSBs, as
// the LUT offset MSB is at bit 8, so pixel 0 always requires some left shift,
// since its channel MSBs are no greater than 7.
//
// r0 and r1 are left pointing past the data processed; r2, ip and the flags
// are clobbered. The output size should be a multiple of
// 16 * TMDS_ENCODE_UNROLL bytes; the unsigned exit test bounds the overshoot
// to one unrolled block if it is not.

decl_func tmds_encode_loop_8bpp_leftshift
	push {r4-r7, lr}
	// One input word holds 4 pixels and yields 4 output words, i.e. 4 output
	// bytes per input pixel -- the same scaling as tmds_encode_loop_8bpp.
	lsls r2, #2
	add r2, r1
	mov ip, r2              // ip = output end pointer
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	b 2f
.align 2
1:
.rept TMDS_ENCODE_UNROLL
	ldmia r0!, {r4}
	str r4, [r2, #ACCUM0_OFFS + INTERP1]    // interp1 gets the unshifted word
	lsls r4, r3
	str r4, [r2, #ACCUM0_OFFS]              // interp0 gets the shifted word
	ldr r4, [r2, #PEEK0_OFFS]
	ldr r4, [r4]
	ldr r5, [r2, #PEEK1_OFFS]
	ldr r5, [r5]
	ldr r6, [r2, #PEEK0_OFFS + INTERP1]
	ldr r6, [r6]
	ldr r7, [r2, #PEEK1_OFFS + INTERP1]
	ldr r7, [r7]
	stmia r1!, {r4-r7}
.endr
2:
	cmp r1, ip
	blo 1b
	pop {r4-r7, pc}
173  
174  // ----------------------------------------------------------------------------
175  // Fast 1bpp black/white encoder (full res)
176  
177  // Taking the encoder from DVI spec, with initial balance 0:
178  // 
179  // - Encoding either 0x00 or 0xff will produce a running balance of -8, with
180  //   output symbol of 0x100 or 0x200
181  // 
182  // - Subsequently encoding either 0x01 or 0xfe will return the balance to 0, with
183  //  output symbol of 0x1ff or 0x2ff
184  // 
185  // So we can do 1bpp encode with a lookup of x coordinate LSB, and input
186  // colour bit. If we process pixels in even-sized blocks, only the colour
187  // lookup is needed.
188  
// Encode 8 pixels @ 1bpp (using two table lookups)
//   r1: output pointer, advanced by four words
//   r2: current input word
//   r3: lookup mask (preshifted)
//   r8: pointer to encode table
// Each (\op_x, \amt_x) pair is the shift that lines one 4-pixel group of r2 up
// with the mask in r3. The first pair produces the first two output words,
// the second pair the last two. Clobbers r4-r7 and the flags.
// 2.125 cyc/pix
.macro tmds_encode_1bpp_body op_a amt_a op_b amt_b
	// Table byte offsets for both groups
	\op_a r4, r2, #\amt_a
	\op_b r6, r2, #\amt_b
	ands r4, r3
	ands r6, r3
	// Turn the offsets into entry addresses
	add r4, r8
	add r6, r8
	// Each entry is two words
	ldmia r4, {r4, r5}
	ldmia r6, {r6, r7}
	stmia r1!, {r4-r7}
.endm
204  
// Full-resolution 1bpp encode.
//
// r0: input buffer (word-aligned); one word is consumed per loop iteration
// r1: output buffer (word-aligned); 16 words are produced per loop iteration
// r2: output pixel count; encoding continues while r1 is below r1 + 2 * r2
//
// DVI_1BPP_BIT_REVERSE selects the order in which the 4-bit groups of each
// input word are looked up (and the matching layout of tmds_1bpp_table).
// r8 is preserved; r2, r3, ip and the flags are clobbered.
decl_func tmds_encode_1bpp
	push {r4-r7, lr}
	mov r4, r8
	push {r4}               // r8 is callee-saved and is used for the table pointer
	adr r4, tmds_1bpp_table
	mov r8, r4
	// Mask: 4 bit index, 8 bytes per entry
	movs r3, #0x78
	lsls r2, #1             // 2 output bytes per pixel
	add r2, r1
	mov ip, r2              // ip = output end pointer
	b 2f
1:
	ldmia r0!, {r2}
#if !DVI_1BPP_BIT_REVERSE
	tmds_encode_1bpp_body lsls 3  lsrs 1
	tmds_encode_1bpp_body lsrs 5  lsrs 9
	tmds_encode_1bpp_body lsrs 13 lsrs 17
	tmds_encode_1bpp_body lsrs 21 lsrs 25
#else
	tmds_encode_1bpp_body lsrs 1   lsls 3
	tmds_encode_1bpp_body lsrs 9   lsrs 5
	tmds_encode_1bpp_body lsrs 17  lsrs 13
	tmds_encode_1bpp_body lsrs 25  lsrs 21
#endif
2:
	cmp r1, ip
	blo 1b

	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
240  
// Lookup table for tmds_encode_1bpp: 16 entries of two words, indexed by a
// 4-bit group of input pixels. Each word packs two 10-bit symbols, the first
// pixel's in bits 9:0 and the second pixel's in bits 19:10:
//   first pixel of a word:  bit 0 -> 0x100, bit 1 -> 0x200
//   second pixel of a word: bit 0 -> 0x1ff, bit 1 -> 0x2ff
// TMDS_1BPP_WORD(a, b) builds the word for pixel bits a (first) and b (second).
// The trailing comment on each row gives the pixel bits with the first pixel
// as the rightmost digit.
#define TMDS_1BPP_WORD(first_bit, second_bit) \
	(((0x1ff + ((second_bit) << 8)) << 10) | (0x100 << (first_bit)))

.align 2
tmds_1bpp_table:
#if !DVI_1BPP_BIT_REVERSE
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(0, 0)  // 0000
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(0, 0)  // 0001
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(0, 0)  // 0010
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(0, 0)  // 0011
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(1, 0)  // 0100
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(1, 0)  // 0101
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(1, 0)  // 0110
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(1, 0)  // 0111
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(0, 1)  // 1000
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(0, 1)  // 1001
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(0, 1)  // 1010
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(0, 1)  // 1011
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(1, 1)  // 1100
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(1, 1)  // 1101
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(1, 1)  // 1110
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(1, 1)  // 1111
#else
	// Same entries, stored at the bit-reversed index
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(0, 0)  // 0000
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(0, 1)  // 1000
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(1, 0)  // 0100
	.word TMDS_1BPP_WORD(0, 0), TMDS_1BPP_WORD(1, 1)  // 1100
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(0, 0)  // 0010
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(0, 1)  // 1010
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(1, 0)  // 0110
	.word TMDS_1BPP_WORD(0, 1), TMDS_1BPP_WORD(1, 1)  // 1110
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(0, 0)  // 0001
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(0, 1)  // 1001
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(1, 0)  // 0101
	.word TMDS_1BPP_WORD(1, 0), TMDS_1BPP_WORD(1, 1)  // 1101
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(0, 0)  // 0011
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(0, 1)  // 1011
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(1, 0)  // 0111
	.word TMDS_1BPP_WORD(1, 1), TMDS_1BPP_WORD(1, 1)  // 1111
#endif
278  
279  
280  // ----------------------------------------------------------------------------
281  // Full-resolution 2bpp encode (for 2bpp grayscale, or bitplaned RGB222)
282  
283  // Even-x-position pixels are encoded as symbols with imbalance -4, and odd
284  // pixels with +4, so that we can mix-and-match our even/odd codewords and
285  // always get a properly balanced sequence:
286  //
287  // level 0: (05 -> 103), then (04 -> 1fc)  (decimal 5, 4)
288  // level 1: (50 -> 130), then (51 -> 1cf)  (decimal 80, 81)
289  // level 2: (af -> 230), then (ae -> 2cf)  (decimal 175, 174)
290  // level 3: (fa -> 203), then (fb -> 2fc)  (decimal 250, 251)
291  //
292  // These correspond to roughly 255 times (0, 1/3, 2/3, 1).
293  //
294  // Alternatively we could use symbols with 0 balance, which results in lower
295  // contrast but avoids the LSB bobble:
296  //
297  // level 0: (10 -> 1f0) always
298  // level 1: (5a -> 263) always
299  // level 2: (a5 -> 163) always
300  // level 3: (ef -> 2f0) always
301  
// Look up the output word for one 4-bit field (two 2bpp pixels) of r2.
//   r0: table base pointer
//   r2: input pixels
//   r3: index mask
//   \shift_op, \amount: shift that lines the wanted field of r2 up with r3
//   \dst: receives the word loaded from the table
// Clobbers the flags.
.macro encode_2bpp_body shift_op amount dst
	\shift_op \dst, r2, #\amount
	ands \dst, \dst, r3
	ldr \dst, [r0, \dst]
.endm
308  
// Full-resolution 2bpp encode.
//
// r0: input buffer (word-aligned); one word is consumed per loop iteration
// r1: output buffer (word-aligned); 8 words are produced per loop iteration
// r2: output pixel count; encoding continues while r1 is below r1 + 2 * r2
//
// Inside the loop r0 holds the table base (as encode_2bpp_body expects), so
// the input pointer is kept in r8. r8 is preserved; r0, r2, r3, ip and the
// flags are clobbered.
decl_func tmds_encode_2bpp
	push {r4-r7, lr}
	mov r4, r8
	push {r4}
	// Limit pointer: 1 word per 2 pixels
	lsls r2, #1
	add r2, r1
	mov ip, r2
	// Free up r0 for the table base
	mov r8, r0
	adr r0, tmds_2bpp_table
	// Mask: 4-bit index into 4-byte entries.
	movs r3, #0x3c
	b 2f
1:
	// Fetch the next input word through a low register, since ldmia cannot
	// use r8 as its base
	mov r7, r8
	ldmia r7!, {r2}
	mov r8, r7
	encode_2bpp_body lsls 2  r4
	encode_2bpp_body lsrs 2  r5
	encode_2bpp_body lsrs 6  r6
	encode_2bpp_body lsrs 10 r7
	stmia r1!, {r4-r7}
	encode_2bpp_body lsrs 14 r4
	encode_2bpp_body lsrs 18 r5
	encode_2bpp_body lsrs 22 r6
	encode_2bpp_body lsrs 26 r7
	stmia r1!, {r4-r7}
2:
	cmp r1, ip
	blo 1b
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
345  
// Lookup table for tmds_encode_2bpp: one word per pair of 2bpp pixels. The
// index is (second pixel level << 2) | first pixel level. Each word packs the
// first pixel's symbol in bits 9:0 and the second pixel's in bits 19:10.
// TMDS_2BPP_A<n> is the symbol for level n in the first position and
// TMDS_2BPP_B<n> the symbol for level n in the second position.
#define TMDS_2BPP_A0 0x103
#define TMDS_2BPP_A1 0x130
#define TMDS_2BPP_A2 0x230
#define TMDS_2BPP_A3 0x203
#define TMDS_2BPP_B0 0x1fc
#define TMDS_2BPP_B1 0x1cf
#define TMDS_2BPP_B2 0x2cf
#define TMDS_2BPP_B3 0x2fc
#define TMDS_2BPP_WORD(first_sym, second_sym) (((second_sym) << 10) | (first_sym))

.align 2
tmds_2bpp_table:
	.word TMDS_2BPP_WORD(TMDS_2BPP_A0, TMDS_2BPP_B0) // 00, 00
	.word TMDS_2BPP_WORD(TMDS_2BPP_A1, TMDS_2BPP_B0) // 01, 00
	.word TMDS_2BPP_WORD(TMDS_2BPP_A2, TMDS_2BPP_B0) // 10, 00
	.word TMDS_2BPP_WORD(TMDS_2BPP_A3, TMDS_2BPP_B0) // 11, 00
	.word TMDS_2BPP_WORD(TMDS_2BPP_A0, TMDS_2BPP_B1) // 00, 01
	.word TMDS_2BPP_WORD(TMDS_2BPP_A1, TMDS_2BPP_B1) // 01, 01
	.word TMDS_2BPP_WORD(TMDS_2BPP_A2, TMDS_2BPP_B1) // 10, 01
	.word TMDS_2BPP_WORD(TMDS_2BPP_A3, TMDS_2BPP_B1) // 11, 01
	.word TMDS_2BPP_WORD(TMDS_2BPP_A0, TMDS_2BPP_B2) // 00, 10
	.word TMDS_2BPP_WORD(TMDS_2BPP_A1, TMDS_2BPP_B2) // 01, 10
	.word TMDS_2BPP_WORD(TMDS_2BPP_A2, TMDS_2BPP_B2) // 10, 10
	.word TMDS_2BPP_WORD(TMDS_2BPP_A3, TMDS_2BPP_B2) // 11, 10
	.word TMDS_2BPP_WORD(TMDS_2BPP_A0, TMDS_2BPP_B3) // 00, 11
	.word TMDS_2BPP_WORD(TMDS_2BPP_A1, TMDS_2BPP_B3) // 01, 11
	.word TMDS_2BPP_WORD(TMDS_2BPP_A2, TMDS_2BPP_B3) // 10, 11
	.word TMDS_2BPP_WORD(TMDS_2BPP_A3, TMDS_2BPP_B3) // 11, 11
364  
365  // ----------------------------------------------------------------------------
366  // Full-resolution RGB encode (not very practical)
367  
368  // Non-doubled TMDS encode. 8.333 cycles per pixel, no exceptions. (This is
369  // taking horizontal blanking (at VGA) and dual core into account, and
370  // assuming the 3 channels are encoded individually.)
371  //
372  // Here is an idea
373  // Have a table with a 7 bit lookup. The lookup is the 6 colour data bits (in
374  // ACCUM0), concatenated with the sign bit of our running disparity (from
375  // ACCUM1). Each table entry is a 20-bit TMDS symbol (pseudodifferential),
376  // with the symbol's disparity stored left-justified in the upper 12 bits, as
377  // e.g. a 6 bit signed integer.
378  //
379  // - Load pixel data.                        cyc: 0.75 (ldmia 2 words, every 4 pixels)
380  // - Write pixel to ACCUM0.                  cyc: 1
381  // - Read address from PEEK2.                cyc: 1
382  // - Load encoded pixel from address.        cyc: 2
383  // - Write disparity data to ACCUM1_ADD      cyc: 1
384  // - Write encoded data to output buffer.    cyc: 1.25 (stmia 4 words, every 4 pixels)
385  //
386  // With decent register allocation we may be able to load 4 pixels at
387  // once (2 words), and write 4 at once (4 words). This gives 7 cyc/pix.
388  //
389  // One issue is that the TMDS data in the bottom of ACCUM1 will eventually
390  // overflow and affect the running disparity, but with 16 zeroes in between,
391  // this would take much longer than one scanline, so everything is fine if
392  // we clear the accumulator at the start of the scanline.
393  //
394  // Note that we need to use two interpolators to get the bits from both pixels
395  // -- we are not outputting a single DC-balanced stream, but rather two
396  // interleaved streams which are each DC-balanced. This is fine electrically,
397  // but our output here will *NOT* match the TMDS encoder given in the DVI
398  // spec.
399  
400  // You can define TMDS_FULLRES_NO_DC_BALANCE to disable the running balance
401  // feedback. With the feedback enabled (default), the output is DC balanced,
402  // but there are just barely enough CPU cycles to do all the encode, so it's
403  // essentially a party trick. If you disable DC balancing, the performance is
404  // much better, and many monitors will still accept the signals as long as you
405  // DC couple your DVI signals.
406  
// Encode one input word into two output words using both interpolators.
//   r2     : interpolator base pointer (address of INTERP0 ACCUM0)
//   \px_out: in:  input word, written to ACCUM0 of INTERP0 and INTERP1
//            out: word loaded from the address returned by INTERP0 PEEK2
//   \out1  : out: word loaded from the address returned by INTERP1 PEEK2
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each looked-up word is also
// written to ACCUM1_ADD of the interpolator that produced it.
.macro tmds_fullres_encode_loop_body px_out out1
	str \px_out, [r2, #ACCUM0_OFFS]
	str \px_out, [r2, #ACCUM0_OFFS + INTERP1]
	// First output word, from INTERP0
	ldr \px_out, [r2, #PEEK2_OFFS]
	ldr \px_out, [\px_out]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \px_out, [r2, #ACCUM1_ADD_OFFS]
#endif
	// Second output word, from INTERP1
	ldr \out1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \out1, [\out1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \out1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
421  
// Full-resolution 16bpp encode loop (body of the _x and _y functions below).
//
// r0: Input buffer (word-aligned); two words are consumed per step
// r1: Output buffer (word-aligned); four words are produced per step
// r2: Pixel count; encoding stops once r1 reaches r1 + 4 * r2
//
// ACCUM1 of INTERP0 is cleared on entry. ACCUM1 of INTERP1 is cleared too,
// or set to all-ones when TMDS_FULLRES_NO_DC_BALANCE is set.
// r8 is preserved; r2, ip and the flags are clobbered.
//
// The body is repeated 16 times per loop iteration, i.e. 64 output words per
// iteration. The exit test is an unsigned "higher or same", so an output size
// that is not a whole number of iterations overshoots by at most one
// iteration instead of never matching the end pointer.

.macro tmds_fullres_encode_loop_16bpp
	push {r4-r7, lr}
	mov r4, r8
	push {r4}

	lsls r2, #2             // 4 output bytes per pixel
	add r2, r1
	mov ip, r2              // ip = output end pointer
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if no feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 16
	ldmia r0!, {r4, r6}
	tmds_fullres_encode_loop_body r4 r5
	tmds_fullres_encode_loop_body r6 r7
	stmia r1!, {r4-r7}
.endr
2:
	cmp r1, ip
	bhs 3f
	bx r8
3:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

// One copy each in X and Y, so the two cores don't step on each other
decl_func_x tmds_fullres_encode_loop_16bpp_x
	tmds_fullres_encode_loop_16bpp
decl_func_y tmds_fullres_encode_loop_16bpp_y
	tmds_fullres_encode_loop_16bpp
473  
474  
// Encode one input word into two output words using both interpolators, with
// a left shift applied to the copy sent to INTERP0.
//   r2     : interpolator base pointer (address of INTERP0 ACCUM0)
//   r3     : left shift amount
//   \px_out: in:  input word
//            out: word loaded from the address returned by INTERP0 PEEK2
//   \out1  : out: word loaded from the address returned by INTERP1 PEEK2
// Unless TMDS_FULLRES_NO_DC_BALANCE is set, each looked-up word is also
// written to ACCUM1_ADD of the interpolator that produced it.
.macro tmds_fullres_encode_loop_body_leftshift px_out out1
	// INTERP1 must see the word before it is shifted: the leftshift is for
	// INTERP0 only
	str \px_out, [r2, #ACCUM0_OFFS + INTERP1]
	lsls \px_out, \px_out, r3
	str \px_out, [r2, #ACCUM0_OFFS]
	// First output word, from INTERP0
	ldr \px_out, [r2, #PEEK2_OFFS]
	ldr \px_out, [\px_out]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \px_out, [r2, #ACCUM1_ADD_OFFS]
#endif
	// Second output word, from INTERP1
	ldr \out1, [r2, #PEEK2_OFFS + INTERP1]
	ldr \out1, [\out1]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \out1, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
.endm
491  
// Full-resolution 16bpp encode loop with a left shift on the INTERP0 data
// (body of the _x and _y functions below).
//
// r0: Input buffer (word-aligned); two words are consumed per step
// r1: Output buffer (word-aligned); four words are produced per step
// r2: Pixel count; encoding stops once r1 reaches r1 + 4 * r2
// r3: Left shift amount
//
// ACCUM1 of INTERP0 is cleared on entry. ACCUM1 of INTERP1 is cleared too,
// or set to all-ones when TMDS_FULLRES_NO_DC_BALANCE is set.
// r8 is the only high register used, so it is the only one saved. r2, ip and
// the flags are clobbered.
//
// The exit test is an unsigned "higher or same", so an output size that is
// not a whole number of loop iterations overshoots by at most one iteration
// instead of never matching the end pointer.

.macro tmds_fullres_encode_loop_16bpp_leftshift
	push {r4-r7, lr}
	mov r4, r8
	push {r4}

	lsls r2, #2             // 4 output bytes per pixel
	add r2, r1
	mov ip, r2              // ip = output end pointer
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 16 // 64 pixels per iteration
	ldmia r0!, {r4, r6}
	tmds_fullres_encode_loop_body_leftshift r4 r5
	tmds_fullres_encode_loop_body_leftshift r6 r7
	stmia r1!, {r4-r7}
.endr
2:
	cmp r1, ip
	bhs 3f
	bx r8
3:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

// One copy each in X and Y, matching the non-shifting variant
decl_func_x tmds_fullres_encode_loop_16bpp_leftshift_x
	tmds_fullres_encode_loop_16bpp_leftshift
decl_func_y tmds_fullres_encode_loop_16bpp_leftshift_y
	tmds_fullres_encode_loop_16bpp_leftshift
543  
544  
545  // ----------------------------------------------------------------------------
546  // Full-resolution 8bpp paletted encode
547  
548  // Variant of tmds_fullres_encode_loop_16bpp that reads
549  // 8-bit wide pixels packed 4 per word.  The interpolator
550  // base is set to a reordered list of TMDS symbols based
551  // on a user colour palette.
552  
// Two pixels input in \px[17:2]. Two symbols output in \px[19:0]. r2 contains
// interp base pointer. r7 used as temporary.
//
// \px is written to ACCUM0 of both interpolators. The word looked up through
// INTERP0 PEEK2 ends up in \px, and the word looked up through INTERP1 PEEK2
// is shifted left by 10 and ORed on top of it. Unless
// TMDS_FULLRES_NO_DC_BALANCE is set, each looked-up word is also written,
// unshifted, to ACCUM1_ADD of the interpolator that produced it.
.macro tmds_palette_encode_loop_body px
	str \px, [r2, #ACCUM0_OFFS + INTERP1]
	str \px, [r2, #ACCUM0_OFFS]
	// Lookup through INTERP0
	ldr \px, [r2, #PEEK2_OFFS]
	ldr \px, [\px]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str \px, [r2, #ACCUM1_ADD_OFFS]
#endif
	// Lookup through INTERP1
	ldr r7, [r2, #PEEK2_OFFS + INTERP1]
	ldr r7, [r7]
#if !TMDS_FULLRES_NO_DC_BALANCE
	str r7, [r2, #ACCUM1_ADD_OFFS + INTERP1]
#endif
	// Merge: INTERP1's result goes 10 bits above INTERP0's
	lsls r7, r7, #10
	orrs \px, \px, r7
.endm
571  
// Full-resolution 8bpp paletted encode loop (body of the _x and _y functions
// below).
//
// r0: Input buffer (word-aligned); two words (8 pixels at 4 per word) are
//     consumed per step
// r1: Output buffer (word-aligned); four words are produced per step
// r2: Pixel count; encoding stops once r1 reaches r1 + 2 * r2
//
// ACCUM1 of INTERP0 is cleared on entry. ACCUM1 of INTERP1 is cleared too,
// or set to all-ones when TMDS_FULLRES_NO_DC_BALANCE is set.
// r8 is preserved; r2, r3, ip and the flags are clobbered.
//
// The body is repeated 10 times per loop iteration (80 pixels). The exit test
// is an unsigned "higher or same", so a pixel count that is not a whole
// number of iterations overshoots by at most one iteration instead of never
// matching the end pointer.
// NOTE(review): the ldr r2, =... literal has to stay within pc-relative range
// across this long unrolled body -- check that before raising the .rept count
// or adding instructions.
.macro tmds_palette_encode_loop
	push {r4-r7, lr}
	mov r4, r8
	push {r4}

	lsls r2, #1             // 2 output bytes per pixel
	add r2, r1
	mov ip, r2              // ip = output end pointer
	ldr r2, =(SIO_BASE + SIO_INTERP0_ACCUM0_OFFSET)
	// DC balance defined to be 0 at start of scanline:
	movs r4, #0
	str r4, [r2, #ACCUM1_OFFS]
#if TMDS_FULLRES_NO_DC_BALANCE
	// Alternate parity between odd/even symbols if there's no balance feedback
	mvns r4, r4
#endif
	str r4, [r2, #ACCUM1_OFFS + INTERP1]

	// Keep loop start pointer in r8 so we can get a longer backward branch
	adr r4, 1f
	adds r4, #1 // bx needs the Thumb bit set in the target address
	mov r8, r4
	b 2f
	.align 2
1:
.rept 10
	ldmia r0!, {r3, r5}
	// Split each word into two registers, each with a pixel pair at [17:2]:
	// the upper pair comes down by 14, the lower pair goes up by 2
	lsrs r4, r3, #14
	lsls r3, #2
	lsrs r6, r5, #14
	lsls r5, #2
	tmds_palette_encode_loop_body r3
	tmds_palette_encode_loop_body r4
	tmds_palette_encode_loop_body r5
	tmds_palette_encode_loop_body r6
	stmia r1!, {r3, r4, r5, r6}
.endr
2:
	cmp r1, ip
	bhs 3f
	bx r8
3:
	pop {r4}
	mov r8, r4
	pop {r4-r7, pc}
.endm

decl_func_x tmds_palette_encode_loop_x
	tmds_palette_encode_loop
decl_func_y tmds_palette_encode_loop_y
	tmds_palette_encode_loop
623  	tmds_palette_encode_loop