// vendor/github.com/golang/snappy/encode_amd64.s
  1  // Copyright 2016 The Go Authors. All rights reserved.
  2  // Use of this source code is governed by a BSD-style
  3  // license that can be found in the LICENSE file.
  4  
  5  // +build !appengine
  6  // +build gc
  7  // +build !noasm
  8  
  9  #include "textflag.h"
 10  
 11  // The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
 12  // Go toolchain regression. See https://github.com/golang/go/issues/15426 and
 13  // https://github.com/golang/snappy/issues/29
 14  //
 15  // As a workaround, the package was built with a known good assembler, and
 16  // those instructions were disassembled by "objdump -d" to yield the
 17  //	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
 18  // style comments, in AT&T asm syntax. Note that rsp here is a physical
 19  // register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
 20  // The instructions were then encoded as "BYTE $0x.." sequences, which assemble
 21  // fine on Go 1.6.
 22  
 23  // The asm code generally follows the pure Go code in encode_other.go, except
 24  // where marked with a "!!!".
 25  
 26  // ----------------------------------------------------------------------------
 27  
// func emitLiteral(dst, lit []byte) int
//
// Writes the Snappy literal-element header for lit into dst, copies lit in
// after it, and returns the total number of bytes written. dst must be large
// enough; no bounds checks are performed here.
//
// All local variables fit into registers. The register allocation:
//	- AX	len(lit)
//	- BX	n (i.e. len(lit)-1, the value stored in the literal tag)
//	- DX	return value
//	- DI	&dst[i]
//	- R10	&lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
	MOVQ dst_base+0(FP), DI
	MOVQ lit_base+24(FP), R10
	MOVQ lit_len+32(FP), AX
	MOVQ AX, DX
	MOVL AX, BX
	SUBL $1, BX

	// Choose the 1-, 2- or 3-byte tag encoding based on n = len(lit)-1.
	CMPL BX, $60
	JLT  oneByte
	CMPL BX, $256
	JLT  twoBytes

threeBytes:
	// 0xf4 == 61<<2: literal tag saying a 2-byte little-endian length (n)
	// follows the tag byte.
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	ADDQ $3, DX
	JMP  memmove

twoBytes:
	// 0xf0 == 60<<2: literal tag saying a 1-byte length (n) follows.
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	ADDQ $2, DX
	JMP  memmove

oneByte:
	// Short literal: n fits directly in the tag byte as n<<2.
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI
	ADDQ $1, DX

memmove:
	// Store the return value (header bytes + len(lit)) before the call; DX is
	// a caller-saved register and memmove may clobber it.
	MOVQ DX, ret+48(FP)

	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	CALL runtime·memmove(SB)
	RET
 87  
 88  // ----------------------------------------------------------------------------
 89  
// func emitCopy(dst []byte, offset, length int) int
//
// Writes one or more Snappy copy elements (offset/length back-references)
// into dst and returns the number of bytes written. Copies longer than 64
// bytes are split into multiple elements.
//
// All local variables fit into registers. The register allocation:
//	- AX	length
//	- SI	&dst[0]
//	- DI	&dst[i]
//	- R11	offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ DI, SI
	MOVQ offset+24(FP), R11
	MOVQ length+32(FP), AX

loop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  step1

	// Emit a length 64 copy, encoded as 3 bytes.
	// 0xfe == (64-1)<<2 | tagCopy2; the next 2 bytes are the LE offset.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  loop0

step1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  step2

	// Emit a length 60 copy, encoded as 3 bytes.
	// 0xee == (60-1)<<2 | tagCopy2. Emitting 60 (not 64) here guarantees the
	// remaining length is in [4, 64], valid for the final copy element.
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

step2:
	// if length >= 12 || offset >= 2048 { goto step3 }
	CMPL AX, $12
	JGE  step3
	CMPL R11, $2048
	JGE  step3

	// Emit the remaining copy, encoded as 2 bytes (tagCopy1):
	// byte 0 is offset[10:8]<<5 | (length-4)<<2 | 1, byte 1 is offset[7:0].
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET

step3:
	// Emit the remaining copy, encoded as 3 bytes (tagCopy2):
	// byte 0 is (length-1)<<2 | 2, bytes 1-2 are the LE offset.
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

	// Return the number of bytes written.
	SUBQ SI, DI
	MOVQ DI, ret+40(FP)
	RET
166  
167  // ----------------------------------------------------------------------------
168  
// func extendMatch(src []byte, i, j int) int
//
// Returns the largest k such that src[i:k-j+i] and src[j:k] are equal, i.e.
// it extends the match [i, j) forward as far as the bytes agree, comparing
// 8 bytes at a time while at least 8 bytes remain, then byte-by-byte.
//
// All local variables fit into registers. The register allocation:
//	- DX	&src[0]
//	- SI	&src[j]
//	- R13	&src[len(src) - 8]
//	- R14	&src[len(src)]
//	- R15	&src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
	MOVQ src_base+0(FP), DX
	MOVQ src_len+8(FP), R14
	MOVQ i+24(FP), R15
	MOVQ j+32(FP), SI
	// Convert the i, j and len(src) indices into absolute pointers.
	ADDQ DX, R14
	ADDQ DX, R15
	ADDQ DX, SI
	MOVQ R14, R13
	SUBQ $8, R13

cmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	// JA is an unsigned comparison, as SI and R13 are pointers.
	CMPQ SI, R13
	JA   cmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  bsf
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  cmp8

bsf:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI

	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET

cmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  extendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  extendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  cmp1

extendMatchEnd:
	// Convert from &src[ret] to ret.
	SUBQ DX, SI
	MOVQ SI, ret+40(FP)
	RET
237  
238  // ----------------------------------------------------------------------------
239  
// func encodeBlock(dst, src []byte) (d int)
//
// Encodes a non-empty src to a guaranteed-large-enough dst, mirroring the
// pure Go implementation in encode_other.go. It manually inlines the
// emitLiteral, extendMatch and emitCopy bodies above (the "Begin/End inline"
// sections), which is why its register allocation matches theirs.
//
// All local variables fit into registers, other than "var table". The register
// allocation:
//	- AX	.	.
//	- BX	.	.
//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
//	- DX	64	&src[0], tableSize
//	- SI	72	&src[s]
//	- DI	80	&dst[d]
//	- R9	88	sLimit
//	- R10	.	&src[nextEmit]
//	- R11	96	prevHash, currHash, nextHash, offset
//	- R12	104	&src[base], skip
//	- R13	.	&src[nextS], &src[len(src) - 8]
//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
//	- R15	112	candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R14

	// shift, tableSize := uint32(32-8), 1<<8
	MOVQ $24, CX
	MOVQ $256, DX

calcShift:
	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
	//	shift--
	// }
	// 16384 is maxTableSize; each doubling of tableSize decrements shift so
	// that hash values index exactly tableSize entries.
	CMPQ DX, $16384
	JGE  varTable
	CMPQ DX, R14
	JGE  varTable
	SUBQ $1, CX
	SHLQ $1, DX
	JMP  calcShift

varTable:
	// var table [maxTableSize]uint16
	//
	// In the asm code, unlike the Go code, we can zero-initialize only the
	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
	// 2048 writes that would zero-initialize all of table's 32768 bytes.
	SHRQ $3, DX
	LEAQ table-32768(SP), BX
	PXOR X0, X0

memclr:
	// DX counts down the number of 16-byte chunks still to zero.
	MOVOU X0, 0(BX)
	ADDQ  $16, BX
	SUBQ  $1, DX
	JNZ   memclr

	// !!! DX = &src[0]
	MOVQ SI, DX

	// sLimit := len(src) - inputMargin
	MOVQ R14, R9
	SUBQ $15, R9

	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
	// change for the rest of the function.
	MOVQ CX, 56(SP)
	MOVQ DX, 64(SP)
	MOVQ R9, 88(SP)

	// nextEmit := 0
	MOVQ DX, R10

	// s := 1
	ADDQ $1, SI

	// nextHash := hash(load32(src, s), shift)
	// 0x1e35a7bd is the hash multiplier used by the pure Go hash function.
	MOVL  0(SI), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

outer:
	// for { etc }

	// skip := 32
	MOVQ $32, R12

	// nextS := s
	MOVQ SI, R13

	// candidate := 0
	MOVQ $0, R15

inner0:
	// for { etc }

	// s := nextS
	MOVQ R13, SI

	// bytesBetweenHashLookups := skip >> 5
	MOVQ R12, R14
	SHRQ $5, R14

	// nextS = s + bytesBetweenHashLookups
	ADDQ R14, R13

	// skip += bytesBetweenHashLookups
	ADDQ R14, R12

	// if nextS > sLimit { goto emitRemainder }
	// The comparison is done on indices (pointer minus base), using unsigned
	// JA to match the Go code's int comparison on non-negative values.
	MOVQ R13, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JA   emitRemainder

	// candidate = int(table[nextHash])
	// The 0x78 (120) displacement below is table's offset from the physical
	// rsp: 56 bytes of call space plus 64 bytes of spill space.
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[nextHash] = uint16(s)
	MOVQ SI, AX
	SUBQ DX, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// nextHash = hash(load32(src, nextS), shift)
	MOVL  0(R13), R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// if load32(src, s) != load32(src, candidate) { continue } break
	MOVL 0(SI), AX
	MOVL (DX)(R15*1), BX
	CMPL AX, BX
	JNE  inner0

fourByteMatch:
	// As per the encode_other.go code:
	//
	// A 4-byte match has been found. We'll later see etc.

	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
	// on inputMargin in encode.go.
	MOVQ SI, AX
	SUBQ R10, AX
	CMPQ AX, $16
	JLE  emitLiteralFastPath

	// ----------------------------------------
	// Begin inline of the emitLiteral call.
	//
	// d += emitLiteral(dst[d:], src[nextEmit:s])

	// BX = n = len(lit)-1; choose the tag encoding, as in emitLiteral above.
	MOVL AX, BX
	SUBL $1, BX

	CMPL BX, $60
	JLT  inlineEmitLiteralOneByte
	CMPL BX, $256
	JLT  inlineEmitLiteralTwoBytes

inlineEmitLiteralThreeBytes:
	MOVB $0xf4, 0(DI)
	MOVW BX, 1(DI)
	ADDQ $3, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralTwoBytes:
	MOVB $0xf0, 0(DI)
	MOVB BX, 1(DI)
	ADDQ $2, DI
	JMP  inlineEmitLiteralMemmove

inlineEmitLiteralOneByte:
	SHLB $2, BX
	MOVB BX, 0(DI)
	ADDQ $1, DI

inlineEmitLiteralMemmove:
	// Spill local variables (registers) onto the stack; call; unspill.
	//
	// copy(dst[i:], lit)
	//
	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
	// DI, R10 and AX as arguments.
	MOVQ DI, 0(SP)
	MOVQ R10, 8(SP)
	MOVQ AX, 16(SP)
	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
	MOVQ SI, 72(SP)
	MOVQ DI, 80(SP)
	MOVQ R15, 112(SP)
	CALL runtime·memmove(SB)
	MOVQ 56(SP), CX
	MOVQ 64(SP), DX
	MOVQ 72(SP), SI
	MOVQ 80(SP), DI
	MOVQ 88(SP), R9
	MOVQ 112(SP), R15
	JMP  inner1

inlineEmitLiteralEnd:
	// End inline of the emitLiteral call.
	// (This label is never jumped to; it only marks the inline region's end.)
	// ----------------------------------------

emitLiteralFastPath:
	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
	MOVB AX, BX
	SUBB $1, BX
	SHLB $2, BX
	MOVB BX, (DI)
	ADDQ $1, DI

	// !!! Implement the copy from lit to dst as a 16-byte load and store.
	// (Encode's documentation says that dst and src must not overlap.)
	//
	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
	// OK. Subsequent iterations will fix up the overrun.
	//
	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
	// 16-byte loads and stores. This technique probably wouldn't be as
	// effective on architectures that are fussier about alignment.
	MOVOU 0(R10), X0
	MOVOU X0, 0(DI)
	ADDQ  AX, DI

inner1:
	// for { etc }

	// base := s
	MOVQ SI, R12

	// !!! offset := base - candidate
	MOVQ R12, R11
	SUBQ R15, R11
	SUBQ DX, R11

	// ----------------------------------------
	// Begin inline of the extendMatch call.
	//
	// s = extendMatch(src, candidate+4, s+4)

	// !!! R14 = &src[len(src)]
	MOVQ src_len+32(FP), R14
	ADDQ DX, R14

	// !!! R13 = &src[len(src) - 8]
	MOVQ R14, R13
	SUBQ $8, R13

	// !!! R15 = &src[candidate + 4]
	ADDQ $4, R15
	ADDQ DX, R15

	// !!! s += 4
	ADDQ $4, SI

inlineExtendMatchCmp8:
	// As long as we are 8 or more bytes before the end of src, we can load and
	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
	CMPQ SI, R13
	JA   inlineExtendMatchCmp1
	MOVQ (R15), AX
	MOVQ (SI), BX
	CMPQ AX, BX
	JNE  inlineExtendMatchBSF
	ADDQ $8, R15
	ADDQ $8, SI
	JMP  inlineExtendMatchCmp8

inlineExtendMatchBSF:
	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
	// the index of the first byte that differs. The BSF instruction finds the
	// least significant 1 bit, the amd64 architecture is little-endian, and
	// the shift by 3 converts a bit index to a byte index.
	XORQ AX, BX
	BSFQ BX, BX
	SHRQ $3, BX
	ADDQ BX, SI
	JMP  inlineExtendMatchEnd

inlineExtendMatchCmp1:
	// In src's tail, compare 1 byte at a time.
	CMPQ SI, R14
	JAE  inlineExtendMatchEnd
	MOVB (R15), AX
	MOVB (SI), BX
	CMPB AX, BX
	JNE  inlineExtendMatchEnd
	ADDQ $1, R15
	ADDQ $1, SI
	JMP  inlineExtendMatchCmp1

inlineExtendMatchEnd:
	// End inline of the extendMatch call.
	// ----------------------------------------

	// ----------------------------------------
	// Begin inline of the emitCopy call.
	//
	// d += emitCopy(dst[d:], base-candidate, s-base)

	// !!! length := s - base
	MOVQ SI, AX
	SUBQ R12, AX

inlineEmitCopyLoop0:
	// for length >= 68 { etc }
	CMPL AX, $68
	JLT  inlineEmitCopyStep1

	// Emit a length 64 copy, encoded as 3 bytes.
	// 0xfe == (64-1)<<2 | tagCopy2, as in emitCopy above.
	MOVB $0xfe, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $64, AX
	JMP  inlineEmitCopyLoop0

inlineEmitCopyStep1:
	// if length > 64 { etc }
	CMPL AX, $64
	JLE  inlineEmitCopyStep2

	// Emit a length 60 copy, encoded as 3 bytes.
	// 0xee == (60-1)<<2 | tagCopy2; leaves the remainder in [4, 64].
	MOVB $0xee, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI
	SUBL $60, AX

inlineEmitCopyStep2:
	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
	CMPL AX, $12
	JGE  inlineEmitCopyStep3
	CMPL R11, $2048
	JGE  inlineEmitCopyStep3

	// Emit the remaining copy, encoded as 2 bytes (tagCopy1).
	MOVB R11, 1(DI)
	SHRL $8, R11
	SHLB $5, R11
	SUBB $4, AX
	SHLB $2, AX
	ORB  AX, R11
	ORB  $1, R11
	MOVB R11, 0(DI)
	ADDQ $2, DI
	JMP  inlineEmitCopyEnd

inlineEmitCopyStep3:
	// Emit the remaining copy, encoded as 3 bytes (tagCopy2).
	SUBL $1, AX
	SHLB $2, AX
	ORB  $2, AX
	MOVB AX, 0(DI)
	MOVW R11, 1(DI)
	ADDQ $3, DI

inlineEmitCopyEnd:
	// End inline of the emitCopy call.
	// ----------------------------------------

	// nextEmit = s
	MOVQ SI, R10

	// if s >= sLimit { goto emitRemainder }
	MOVQ SI, AX
	SUBQ DX, AX
	CMPQ AX, R9
	JAE  emitRemainder

	// As per the encode_other.go code:
	//
	// We could immediately etc.

	// x := load64(src, s-1)
	MOVQ -1(SI), R14

	// prevHash := hash(uint32(x>>0), shift)
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// table[prevHash] = uint16(s-1)
	MOVQ SI, AX
	SUBQ DX, AX
	SUBQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// currHash := hash(uint32(x>>8), shift)
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// candidate = int(table[currHash])
	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
	BYTE $0x4e
	BYTE $0x0f
	BYTE $0xb7
	BYTE $0x7c
	BYTE $0x5c
	BYTE $0x78

	// table[currHash] = uint16(s)
	// AX still holds s-1 from above, so s == AX+1.
	ADDQ $1, AX

	// XXX: MOVW AX, table-32768(SP)(R11*2)
	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
	BYTE $0x66
	BYTE $0x42
	BYTE $0x89
	BYTE $0x44
	BYTE $0x5c
	BYTE $0x78

	// if uint32(x>>8) == load32(src, candidate) { continue }
	MOVL (DX)(R15*1), BX
	CMPL R14, BX
	JEQ  inner1

	// nextHash = hash(uint32(x>>16), shift)
	// R14 already holds x>>8, so one more 8-bit shift yields x>>16.
	SHRQ  $8, R14
	MOVL  R14, R11
	IMULL $0x1e35a7bd, R11
	SHRL  CX, R11

	// s++
	ADDQ $1, SI

	// break out of the inner1 for loop, i.e. continue the outer loop.
	JMP outer

emitRemainder:
	// if nextEmit < len(src) { etc }
	MOVQ src_len+32(FP), AX
	ADDQ DX, AX
	CMPQ R10, AX
	JEQ  encodeBlockEnd

	// d += emitLiteral(dst[d:], src[nextEmit:])
	//
	// Push args. Layout: dst slice at 0-23(SP), lit slice at 24-47(SP); the
	// return value comes back at 48(SP).
	MOVQ DI, 0(SP)
	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
	MOVQ R10, 24(SP)
	SUBQ R10, AX
	MOVQ AX, 32(SP)
	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.

	// Spill local variables (registers) onto the stack; call; unspill.
	MOVQ DI, 80(SP)
	CALL ·emitLiteral(SB)
	MOVQ 80(SP), DI

	// Finish the "d +=" part of "d += emitLiteral(etc)".
	ADDQ 48(SP), DI

encodeBlockEnd:
	// d is &dst[d] minus &dst[0].
	MOVQ dst_base+0(FP), AX
	SUBQ AX, DI
	MOVQ DI, d+48(FP)
	RET