/ externals / xbyak / test / misc.cpp
misc.cpp
   1  #include <stdio.h>
   2  #include <string.h>
   3  #include <string>
   4  #include <xbyak/xbyak.h>
   5  #include <xbyak/xbyak_util.h>
   6  #include <cybozu/inttype.hpp>
   7  #include <cybozu/test.hpp>
   8  #include <algorithm>
   9  
  10  using namespace Xbyak;
  11  
  12  CYBOZU_TEST_AUTO(setSize)
  13  {
  14  	struct Code : Xbyak::CodeGenerator {
  15  		Code() : Xbyak::CodeGenerator(4096)
  16  		{
  17  			setSize(4095);
  18  			db(1);
  19  			size_t size = getSize();
  20  			CYBOZU_TEST_EQUAL(size, 4096u);
  21  			CYBOZU_TEST_NO_EXCEPTION(setSize(size));
  22  			CYBOZU_TEST_EXCEPTION(db(1), Xbyak::Error);
  23  		}
  24  	} code;
  25  }
  26  
  27  #ifdef XBYAK64
  28  CYBOZU_TEST_AUTO(badSSE)
  29  {
  30  	struct Code : Xbyak::CodeGenerator {
  31  		Code()
  32  		{
  33  			CYBOZU_TEST_EXCEPTION(paddd(xm16, xm1), Xbyak::Error);
  34  			CYBOZU_TEST_EXCEPTION(pslld(xm16, 1), Xbyak::Error);
  35  			CYBOZU_TEST_EXCEPTION(movapd(xm16, xm1), Xbyak::Error);
  36  			CYBOZU_TEST_EXCEPTION(movhpd(xm16, ptr[eax]), Xbyak::Error);
  37  			CYBOZU_TEST_EXCEPTION(pextrb(eax, xm16, 1), Xbyak::Error);
  38  		}
  39  	} code;
  40  }
  41  #endif
  42  
  43  CYBOZU_TEST_AUTO(compOperand)
  44  {
  45  	using namespace Xbyak::util;
  46  	CYBOZU_TEST_ASSERT(eax == eax);
  47  	CYBOZU_TEST_ASSERT(ecx != xmm0);
  48  	CYBOZU_TEST_ASSERT(ptr[eax] == ptr[eax]);
  49  	CYBOZU_TEST_ASSERT(dword[eax] != ptr[eax]);
  50  	CYBOZU_TEST_ASSERT(ptr[eax] != ptr[eax+3]);
  51  }
  52  
  53  CYBOZU_TEST_AUTO(mov_const)
  54  {
  55  	struct Code : Xbyak::CodeGenerator {
  56  		Code()
  57  		{
  58  			const struct {
  59  				uint64_t v;
  60  				int bit;
  61  				bool error;
  62  			} tbl[] = {
  63  				{ uint64_t(-1), 8, false },
  64  				{ 0x12, 8, false },
  65  				{ 0x80, 8, false },
  66  				{ 0xff, 8, false },
  67  				{ 0x100, 8, true },
  68  
  69  				{ 1, 16, false },
  70  				{ uint64_t(-1), 16, false },
  71  				{ 0x7fff, 16, false },
  72  				{ 0xffff, 16, false },
  73  				{ 0x10000, 16, true },
  74  
  75  				{ uint64_t(-1), 32, false },
  76  				{ 0x7fffffff, 32, false },
  77  				{ uint64_t(-0x7fffffff), 32, false },
  78  				{ 0xffffffff, 32, false },
  79  				{ 0x100000000ull, 32, true },
  80  
  81  #ifdef XBYAK64
  82  				{ uint64_t(-1), 64, false },
  83  				{ 0x7fffffff, 64, false },
  84  				{ 0xffffffffffffffffull, 64, false },
  85  				{ 0x80000000, 64, true },
  86  				{ 0xffffffff, 64, true },
  87  #endif
  88  			};
  89  			for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
  90  				const int bit = tbl[i].bit;
  91  				const uint64_t v = tbl[i].v;
  92  				const Xbyak::AddressFrame& af = bit == 8 ? byte : bit == 16 ? word : bit == 32 ? dword : qword;
  93  				if (tbl[i].error) {
  94  					CYBOZU_TEST_EXCEPTION(mov(af[eax], v), Xbyak::Error);
  95  				} else {
  96  					CYBOZU_TEST_NO_EXCEPTION(mov(af[eax], v));
  97  				}
  98  			}
  99  #ifdef XBYAK64
 100  			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff]));
 101  			if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
 102  				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error);
 103  			}
 104  #ifdef XBYAK_OLD_DISP_CHECK
 105  			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000]));
 106  			CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff]));
 107  #else
 108  			if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32
 109  				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error);
 110  				CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error);
 111  			}
 112  #endif
 113  #endif
 114  		}
 115  	} code;
 116  }
 117  
 118  CYBOZU_TEST_AUTO(align)
 119  {
 120  	struct Code : Xbyak::CodeGenerator {
 121  		Code()
 122  		{
 123  			const size_t alignSize = 16;
 124  			for (int padding = 0; padding < 20; padding++) {
 125  				for (int i = 0; i < padding; i++) {
 126  					db(1);
 127  				}
 128  				align(alignSize);
 129  				CYBOZU_TEST_EQUAL(size_t(getCurr()) % alignSize, 0u);
 130  			}
 131  			align(alignSize);
 132  			const uint8_t *p = getCurr();
 133  			// do nothing if aligned
 134  			align(alignSize);
 135  			CYBOZU_TEST_EQUAL(p, getCurr());
 136  		}
 137  	} c;
 138  }
 139  CYBOZU_TEST_AUTO(kmask)
 140  {
 141  	struct Code : Xbyak::CodeGenerator {
 142  		Code()
 143  		{
 144  			CYBOZU_TEST_EXCEPTION(kmovb(k1, ax), std::exception);
 145  			CYBOZU_TEST_EXCEPTION(kmovw(k1, ax), std::exception);
 146  			CYBOZU_TEST_EXCEPTION(kmovd(k1, ax), std::exception);
 147  			CYBOZU_TEST_EXCEPTION(kmovq(k1, eax), std::exception);
 148  #ifdef XBYAK64
 149  			CYBOZU_TEST_EXCEPTION(kmovb(k1, rax), std::exception);
 150  			CYBOZU_TEST_EXCEPTION(kmovw(k1, rax), std::exception);
 151  			CYBOZU_TEST_EXCEPTION(kmovd(k1, rax), std::exception);
 152  			CYBOZU_TEST_NO_EXCEPTION(kmovq(k1, rax));
 153  #endif
 154  			CYBOZU_TEST_NO_EXCEPTION(vmovaps(xm0|k0, ptr[eax]));
 155  			checkT_z();
 156  		}
 157  		void checkT_z()
 158  		{
 159  			const uint8_t *p1 = getCurr();
 160  			vmovaps(zm0, ptr[eax]);
 161  			const uint8_t *p2 = getCurr();
 162  			vmovaps(zm0|T_z, ptr[eax]);
 163  			const uint8_t *end = getCurr();
 164  			CYBOZU_TEST_EQUAL(p2 - p1, end - p2);
 165  			CYBOZU_TEST_EQUAL_ARRAY(p1, p2, end - p2);
 166  		}
 167  	} c;
 168  }
 169  
 170  CYBOZU_TEST_AUTO(gather)
 171  {
 172  	struct Code : Xbyak::CodeGenerator {
 173  		Code()
 174  		{
 175  			CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm3));
 176  			CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm1], xmm2), std::exception);
 177  			CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm1), std::exception);
 178  			CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm2, ptr[eax+xmm1], xmm1), std::exception);
 179  
 180  			CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm2]));
 181  			CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2]), std::exception);
 182  			CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm1]), std::exception);
 183  
 184  			CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k2, xmm1));
 185  			CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1|k2));
 186  			CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k3, xmm2));
 187  
 188  			CYBOZU_TEST_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1), std::exception);
 189  		}
 190  	} c;
 191  }
 192  
 193  #ifdef XBYAK64
 194  CYBOZU_TEST_AUTO(vfmaddps)
 195  {
 196  	struct Code : Xbyak::CodeGenerator {
 197  		Code()
 198  		{
 199  			v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
 200  			v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
 201  			v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
 202  			v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
 203  			vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
 204  			vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
 205  		}
 206  	} c;
 207  	const uint8_t tbl[] = {
 208  		0x62, 0xf2, 0x3f, 0x48, 0x9a, 0x4a, 0x04,
 209  		0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04,
 210  		0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08,
 211  		0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08,
 212  		0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04,
 213  		0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04,
 214  	};
 215  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 216  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 217  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 218  }
 219  CYBOZU_TEST_AUTO(vaes)
 220  {
 221  	struct Code : Xbyak::CodeGenerator {
 222  		Code()
 223  		{
 224  			vaesdec(xmm20, xmm30, ptr [rcx + 64]);
 225  			vaesdec(ymm1, ymm2, ptr [rcx + 64]);
 226  			vaesdec(zmm1, zmm2, ptr [rcx + 64]);
 227  
 228  			vaesdeclast(xmm20, xmm30, ptr [rax + 64]);
 229  			vaesdeclast(ymm20, ymm30, ptr [rax + 64]);
 230  			vaesdeclast(zmm20, zmm30, ptr [rax + 64]);
 231  
 232  			vaesenc(xmm20, xmm30, ptr [rcx + 64]);
 233  			vaesenc(ymm1, ymm2, ptr [rcx + 64]);
 234  			vaesenc(zmm1, zmm2, ptr [rcx + 64]);
 235  
 236  			vaesenclast(xmm20, xmm30, ptr [rax + 64]);
 237  			vaesenclast(ymm20, ymm30, ptr [rax + 64]);
 238  			vaesenclast(zmm20, zmm30, ptr [rax + 64]);
 239  		}
 240  	} c;
 241  	const uint8_t tbl[] = {
 242  		0x62, 0xE2, 0x0D, 0x00, 0xDE, 0x61, 0x04,
 243  		0xC4, 0xE2, 0x6D, 0xDE, 0x49, 0x40,
 244  		0x62, 0xF2, 0x6D, 0x48, 0xDE, 0x49, 0x01,
 245  
 246  		0x62, 0xE2, 0x0D, 0x00, 0xDF, 0x60, 0x04,
 247  		0x62, 0xE2, 0x0D, 0x20, 0xDF, 0x60, 0x02,
 248  		0x62, 0xE2, 0x0D, 0x40, 0xDF, 0x60, 0x01,
 249  
 250  		0x62, 0xE2, 0x0D, 0x00, 0xDC, 0x61, 0x04,
 251  		0xC4, 0xE2, 0x6D, 0xDC, 0x49, 0x40,
 252  		0x62, 0xF2, 0x6D, 0x48, 0xDC, 0x49, 0x01,
 253  
 254  		0x62, 0xE2, 0x0D, 0x00, 0xDD, 0x60, 0x04,
 255  		0x62, 0xE2, 0x0D, 0x20, 0xDD, 0x60, 0x02,
 256  		0x62, 0xE2, 0x0D, 0x40, 0xDD, 0x60, 0x01,
 257  	};
 258  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 259  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 260  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 261  }
 262  CYBOZU_TEST_AUTO(vpclmulqdq)
 263  {
 264  	struct Code : Xbyak::CodeGenerator {
 265  		Code()
 266  		{
 267  			vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3);
 268  			vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3);
 269  			vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3);
 270  
 271  			vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3);
 272  			vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3);
 273  			vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3);
 274  		}
 275  	} c;
 276  	const uint8_t tbl[] = {
 277  		0xc4, 0xe3, 0x61, 0x44, 0x50, 0x40, 0x03,
 278  		0xc4, 0xe3, 0x65, 0x44, 0x50, 0x40, 0x03,
 279  		0x62, 0xf3, 0x65, 0x48, 0x44, 0x50, 0x01, 0x03,
 280  		0x62, 0xe3, 0x65, 0x08, 0x44, 0x60, 0x04, 0x03,
 281  		0x62, 0xe3, 0x65, 0x28, 0x44, 0x60, 0x02, 0x03,
 282  		0x62, 0xe3, 0x65, 0x48, 0x44, 0x60, 0x01, 0x03,
 283  	};
 284  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 285  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 286  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 287  }
 288  CYBOZU_TEST_AUTO(vcompressb_w)
 289  {
 290  	struct Code : Xbyak::CodeGenerator {
 291  		Code()
 292  		{
 293  			vcompressb(ptr[rax + 64], xmm1);
 294  			vcompressb(xmm30 | k5, xmm1);
 295  			vcompressb(ptr[rax + 64], ymm1);
 296  			vcompressb(ymm30 | k3 |T_z, ymm1);
 297  			vcompressb(ptr[rax + 64], zmm1);
 298  			vcompressb(zmm30 | k2 |T_z, zmm1);
 299  
 300  			vcompressw(ptr[rax + 64], xmm1);
 301  			vcompressw(xmm30 | k5, xmm1);
 302  			vcompressw(ptr[rax + 64], ymm1);
 303  			vcompressw(ymm30 | k3 |T_z, ymm1);
 304  			vcompressw(ptr[rax + 64], zmm1);
 305  			vcompressw(zmm30 | k2 |T_z, zmm1);
 306  		}
 307  	} c;
 308  	const uint8_t tbl[] = {
 309  		0x62, 0xf2, 0x7d, 0x08, 0x63, 0x48, 0x40,
 310  		0x62, 0x92, 0x7d, 0x0d, 0x63, 0xce,
 311  		0x62, 0xf2, 0x7d, 0x28, 0x63, 0x48, 0x40,
 312  		0x62, 0x92, 0x7d, 0xab, 0x63, 0xce,
 313  		0x62, 0xf2, 0x7d, 0x48, 0x63, 0x48, 0x40,
 314  		0x62, 0x92, 0x7d, 0xca, 0x63, 0xce,
 315  
 316  		0x62, 0xf2, 0xfd, 0x08, 0x63, 0x48, 0x20,
 317  		0x62, 0x92, 0xfd, 0x0d, 0x63, 0xce,
 318  		0x62, 0xf2, 0xfd, 0x28, 0x63, 0x48, 0x20,
 319  		0x62, 0x92, 0xfd, 0xab, 0x63, 0xce,
 320  		0x62, 0xf2, 0xfd, 0x48, 0x63, 0x48, 0x20,
 321  		0x62, 0x92, 0xfd, 0xca, 0x63, 0xce,
 322  	};
 323  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 324  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 325  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 326  }
 327  CYBOZU_TEST_AUTO(shld)
 328  {
 329  	struct Code : Xbyak::CodeGenerator {
 330  		Code()
 331  		{
 332  			vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 333  			vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 334  			vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 335  
 336  			vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 337  			vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 338  			vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 339  
 340  			vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 341  			vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 342  			vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 343  
 344  			vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 345  			vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 346  			vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 347  
 348  			vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 349  			vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 350  			vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 351  
 352  			vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 353  			vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 354  			vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 355  		}
 356  	} c;
 357  	const uint8_t tbl[] = {
 358  		0x62, 0xf3, 0xed, 0x8b, 0x70, 0x68, 0x04, 0x05,
 359  		0x62, 0xf3, 0xed, 0xab, 0x70, 0x68, 0x02, 0x05,
 360  		0x62, 0xf3, 0xed, 0xcb, 0x70, 0x68, 0x01, 0x05,
 361  
 362  		0x62, 0xf3, 0x6d, 0x8b, 0x71, 0x68, 0x04, 0x05,
 363  		0x62, 0xf3, 0x6d, 0xab, 0x71, 0x68, 0x02, 0x05,
 364  		0x62, 0xf3, 0x6d, 0xcb, 0x71, 0x68, 0x01, 0x05,
 365  
 366  		0x62, 0xf3, 0xed, 0x8b, 0x71, 0x68, 0x04, 0x05,
 367  		0x62, 0xf3, 0xed, 0xab, 0x71, 0x68, 0x02, 0x05,
 368  		0x62, 0xf3, 0xed, 0xcb, 0x71, 0x68, 0x01, 0x05,
 369  
 370  		0x62, 0xf2, 0xed, 0x8b, 0x70, 0x68, 0x04,
 371  		0x62, 0xf2, 0xed, 0xab, 0x70, 0x68, 0x02,
 372  		0x62, 0xf2, 0xed, 0xcb, 0x70, 0x68, 0x01,
 373  
 374  		0x62, 0xf2, 0x6d, 0x8b, 0x71, 0x68, 0x04,
 375  		0x62, 0xf2, 0x6d, 0xab, 0x71, 0x68, 0x02,
 376  		0x62, 0xf2, 0x6d, 0xcb, 0x71, 0x68, 0x01,
 377  
 378  		0x62, 0xf2, 0xed, 0x8b, 0x71, 0x68, 0x04,
 379  		0x62, 0xf2, 0xed, 0xab, 0x71, 0x68, 0x02,
 380  		0x62, 0xf2, 0xed, 0xcb, 0x71, 0x68, 0x01,
 381  	};
 382  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 383  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 384  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 385  }
 386  CYBOZU_TEST_AUTO(shrd)
 387  {
 388  	struct Code : Xbyak::CodeGenerator {
 389  		Code()
 390  		{
 391  			vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 392  			vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 393  			vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 394  
 395  			vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 396  			vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 397  			vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 398  
 399  			vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
 400  			vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
 401  			vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
 402  
 403  			vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 404  			vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 405  			vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 406  
 407  			vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 408  			vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 409  			vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 410  
 411  			vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
 412  			vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
 413  			vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
 414  
 415  			vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
 416  			vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
 417  			vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
 418  
 419  			vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
 420  			vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
 421  			vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
 422  
 423  			vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
 424  			vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
 425  			vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
 426  
 427  			vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
 428  			vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
 429  			vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
 430  		}
 431  	} c;
 432  	const uint8_t tbl[] = {
 433  		0x62, 0xf3, 0xed, 0x8b, 0x72, 0x68, 0x04, 0x05,
 434  		0x62, 0xf3, 0xed, 0xab, 0x72, 0x68, 0x02, 0x05,
 435  		0x62, 0xf3, 0xed, 0xcb, 0x72, 0x68, 0x01, 0x05,
 436  
 437  		0x62, 0xf3, 0x6d, 0x8b, 0x73, 0x68, 0x04, 0x05,
 438  		0x62, 0xf3, 0x6d, 0xab, 0x73, 0x68, 0x02, 0x05,
 439  		0x62, 0xf3, 0x6d, 0xcb, 0x73, 0x68, 0x01, 0x05,
 440  
 441  		0x62, 0xf3, 0xed, 0x8b, 0x73, 0x68, 0x04, 0x05,
 442  		0x62, 0xf3, 0xed, 0xab, 0x73, 0x68, 0x02, 0x05,
 443  		0x62, 0xf3, 0xed, 0xcb, 0x73, 0x68, 0x01, 0x05,
 444  
 445  		0x62, 0xf2, 0xed, 0x8b, 0x72, 0x68, 0x04,
 446  		0x62, 0xf2, 0xed, 0xab, 0x72, 0x68, 0x02,
 447  		0x62, 0xf2, 0xed, 0xcb, 0x72, 0x68, 0x01,
 448  
 449  		0x62, 0xf2, 0x6d, 0x8b, 0x73, 0x68, 0x04,
 450  		0x62, 0xf2, 0x6d, 0xab, 0x73, 0x68, 0x02,
 451  		0x62, 0xf2, 0x6d, 0xcb, 0x73, 0x68, 0x01,
 452  
 453  		0x62, 0xf2, 0xed, 0x8b, 0x73, 0x68, 0x04,
 454  		0x62, 0xf2, 0xed, 0xab, 0x73, 0x68, 0x02,
 455  		0x62, 0xf2, 0xed, 0xcb, 0x73, 0x68, 0x01,
 456  
 457  		0x62, 0xf3, 0x6d, 0x9b, 0x73, 0x68, 0x10, 0x05,
 458  		0x62, 0xf3, 0x6d, 0xbb, 0x73, 0x68, 0x10, 0x05,
 459  		0x62, 0xf3, 0x6d, 0xdb, 0x73, 0x68, 0x10, 0x05,
 460  
 461  		0x62, 0xf3, 0xed, 0x9b, 0x73, 0x68, 0x08, 0x05,
 462  		0x62, 0xf3, 0xed, 0xbb, 0x73, 0x68, 0x08, 0x05,
 463  		0x62, 0xf3, 0xed, 0xdb, 0x73, 0x68, 0x08, 0x05,
 464  
 465  		0x62, 0xf2, 0x6d, 0x9b, 0x73, 0x68, 0x10,
 466  		0x62, 0xf2, 0x6d, 0xbb, 0x73, 0x68, 0x10,
 467  		0x62, 0xf2, 0x6d, 0xdb, 0x73, 0x68, 0x10,
 468  
 469  		0x62, 0xf2, 0xed, 0x9b, 0x73, 0x68, 0x08,
 470  		0x62, 0xf2, 0xed, 0xbb, 0x73, 0x68, 0x08,
 471  		0x62, 0xf2, 0xed, 0xdb, 0x73, 0x68, 0x08,
 472  	};
 473  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 474  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 475  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 476  }
 477  CYBOZU_TEST_AUTO(vpopcnt)
 478  {
 479  	struct Code : Xbyak::CodeGenerator {
 480  		Code()
 481  		{
 482  			vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]);
 483  			vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]);
 484  			vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]);
 485  
 486  			vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]);
 487  			vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]);
 488  			vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]);
 489  
 490  			vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]);
 491  			vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]);
 492  			vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]);
 493  
 494  			vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]);
 495  			vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]);
 496  			vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]);
 497  
 498  			vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]);
 499  			vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]);
 500  			vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]);
 501  
 502  			vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]);
 503  			vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]);
 504  			vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]);
 505  		}
 506  	} c;
 507  	const uint8_t tbl[] = {
 508  		0x62, 0xf2, 0x7d, 0x8b, 0x54, 0x68, 0x04,
 509  		0x62, 0xf2, 0x7d, 0xab, 0x54, 0x68, 0x02,
 510  		0x62, 0xf2, 0x7d, 0xcb, 0x54, 0x68, 0x01,
 511  
 512  		0x62, 0xf2, 0xfd, 0x8b, 0x54, 0x68, 0x04,
 513  		0x62, 0xf2, 0xfd, 0xab, 0x54, 0x68, 0x02,
 514  		0x62, 0xf2, 0xfd, 0xcb, 0x54, 0x68, 0x01,
 515  
 516  		0x62, 0xf2, 0x7d, 0x8b, 0x55, 0x68, 0x04,
 517  		0x62, 0xf2, 0x7d, 0xab, 0x55, 0x68, 0x02,
 518  		0x62, 0xf2, 0x7d, 0xcb, 0x55, 0x68, 0x01,
 519  
 520  		0x62, 0xf2, 0x7d, 0x9b, 0x55, 0x68, 0x10,
 521  		0x62, 0xf2, 0x7d, 0xbb, 0x55, 0x68, 0x10,
 522  		0x62, 0xf2, 0x7d, 0xdb, 0x55, 0x68, 0x10,
 523  
 524  		0x62, 0xf2, 0xfd, 0x8b, 0x55, 0x68, 0x04,
 525  		0x62, 0xf2, 0xfd, 0xab, 0x55, 0x68, 0x02,
 526  		0x62, 0xf2, 0xfd, 0xcb, 0x55, 0x68, 0x01,
 527  
 528  		0x62, 0xf2, 0xfd, 0x9b, 0x55, 0x68, 0x08,
 529  		0x62, 0xf2, 0xfd, 0xbb, 0x55, 0x68, 0x08,
 530  		0x62, 0xf2, 0xfd, 0xdb, 0x55, 0x68, 0x08,
 531  	};
 532  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 533  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 534  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 535  }
 536  CYBOZU_TEST_AUTO(vpdpbus)
 537  {
 538  	struct Code : Xbyak::CodeGenerator {
 539  		Code()
 540  		{
 541  			vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
 542  			vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
 543  			vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
 544  
 545  			vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
 546  			vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
 547  			vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
 548  
 549  			vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
 550  			vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
 551  			vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
 552  
 553  			vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
 554  			vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
 555  			vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
 556  
 557  			vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
 558  			vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
 559  			vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
 560  
 561  			vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
 562  			vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
 563  			vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
 564  
 565  			vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
 566  			vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
 567  			vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
 568  
 569  			vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
 570  			vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
 571  			vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
 572  		}
 573  	} c;
 574  	const uint8_t tbl[] = {
 575  		0x62, 0xf2, 0x5d, 0x83, 0x50, 0x68, 0x04,
 576  		0x62, 0xf2, 0x5d, 0xa3, 0x50, 0x68, 0x02,
 577  		0x62, 0xf2, 0x5d, 0xc3, 0x50, 0x68, 0x01,
 578  
 579  		0x62, 0xf2, 0x5d, 0x93, 0x50, 0x68, 0x10,
 580  		0x62, 0xf2, 0x5d, 0xb3, 0x50, 0x68, 0x10,
 581  		0x62, 0xf2, 0x5d, 0xd3, 0x50, 0x68, 0x10,
 582  
 583  		0x62, 0xf2, 0x5d, 0x83, 0x51, 0x68, 0x04,
 584  		0x62, 0xf2, 0x5d, 0xa3, 0x51, 0x68, 0x02,
 585  		0x62, 0xf2, 0x5d, 0xc3, 0x51, 0x68, 0x01,
 586  
 587  		0x62, 0xf2, 0x5d, 0x93, 0x51, 0x68, 0x10,
 588  		0x62, 0xf2, 0x5d, 0xb3, 0x51, 0x68, 0x10,
 589  		0x62, 0xf2, 0x5d, 0xd3, 0x51, 0x68, 0x10,
 590  
 591  		0x62, 0xf2, 0x5d, 0x83, 0x52, 0x68, 0x04,
 592  		0x62, 0xf2, 0x5d, 0xa3, 0x52, 0x68, 0x02,
 593  		0x62, 0xf2, 0x5d, 0xc3, 0x52, 0x68, 0x01,
 594  
 595  		0x62, 0xf2, 0x5d, 0x93, 0x52, 0x68, 0x10,
 596  		0x62, 0xf2, 0x5d, 0xb3, 0x52, 0x68, 0x10,
 597  		0x62, 0xf2, 0x5d, 0xd3, 0x52, 0x68, 0x10,
 598  
 599  		0x62, 0xf2, 0x5d, 0x83, 0x53, 0x68, 0x04,
 600  		0x62, 0xf2, 0x5d, 0xa3, 0x53, 0x68, 0x02,
 601  		0x62, 0xf2, 0x5d, 0xc3, 0x53, 0x68, 0x01,
 602  
 603  		0x62, 0xf2, 0x5d, 0x93, 0x53, 0x68, 0x10,
 604  		0x62, 0xf2, 0x5d, 0xb3, 0x53, 0x68, 0x10,
 605  		0x62, 0xf2, 0x5d, 0xd3, 0x53, 0x68, 0x10,
 606  	};
 607  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 608  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 609  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 610  }
 611  CYBOZU_TEST_AUTO(vexpand_vpshufbitqmb)
 612  {
 613  	struct Code : Xbyak::CodeGenerator {
 614  		Code()
 615  		{
 616  			vpexpandb(xmm5|k3|T_z, xmm30);
 617  			vpexpandb(ymm5|k3|T_z, ymm30);
 618  			vpexpandb(zmm5|k3|T_z, zmm30);
 619  			vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]);
 620  			vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]);
 621  			vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]);
 622  
 623  			vpexpandw(xmm5|k3|T_z, xmm30);
 624  			vpexpandw(ymm5|k3|T_z, ymm30);
 625  			vpexpandw(zmm5|k3|T_z, zmm30);
 626  			vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]);
 627  			vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]);
 628  			vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]);
 629  
 630  			vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]);
 631  			vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]);
 632  			vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]);
 633  		}
 634  	} c;
 635  	const uint8_t tbl[] = {
 636  		0x62, 0x92, 0x7d, 0x8b, 0x62, 0xee,
 637  		0x62, 0x92, 0x7d, 0xab, 0x62, 0xee,
 638  		0x62, 0x92, 0x7d, 0xcb, 0x62, 0xee,
 639  		0x62, 0xf2, 0x7d, 0x8b, 0x62, 0x68, 0x40,
 640  		0x62, 0xf2, 0x7d, 0xab, 0x62, 0x68, 0x40,
 641  		0x62, 0xf2, 0x7d, 0xcb, 0x62, 0x68, 0x40,
 642  
 643  		0x62, 0x92, 0xfd, 0x8b, 0x62, 0xee,
 644  		0x62, 0x92, 0xfd, 0xab, 0x62, 0xee,
 645  		0x62, 0x92, 0xfd, 0xcb, 0x62, 0xee,
 646  		0x62, 0xf2, 0xfd, 0x8b, 0x62, 0x68, 0x20,
 647  		0x62, 0xf2, 0xfd, 0xab, 0x62, 0x68, 0x20,
 648  		0x62, 0xf2, 0xfd, 0xcb, 0x62, 0x68, 0x20,
 649  
 650  		0x62, 0xf2, 0x6d, 0x0a, 0x8f, 0x48, 0x04,
 651  		0x62, 0xf2, 0x6d, 0x2a, 0x8f, 0x48, 0x02,
 652  		0x62, 0xf2, 0x6d, 0x4a, 0x8f, 0x48, 0x01,
 653  	};
 654  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 655  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 656  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 657  }
 658  CYBOZU_TEST_AUTO(gf2)
 659  {
 660  	struct Code : Xbyak::CodeGenerator {
 661  		Code()
 662  		{
 663  			///
 664  			gf2p8affineinvqb(xmm1, xmm2, 3);
 665  			gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3);
 666  
 667  			vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3);
 668  			vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3);
 669  			vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3);
 670  			vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3);
 671  
 672  			vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5);
 673  			vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5);
 674  			vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5);
 675  
 676  			vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
 677  			vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
 678  			vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
 679  
 680  			vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
 681  			vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
 682  			vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
 683  			///
 684  			gf2p8affineqb(xmm1, xmm2, 3);
 685  			gf2p8affineqb(xmm1, ptr [rax + 0x40], 3);
 686  
 687  			vgf2p8affineqb(xmm1, xmm5, xmm2, 3);
 688  			vgf2p8affineqb(ymm1, ymm5, ymm2, 3);
 689  			vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3);
 690  			vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3);
 691  
 692  			vgf2p8affineqb(xmm30, xmm31, xmm4, 5);
 693  			vgf2p8affineqb(ymm30, ymm31, ymm4, 5);
 694  			vgf2p8affineqb(zmm30, zmm31, zmm4, 5);
 695  
 696  			vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
 697  			vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
 698  			vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
 699  
 700  			vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
 701  			vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
 702  			vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
 703  			///
 704  			gf2p8mulb(xmm1, xmm2);
 705  			gf2p8mulb(xmm1, ptr [rax + 0x40]);
 706  
 707  			vgf2p8mulb(xmm1, xmm5, xmm2);
 708  			vgf2p8mulb(ymm1, ymm5, ymm2);
 709  			vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]);
 710  			vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]);
 711  
 712  			vgf2p8mulb(xmm30, xmm31, xmm4);
 713  			vgf2p8mulb(ymm30, ymm31, ymm4);
 714  			vgf2p8mulb(zmm30, zmm31, zmm4);
 715  
 716  			vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]);
 717  			vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]);
 718  			vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]);
 719  		}
 720  	} c;
 721  	const uint8_t tbl[] = {
 722  		0x66, 0x0f, 0x3a, 0xcf, 0xca, 0x03,
 723  		0x66, 0x0f, 0x3a, 0xcf, 0x48, 0x40, 0x03,
 724  		0xc4, 0xe3, 0xd1, 0xcf, 0xca, 0x03,
 725  		0xc4, 0xe3, 0xd5, 0xcf, 0xca, 0x03,
 726  		0xc4, 0xe3, 0xd1, 0xcf, 0x48, 0x40, 0x03,
 727  		0xc4, 0xe3, 0xd5, 0xcf, 0x48, 0x40, 0x03,
 728  		0x62, 0x63, 0x85, 0x00, 0xcf, 0xf4, 0x05,
 729  		0x62, 0x63, 0x85, 0x20, 0xcf, 0xf4, 0x05,
 730  		0x62, 0x63, 0x85, 0x40, 0xcf, 0xf4, 0x05,
 731  		0x62, 0x63, 0xd5, 0x89, 0xcf, 0x70, 0x04, 0x05,
 732  		0x62, 0x63, 0xd5, 0xa9, 0xcf, 0x70, 0x02, 0x05,
 733  		0x62, 0x63, 0xd5, 0xc9, 0xcf, 0x70, 0x01, 0x05,
 734  		0x62, 0x63, 0xd5, 0x99, 0xcf, 0x70, 0x08, 0x05,
 735  		0x62, 0x63, 0xd5, 0xb9, 0xcf, 0x70, 0x08, 0x05,
 736  		0x62, 0x63, 0xd5, 0xd9, 0xcf, 0x70, 0x08, 0x05,
 737  
 738  		0x66, 0x0f, 0x3a, 0xce, 0xca, 0x03,
 739  		0x66, 0x0f, 0x3a, 0xce, 0x48, 0x40, 0x03,
 740  		0xc4, 0xe3, 0xd1, 0xce, 0xca, 0x03,
 741  		0xc4, 0xe3, 0xd5, 0xce, 0xca, 0x03,
 742  		0xc4, 0xe3, 0xd1, 0xce, 0x48, 0x40, 0x03,
 743  		0xc4, 0xe3, 0xd5, 0xce, 0x48, 0x40, 0x03,
 744  		0x62, 0x63, 0x85, 0x00, 0xce, 0xf4, 0x05,
 745  		0x62, 0x63, 0x85, 0x20, 0xce, 0xf4, 0x05,
 746  		0x62, 0x63, 0x85, 0x40, 0xce, 0xf4, 0x05,
 747  		0x62, 0x63, 0xd5, 0x89, 0xce, 0x70, 0x04, 0x05,
 748  		0x62, 0x63, 0xd5, 0xa9, 0xce, 0x70, 0x02, 0x05,
 749  		0x62, 0x63, 0xd5, 0xc9, 0xce, 0x70, 0x01, 0x05,
 750  		0x62, 0x63, 0xd5, 0x99, 0xce, 0x70, 0x08, 0x05,
 751  		0x62, 0x63, 0xd5, 0xb9, 0xce, 0x70, 0x08, 0x05,
 752  		0x62, 0x63, 0xd5, 0xd9, 0xce, 0x70, 0x08, 0x05,
 753  
 754  		0x66, 0x0f, 0x38, 0xcf, 0xca,
 755  		0x66, 0x0f, 0x38, 0xcf, 0x48, 0x40,
 756  		0xc4, 0xe2, 0x51, 0xcf, 0xca,
 757  		0xc4, 0xe2, 0x55, 0xcf, 0xca,
 758  		0xc4, 0xe2, 0x51, 0xcf, 0x48, 0x40,
 759  		0xc4, 0xe2, 0x55, 0xcf, 0x48, 0x40,
 760  		0x62, 0x62, 0x05, 0x00, 0xcf, 0xf4,
 761  		0x62, 0x62, 0x05, 0x20, 0xcf, 0xf4,
 762  		0x62, 0x62, 0x05, 0x40, 0xcf, 0xf4,
 763  		0x62, 0x62, 0x55, 0x89, 0xcf, 0x70, 0x04,
 764  		0x62, 0x62, 0x55, 0xa9, 0xcf, 0x70, 0x02,
 765  		0x62, 0x62, 0x55, 0xc9, 0xcf, 0x70, 0x01,
 766  	};
 767  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 768  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 769  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 770  }
 771  
 772  CYBOZU_TEST_AUTO(bf16)
 773  {
 774  	struct Code : Xbyak::CodeGenerator {
 775  		Code()
 776  		{
 777  			vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
 778  			vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
 779  			vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
 780  
 781  			vcvtneps2bf16(xmm0, xword [rax + 64]);
 782  			vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
 783  			vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
 784  			vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
 785  
 786  			vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
 787  			vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
 788  			vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
 789  		}
 790  	} c;
 791  	const uint8_t tbl[] = {
 792  		0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04,
 793  		0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02,
 794  		0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01,
 795  
 796  		0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04,
 797  		0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02,
 798  		0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
 799  		0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01,
 800  
 801  		0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04,
 802  		0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02,
 803  		0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01,
 804  	};
 805  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 806  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 807  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 808  }
 809  
 810  CYBOZU_TEST_AUTO(AMX)
 811  {
 812  	struct Code : Xbyak::CodeGenerator {
 813  		Code()
 814  		{
 815  			ldtilecfg(ptr[rax + rcx * 4 + 64]);
 816  			sttilecfg(ptr[rsp + rax * 8 + 128]);
 817  			tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
 818  			tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
 819  			tilerelease();
 820  			tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
 821  			tilezero(tmm7);
 822  			tdpbssd(tmm1, tmm2, tmm3);
 823  			tdpbsud(tmm2, tmm3, tmm4);
 824  			tdpbusd(tmm3, tmm4, tmm5);
 825  			tdpbuud(tmm4, tmm5, tmm6);
 826  			tdpbf16ps(tmm5, tmm6, tmm7);
 827  		}
 828  	} c;
 829  	// generated code by patch
 830  	const uint8_t tbl[] = {
 831  		0xc4, 0xe2, 0x78, 0x49, 0x44, 0x88, 0x40, 0xc4, 0xe2, 0x79, 0x49, 0x84, 0xc4, 0x80, 0x00, 0x00,
 832  		0x00, 0xc4, 0xe2, 0x7b, 0x4b, 0x5c, 0x57, 0x08, 0xc4, 0x82, 0x79, 0x4b, 0x64, 0x08, 0x20, 0xc4,
 833  		0xe2, 0x78, 0x49, 0xc0, 0xc4, 0x82, 0x7a, 0x4b, 0x54, 0x5a, 0x20, 0xc4, 0xe2, 0x7b, 0x49, 0xf8,
 834  		0xc4, 0xe2, 0x63, 0x5e, 0xca, 0xc4, 0xe2, 0x5a, 0x5e, 0xd3, 0xc4, 0xe2, 0x51, 0x5e, 0xdc, 0xc4,
 835  		0xe2, 0x48, 0x5e, 0xe5, 0xc4, 0xe2, 0x42, 0x5c, 0xee,
 836  	};
 837  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 838  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 839  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 840  }
 841  
 842  CYBOZU_TEST_AUTO(tileloadd)
 843  {
 844  	struct Code : Xbyak::CodeGenerator {
 845  		Code()
 846  		{
 847  			tileloadd(tmm1, ptr[r8+r8]);
 848  			tileloadd(tmm1, ptr[rax+rcx*4]);
 849  			tileloadd(tmm1, ptr[r8+r9*1+0x40]);
 850  		}
 851  		void notSupported()
 852  		{
 853  			tileloadd(tmm1, ptr[r8]);
 854  		}
 855  		void notSupported2()
 856  		{
 857  			tileloadd(tmm1, ptr[r8*2]);
 858  		}
 859  	} c;
 860  	const uint8_t tbl[] = {
 861  		0xC4, 0x82, 0x7B, 0x4B, 0x0C, 0x00,
 862  		0xC4, 0xE2, 0x7B, 0x4B, 0x0C, 0x88,
 863  		0xC4, 0x82, 0x7B, 0x4B, 0x4C, 0x08, 0x40,
 864  	};
 865  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 866  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 867  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 868  
 869  	// current version does not support this sibmem format
 870  	CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception);
 871  	CYBOZU_TEST_EXCEPTION(c.notSupported2(), std::exception);
 872  }
 873  
 874  CYBOZU_TEST_AUTO(vnni)
 875  {
 876  	struct Code : Xbyak::CodeGenerator {
 877  		Code()
 878  		{
 879  			// default encoding is EVEX
 880  			vpdpbusd(xm0, xm1, xm2);
 881  			vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX
 882  			vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX
 883  			setDefaultEncoding(VexEncoding);
 884  			vpdpbusd(xm0, xm1, xm2); // VEX
 885  			setDefaultEncoding(EvexEncoding);
 886  			vpdpbusd(xm0, xm1, xm2); // EVEX
 887  		}
 888  		void badVex()
 889  		{
 890  			vpdpbusd(xm0, xm1, xm31, VexEncoding);
 891  		}
 892  	} c;
 893  	const uint8_t tbl[] = {
 894  		0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
 895  		0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
 896  		0xC4, 0xE2, 0x71, 0x50, 0xC2,
 897  		0xC4, 0xE2, 0x71, 0x50, 0xC2,
 898  		0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2,
 899  	};
 900  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
 901  	CYBOZU_TEST_EQUAL(c.getSize(), n);
 902  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
 903  
 904  	CYBOZU_TEST_EXCEPTION(c.badVex(), std::exception);
 905  }
 906  
 907  CYBOZU_TEST_AUTO(vaddph)
 908  {
 909  	struct Code : Xbyak::CodeGenerator {
 910  		Code()
 911  		{
 912  			vaddph(zmm0, zmm1, ptr[rax+64]);
 913  			vaddph(ymm0, ymm1, ptr[rax+64]);
 914  			vaddph(xmm0, xmm1, ptr[rax+64]);
 915  
 916  			vaddph(zmm0, zmm1, ptr_b[rax+64]);
 917  			vaddph(ymm0, ymm1, ptr_b[rax+64]);
 918  			vaddph(xmm0, xmm1, ptr_b[rax+64]);
 919  
 920  			vaddsh(xmm0, xmm15, ptr[rax+64]);
 921  			vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3);
 922  
 923  			vcmpph(k1, xm15, ptr[rax+64], 1);
 924  			vcmpph(k2, ym15, ptr[rax+64], 2);
 925  			vcmpph(k3, zm15, ptr[rax+64], 3);
 926  			vcmpph(k1, xm15, ptr_b[rax+64], 1);
 927  			vcmpph(k2, ym15, ptr_b[rax+64], 2);
 928  			vcmpph(k3, zm15, ptr_b[rax+64], 3);
 929  
 930  			vcmpsh(k1, xm15, ptr[rax+64], 1);
 931  			vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4);
 932  
 933  			vcomish(xmm1, ptr[rax+64]);
 934  			vcomish(xmm1|T_sae, xmm15);
 935  
 936  			vucomish(xmm1, ptr [rax+0x40]);
 937  			vucomish(xmm1|T_sae, xmm15);
 938  
 939  			vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]);
 940  			vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]);
 941  			vfmaddsub213ph(xmm1|k3, xmm2, xmm5);
 942  			vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]);
 943  			vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]);
 944  			vfmaddsub213ph(ymm1|k3, ymm2, ymm5);
 945  			vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]);
 946  			vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]);
 947  			vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5);
 948  
 949  			vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]);
 950  			vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
 951  			vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]);
 952  			vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
 953  			vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]);
 954  			vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
 955  			vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5);
 956  
 957  			vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]);
 958  			vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
 959  			vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]);
 960  			vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
 961  			vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]);
 962  			vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
 963  			vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5);
 964  
 965  			vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
 966  			vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]);
 967  			vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]);
 968  			vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
 969  			vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]);
 970  			vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
 971  			vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
 972  
 973  			vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
 974  			vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
 975  			vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
 976  			vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
 977  
 978  			vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
 979  			vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
 980  
 981  			vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
 982  			vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
 983  
 984  			vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
 985  			vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
 986  			vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
 987  			vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
 988  
 989  			vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]);
 990  			vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]);
 991  			vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]);
 992  			vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5);
 993  			vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]);
 994  			vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
 995  			vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]);
 996  
 997  			vfmaddcph(xm1, xm2, ptr[rax+0x40]);
 998  			vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]);
 999  			vfmaddcph(zm1, zm2, ptr_b[rax+0x40]);
1000  
1001  			vfcmulcph(xmm1, xmm2, ptr [rax+0x40]);
1002  			vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
1003  			vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
1004  
1005  			vfmulcph(xmm1, xmm2, ptr [rax+0x40]);
1006  			vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
1007  			vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
1008  
1009  			vrcpph(xmm1, ptr [rax+0x40]);
1010  			vrcpph(xmm1, ptr_b [rax+0x40]);
1011  			vrcpph(ymm1, ptr [rax+0x40]);
1012  			vrcpph(ymm1, ptr_b [rax+0x40]);
1013  			vrcpph(zmm1, ptr [rax+0x40]);
1014  			vrcpph(zmm1, ptr_b [rax+0x40]);
1015  
1016  			vrcpsh(xmm1, xmm3, ptr [rax+0x40]);
1017  
1018  			vrsqrtph(xmm1, ptr [rax+0x40]);
1019  			vrsqrtph(xmm1, ptr_b [rax+0x40]);
1020  			vrsqrtph(ymm2, ptr [rax+0x40]);
1021  			vrsqrtph(ymm2, ptr_b [rax+0x40]);
1022  			vrsqrtph(zmm2, ptr [rax+0x40]);
1023  			vrsqrtph(zmm2, ptr_b [rax+0x40]);
1024  
1025  			vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]);
1026  
1027  			vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]);
1028  			vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]);
1029  			vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]);
1030  			vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]);
1031  			vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]);
1032  
1033  			vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]);
1034  			vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7);
1035  
1036  			vscalefph(xmm1, xmm5, ptr [rax+0x40]);
1037  			vscalefph(xmm1, xmm5, ptr_b [rax+0x40]);
1038  			vscalefph(ymm1, ymm5, ptr [rax+0x40]);
1039  			vscalefph(ymm1, ymm5, ptr_b [rax+0x40]);
1040  			vscalefph(zmm1, zmm5, ptr [rax+0x40]);
1041  			vscalefph(zmm1, zmm5, ptr_b [rax+0x40]);
1042  			vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7);
1043  
1044  			vscalefsh(xmm1, xmm5, ptr [rax+0x40]);
1045  			vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7);
1046  
1047  			vreduceph(xmm1, ptr [rax+0x40], 0x1);
1048  			vreduceph(xmm1, ptr_b [rax+0x40], 0x2);
1049  			vreduceph(ymm1, ptr [rax+0x40], 0x3);
1050  			vreduceph(ymm1, ptr_b [rax+0x40], 0x4);
1051  			vreduceph(zmm1, ptr [rax+0x40], 0x5);
1052  			vreduceph(zmm1, ptr_b [rax+0x40], 0x6);
1053  			vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
1054  
1055  			vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
1056  			vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
1057  
1058  			vrndscaleph(xmm1, ptr [rax+0x40], 0x1);
1059  			vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2);
1060  			vrndscaleph(ymm1, ptr [rax+0x40], 0x3);
1061  			vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4);
1062  			vrndscaleph(zmm1, ptr [rax+0x40], 0x5);
1063  			vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6);
1064  			vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
1065  
1066  			vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
1067  			vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
1068  
1069  			vfpclassph(k1, xword [rax+0x40], 0x1);
1070  			vfpclassph(k1, xword_b[rax+0x40], 0x2);
1071  			vfpclassph(k1, yword [rax+0x40], 0x3);
1072  			vfpclassph(k1, yword_b[rax+0x40], 0x4);
1073  			vfpclassph(k1, zword [rax+0x40], 0x5);
1074  			vfpclassph(k1, zword_b[rax+0x40], 0x6);
1075  
1076  			vfpclasssh(k1|k2, xmm3, 0x5);
1077  			vfpclasssh(k1|k2, ptr [rax+0x40], 0x5);
1078  
1079  			vgetexpph(xmm1, ptr [rax+0x40]);
1080  			vgetexpph(ymm1, ptr_b [rax+0x40]);
1081  			vgetexpph(zmm1, ptr [rax+0x40]);
1082  			vgetexpph(zmm1|k1|T_z|T_sae, zmm5);
1083  			vgetexpsh(xmm1, xmm5, ptr [rax+0x40]);
1084  			vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5);
1085  
1086  			vgetmantph(xmm1, ptr [rax+0x40], 0x1);
1087  			vgetmantph(ymm1, ptr_b [rax+0x40], 0x2);
1088  			vgetmantph(zmm1, ptr [rax+0x40], 0x3);
1089  			vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4);
1090  
1091  			vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5);
1092  			vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
1093  
1094  			vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
1095  			vmovsh(ptr [rax+0x40]|k1, xmm1);
1096  			vmovsh(xmm1|k2|T_z, xmm3, xmm5);
1097  
1098  			vmovw(xmm1, r13d);
1099  			vmovw(xmm3, ptr [rax+0x40]);
1100  			vmovw(r9d, xmm1);
1101  			vmovw(ptr [rax+0x40], xmm7);
1102  
1103  			vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
1104  			vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
1105  
1106  			vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
1107  			vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]);
1108  
1109  			vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3);
1110  			vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]);
1111  
1112  			vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
1113  			vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]);
1114  
1115  			vcvtsh2si(edx|T_rd_sae, xmm1);
1116  			vcvtsh2si(edx, ptr [rax+0x40]);
1117  			vcvtsh2si(rdx|T_rd_sae, xmm1);
1118  			vcvtsh2si(r8, ptr [rax+0x40]);
1119  
1120  			vcvtph2dq(xmm1, xmm5);
1121  			vcvtph2dq(xmm1, ptr [rax+0x40]);
1122  			vcvtph2dq(xmm1, ptr_b [rax+0x40]);
1123  			vcvtph2dq(ymm1|k2|T_z, xmm5);
1124  			vcvtph2dq(ymm1, ptr [rax+0x40]);
1125  			vcvtph2dq(ymm1, ptr_b [rax+0x40]);
1126  			vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3);
1127  			vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
1128  			vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1129  
1130  			vcvtph2psx(xmm1, xmm5);
1131  			vcvtph2psx(xmm1, ptr [rax+0x40]);
1132  			vcvtph2psx(xmm1, ptr_b [rax+0x40]);
1133  			vcvtph2psx(ymm1|k2|T_z, xmm5);
1134  			vcvtph2psx(ymm1, ptr [rax+0x40]);
1135  			vcvtph2psx(ymm1, ptr_b [rax+0x40]);
1136  			vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3);
1137  			vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]);
1138  			vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]);
1139  
1140  			vcvtph2udq(xmm1, xmm5);
1141  			vcvtph2udq(xmm1, ptr [rax+0x40]);
1142  			vcvtph2udq(xmm1, ptr_b [rax+0x40]);
1143  			vcvtph2udq(ymm1|k2|T_z, xmm5);
1144  			vcvtph2udq(ymm1, ptr [rax+0x40]);
1145  			vcvtph2udq(ymm1, ptr_b [rax+0x40]);
1146  			vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3);
1147  			vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
1148  			vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1149  
1150  			vcvttph2dq(xmm1, xmm5);
1151  			vcvttph2dq(xmm1, ptr [rax+0x40]);
1152  			vcvttph2dq(xmm1, ptr_b [rax+0x40]);
1153  			vcvttph2dq(ymm1|k2|T_z, xmm5);
1154  			vcvttph2dq(ymm1, ptr [rax+0x40]);
1155  			vcvttph2dq(ymm1, ptr_b [rax+0x40]);
1156  			vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3);
1157  			vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
1158  			vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1159  
1160  			vcvttph2udq(xmm1, xmm5);
1161  			vcvttph2udq(xmm1, ptr [rax+0x40]);
1162  			vcvttph2udq(xmm1, ptr_b [rax+0x40]);
1163  			vcvttph2udq(ymm1|k2|T_z, xmm5);
1164  			vcvttph2udq(ymm1, ptr [rax+0x40]);
1165  			vcvttph2udq(ymm1, ptr_b [rax+0x40]);
1166  			vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3);
1167  			vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
1168  			vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1169  
1170  
1171  			vcvtph2pd(xmm1, xmm5);
1172  			vcvtph2pd(xmm1, ptr [rax+0x40]);
1173  			vcvtph2pd(xmm1, ptr_b [rax+0x40]);
1174  			vcvtph2pd(ymm1|k2|T_z, xmm5);
1175  			vcvtph2pd(ymm1, ptr [rax+0x40]);
1176  			vcvtph2pd(ymm1, ptr_b [rax+0x40]);
1177  			vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3);
1178  			vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]);
1179  			vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]);
1180  
1181  			vcvtph2qq(xmm1, xmm5);
1182  			vcvtph2qq(xmm1, ptr [rax+0x40]);
1183  			vcvtph2qq(xmm1, ptr_b [rax+0x40]);
1184  			vcvtph2qq(ymm1|k2|T_z, xmm5);
1185  			vcvtph2qq(ymm1, ptr [rax+0x40]);
1186  			vcvtph2qq(ymm1, ptr_b [rax+0x40]);
1187  			vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3);
1188  			vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
1189  			vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1190  
1191  			vcvtph2uqq(xmm1, xmm5);
1192  			vcvtph2uqq(xmm1, ptr [rax+0x40]);
1193  			vcvtph2uqq(xmm1, ptr_b [rax+0x40]);
1194  			vcvtph2uqq(ymm1|k2|T_z, xmm5);
1195  			vcvtph2uqq(ymm1, ptr [rax+0x40]);
1196  			vcvtph2uqq(ymm1, ptr_b [rax+0x40]);
1197  			vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3);
1198  			vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
1199  			vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1200  
1201  			vcvttph2uqq(xmm1, xmm5);
1202  			vcvttph2uqq(xmm1, ptr [rax+0x40]);
1203  			vcvttph2uqq(xmm1, ptr_b [rax+0x40]);
1204  			vcvttph2uqq(ymm1|k2|T_z, xmm5);
1205  			vcvttph2uqq(ymm1, ptr [rax+0x40]);
1206  			vcvttph2uqq(ymm1, ptr_b [rax+0x40]);
1207  			vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3);
1208  			vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
1209  			vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1210  
1211  			vcvtdq2ph(xmm1, xmm5);
1212  			vcvtdq2ph(xmm1, xword [rax+0x40]);
1213  			vcvtdq2ph(xmm1, xword_b [rax+0x40]);
1214  			vcvtdq2ph(xmm1, yword [rax+0x40]);
1215  			vcvtdq2ph(xmm1, yword_b [rax+0x40]);
1216  			vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
1217  			vcvtdq2ph(ymm1, ptr [rax+0x40]);
1218  			vcvtdq2ph(ymm1, ptr_b [rax+0x40]);
1219  
1220  			vcvtps2phx(xmm1, xmm5);
1221  			vcvtps2phx(xmm1, xword [rax+0x40]);
1222  			vcvtps2phx(xmm1, xword_b [rax+0x40]);
1223  			vcvtps2phx(xmm1, yword [rax+0x40]);
1224  			vcvtps2phx(xmm1, yword_b [rax+0x40]);
1225  			vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5);
1226  			vcvtps2phx(ymm1, ptr [rax+0x40]);
1227  			vcvtps2phx(ymm1, ptr_b [rax+0x40]);
1228  
1229  			vcvtudq2ph(xmm1, xmm5);
1230  			vcvtudq2ph(xmm1, xword [rax+0x40]);
1231  			vcvtudq2ph(xmm1, xword_b [rax+0x40]);
1232  			vcvtudq2ph(xmm1, yword [rax+0x40]);
1233  			vcvtudq2ph(xmm1, yword_b [rax+0x40]);
1234  			vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
1235  			vcvtudq2ph(ymm1, ptr [rax+0x40]);
1236  			vcvtudq2ph(ymm1, ptr_b [rax+0x40]);
1237  
1238  			vcvtpd2ph(xmm1, xmm5);
1239  			vcvtpd2ph(xmm1, ymm5);
1240  			vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
1241  			vcvtpd2ph(xmm1, xword [rax+0x40]);
1242  			vcvtpd2ph(xmm1, xword_b [rax+0x40]);
1243  			vcvtpd2ph(xmm1, yword [rax+0x40]);
1244  			vcvtpd2ph(xmm1, yword_b [rax+0x40]);
1245  			vcvtpd2ph(xmm1, zword [rax+0x40]);
1246  			vcvtpd2ph(xmm1, zword_b [rax+0x40]);
1247  
1248  			vcvtqq2ph(xmm1, xmm5);
1249  			vcvtqq2ph(xmm1, ymm5);
1250  			vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
1251  			vcvtqq2ph(xmm1, xword [rax+0x40]);
1252  			vcvtqq2ph(xmm1, xword_b [rax+0x40]);
1253  			vcvtqq2ph(xmm1, yword [rax+0x40]);
1254  			vcvtqq2ph(xmm1, yword_b [rax+0x40]);
1255  			vcvtqq2ph(xmm1, zword [rax+0x40]);
1256  			vcvtqq2ph(xmm1, zword_b [rax+0x40]);
1257  
1258  			vcvtuqq2ph(xmm1, xmm5);
1259  			vcvtuqq2ph(xmm1, ymm5);
1260  			vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
1261  			vcvtuqq2ph(xmm1, xword [rax+0x40]);
1262  			vcvtuqq2ph(xmm1, xword_b [rax+0x40]);
1263  			vcvtuqq2ph(xmm1, yword [rax+0x40]);
1264  			vcvtuqq2ph(xmm1, yword_b [rax+0x40]);
1265  			vcvtuqq2ph(xmm1, zword [rax+0x40]);
1266  			vcvtuqq2ph(xmm1, zword_b [rax+0x40]);
1267  
1268  			vcvtph2uw(xmm1, xmm5);
1269  			vcvtph2uw(xmm1, ptr [rax+0x40]);
1270  			vcvtph2uw(xmm1, ptr_b [rax+0x40]);
1271  			vcvtph2uw(ymm1, ptr [rax+0x40]);
1272  			vcvtph2uw(ymm1, ptr_b [rax+0x40]);
1273  			vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5);
1274  			vcvtph2uw(zmm1, ptr [rax+0x40]);
1275  			vcvtph2uw(zmm1, ptr_b [rax+0x40]);
1276  
1277  			vcvtph2w(xmm1, xmm5);
1278  			vcvtph2w(xmm1, ptr [rax+0x40]);
1279  			vcvtph2w(xmm1, ptr_b [rax+0x40]);
1280  			vcvtph2w(ymm1, ptr [rax+0x40]);
1281  			vcvtph2w(ymm1, ptr_b [rax+0x40]);
1282  			vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5);
1283  			vcvtph2w(zmm1, ptr [rax+0x40]);
1284  			vcvtph2w(zmm1, ptr_b [rax+0x40]);
1285  
1286  			vcvttph2uw(xmm1, xmm5);
1287  			vcvttph2uw(xmm1, ptr [rax+0x40]);
1288  			vcvttph2uw(xmm1, ptr_b [rax+0x40]);
1289  			vcvttph2uw(ymm1, ptr [rax+0x40]);
1290  			vcvttph2uw(ymm1, ptr_b [rax+0x40]);
1291  			vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5);
1292  			vcvttph2uw(zmm1, ptr [rax+0x40]);
1293  			vcvttph2uw(zmm1, ptr_b [rax+0x40]);
1294  
1295  			vcvttph2w(xmm1, xmm5);
1296  			vcvttph2w(xmm1, ptr [rax+0x40]);
1297  			vcvttph2w(xmm1, ptr_b [rax+0x40]);
1298  			vcvttph2w(ymm1, ptr [rax+0x40]);
1299  			vcvttph2w(ymm1, ptr_b [rax+0x40]);
1300  			vcvttph2w(zmm1|k2|T_z|T_sae, zmm5);
1301  			vcvttph2w(zmm1, ptr [rax+0x40]);
1302  			vcvttph2w(zmm1, ptr_b [rax+0x40]);
1303  
1304  			vcvtuw2ph(xmm1, xmm5);
1305  			vcvtuw2ph(xmm1, ptr [rax+0x40]);
1306  			vcvtuw2ph(xmm1, ptr_b [rax+0x40]);
1307  			vcvtuw2ph(ymm1, ptr [rax+0x40]);
1308  			vcvtuw2ph(ymm1, ptr_b [rax+0x40]);
1309  			vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
1310  			vcvtuw2ph(zmm1, ptr [rax+0x40]);
1311  			vcvtuw2ph(zmm1, ptr_b [rax+0x40]);
1312  
1313  			vcvtw2ph(xmm1, xmm5);
1314  			vcvtw2ph(xmm1, ptr [rax+0x40]);
1315  			vcvtw2ph(xmm1, ptr_b [rax+0x40]);
1316  			vcvtw2ph(ymm1, ptr [rax+0x40]);
1317  			vcvtw2ph(ymm1, ptr_b [rax+0x40]);
1318  			vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
1319  			vcvtw2ph(zmm1, ptr [rax+0x40]);
1320  			vcvtw2ph(zmm1, ptr_b [rax+0x40]);
1321  
1322  			vcvtps2ph(xmm1, xmm2, 0x1);
1323  			vcvtps2ph(ptr [rax+0x40], xmm2, 0x2);
1324  			vcvtps2ph(xmm1, ymm2, 0x3);
1325  			vcvtps2ph(ptr [rax+0x40], ymm2, 0x4);
1326  			vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5);
1327  			vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6);
1328  			vcvtps2ph(xmm1|k2, ymm4, 0x7);
1329  			vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8);
1330  			vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9);
1331  			vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa);
1332  
1333  			vcvtsh2usi(ecx|T_rd_sae, xmm1);
1334  			vcvtsh2usi(eax, ptr [rax+0x40]);
1335  			vcvtsh2usi(r9|T_rd_sae, xmm1);
1336  			vcvtsh2usi(r13, ptr [rax+0x40]);
1337  
1338  			vcvttsh2si(ecx|T_sae, xmm1);
1339  			vcvttsh2si(eax, ptr [rax+0x40]);
1340  			vcvttsh2si(r9|T_sae, xmm1);
1341  			vcvttsh2si(r13, ptr [rax+0x40]);
1342  
1343  			vcvttsh2usi(ecx|T_sae, xmm1);
1344  			vcvttsh2usi(eax, ptr [rax+0x40]);
1345  			vcvttsh2usi(r9|T_sae, xmm1);
1346  			vcvttsh2usi(r13, ptr [rax+0x40]);
1347  
1348  			vcvttph2qq(xmm1, xmm5);
1349  			vcvttph2qq(xmm1, ptr [rax+0x40]);
1350  			vcvttph2qq(xmm1, ptr_b [rax+0x40]);
1351  			vcvttph2qq(ymm1|k2|T_z, xmm5);
1352  			vcvttph2qq(ymm1, ptr [rax+0x40]);
1353  			vcvttph2qq(ymm1, ptr_b [rax+0x40]);
1354  			vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3);
1355  			vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
1356  			vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
1357  
1358  			vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax);
1359  			vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]);
1360  			vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9);
1361  			vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]);
1362  
1363  			vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax);
1364  			vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]);
1365  			vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9);
1366  			vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]);
1367  		}
1368  	} c;
1369  	const uint8_t tbl[] = {
1370  		// vaddph
1371  		0x62, 0xF5, 0x74, 0x48, 0x58, 0x40, 0x01,
1372  		0x62, 0xF5, 0x74, 0x28, 0x58, 0x40, 0x02,
1373  		0x62, 0xF5, 0x74, 0x08, 0x58, 0x40, 0x04,
1374  
1375  		0x62, 0xF5, 0x74, 0x58, 0x58, 0x40, 0x20,
1376  		0x62, 0xF5, 0x74, 0x38, 0x58, 0x40, 0x20,
1377  		0x62, 0xF5, 0x74, 0x18, 0x58, 0x40, 0x20,
1378  
1379  		// vaddsh
1380  		0x62, 0xF5, 0x06, 0x08, 0x58, 0x40, 0x20,
1381  		0x62, 0xF5, 0x06, 0xBD, 0x58, 0xC3,
1382  
1383  		// vcmpph
1384  		0x62, 0xf3, 0x04, 0x08, 0xc2, 0x48, 0x04, 0x01,
1385  		0x62, 0xf3, 0x04, 0x28, 0xc2, 0x50, 0x02, 0x02,
1386  		0x62, 0xf3, 0x04, 0x48, 0xc2, 0x58, 0x01, 0x03,
1387  		0x62, 0xf3, 0x04, 0x18, 0xc2, 0x48, 0x20, 0x01,
1388  		0x62, 0xf3, 0x04, 0x38, 0xc2, 0x50, 0x20, 0x02,
1389  		0x62, 0xf3, 0x04, 0x58, 0xc2, 0x58, 0x20, 0x03,
1390  
1391  		// vcmpsh
1392  		0x62, 0xf3, 0x06, 0x08, 0xc2, 0x48, 0x20, 0x01,
1393  		0x62, 0x93, 0x76, 0x1d, 0xc2, 0xd9, 0x04,
1394  
1395  		// vcomish
1396  		0x62, 0xf5, 0x7c, 0x08, 0x2f, 0x48, 0x20,
1397  		0x62, 0xd5, 0x7c, 0x18, 0x2f, 0xcf,
1398  
1399  		// vucomish
1400  		0x62, 0xf5, 0x7c, 0x08, 0x2e, 0x48, 0x20,
1401  		0x62, 0xd5, 0x7c, 0x18, 0x2e, 0xcf,
1402  
1403  		// vfmaddsub213ph
1404  		0x62, 0xf6, 0x6d, 0x08, 0xa6, 0x48, 0x04,
1405  		0x62, 0xf6, 0x6d, 0x18, 0xa6, 0x48, 0x20,
1406  		0x62, 0xf6, 0x6d, 0x0b, 0xa6, 0xcd,
1407  		0x62, 0xf6, 0x6d, 0x28, 0xa6, 0x48, 0x02,
1408  		0x62, 0xf6, 0x6d, 0x38, 0xa6, 0x48, 0x20,
1409  		0x62, 0xf6, 0x6d, 0x2b, 0xa6, 0xcd,
1410  		0x62, 0xf6, 0x6d, 0x48, 0xa6, 0x48, 0x01,
1411  		0x62, 0xf6, 0x6d, 0x58, 0xa6, 0x48, 0x20,
1412  		0x62, 0xf6, 0x6d, 0x58, 0xa6, 0xcd,
1413  
1414  		// vfmsubadd132ph
1415  		0x62, 0xf6, 0x6d, 0x08, 0x97, 0x48, 0x04,
1416  		0x62, 0xf6, 0x6d, 0x18, 0x97, 0x48, 0x20,
1417  		0x62, 0xf6, 0x6d, 0x28, 0x97, 0x48, 0x02,
1418  		0x62, 0xf6, 0x6d, 0x38, 0x97, 0x48, 0x20,
1419  		0x62, 0xf6, 0x6d, 0x48, 0x97, 0x48, 0x01,
1420  		0x62, 0xf6, 0x6d, 0x58, 0x97, 0x48, 0x20,
1421  		0x62, 0xf6, 0x6d, 0x58, 0x97, 0xcd,
1422  
1423  		// vfmadd132ph
1424  		0x62, 0xf6, 0x6d, 0x08, 0x98, 0x48, 0x04,
1425  		0x62, 0xf6, 0x6d, 0x18, 0x98, 0x48, 0x20,
1426  		0x62, 0xf6, 0x6d, 0x28, 0x98, 0x48, 0x02,
1427  		0x62, 0xf6, 0x6d, 0x38, 0x98, 0x48, 0x20,
1428  		0x62, 0xf6, 0x6d, 0x48, 0x98, 0x48, 0x01,
1429  		0x62, 0xf6, 0x6d, 0x58, 0x98, 0x48, 0x20,
1430  		0x62, 0xf6, 0x6d, 0x38, 0x98, 0xcd,
1431  
1432  		// vfmsub231ph
1433  		0x62, 0xf6, 0x6d, 0x08, 0xba, 0x48, 0x04,
1434  		0x62, 0xf6, 0x6d, 0x18, 0xba, 0x48, 0x20,
1435  		0x62, 0xf6, 0x6d, 0x28, 0xba, 0x48, 0x02,
1436  		0x62, 0xf6, 0x6d, 0x38, 0xba, 0x48, 0x20,
1437  		0x62, 0xf6, 0x6d, 0x48, 0xba, 0x48, 0x01,
1438  		0x62, 0xf6, 0x6d, 0x58, 0xba, 0x48, 0x20,
1439  		0x62, 0xf6, 0x6d, 0x38, 0xba, 0xcd,
1440  
1441  		// vfnmsub231ph
1442  		0x62, 0xf6, 0x6d, 0x08, 0xbe, 0x48, 0x04,
1443  		0x62, 0xf6, 0x6d, 0x38, 0xbe, 0x48, 0x20,
1444  		0x62, 0xf6, 0x6d, 0x58, 0xbe, 0x48, 0x20,
1445  		0x62, 0xf6, 0x6d, 0x38, 0xbe, 0xcd,
1446  
1447  		// vfmadd132sh
1448  		0x62, 0xf6, 0x6d, 0xb9, 0x99, 0xcb,
1449  		0x62, 0xf6, 0x6d, 0x08, 0x99, 0x48, 0x20,
1450  
1451  		// vfnmadd132sh
1452  		0x62, 0xf6, 0x6d, 0xb9, 0x9d, 0xcb,
1453  		0x62, 0xf6, 0x6d, 0x08, 0x9d, 0x48, 0x20,
1454  
1455  		// vfmsub132sh
1456  		0x62, 0xf6, 0x6d, 0xb9, 0x9b, 0xcb,
1457  		0x62, 0xf6, 0x6d, 0x08, 0x9b, 0x48, 0x20,
1458  
1459  		// vfnmsub132sh
1460  		0x62, 0xf6, 0x6d, 0xb9, 0x9f, 0xcb,
1461  		0x62, 0xf6, 0x6d, 0x08, 0x9f, 0x48, 0x20,
1462  
1463  		// vfcmaddcph
1464  		0x62, 0xf6, 0x6f, 0x89, 0x56, 0x48, 0x04,
1465  		0x62, 0xf6, 0x6f, 0xa9, 0x56, 0x48, 0x02,
1466  		0x62, 0xf6, 0x6f, 0x49, 0x56, 0x48, 0x01,
1467  		0x62, 0xf6, 0x6f, 0x39, 0x56, 0xcd,
1468  		0x62, 0xf6, 0x6f, 0x99, 0x56, 0x48, 0x10,
1469  		0x62, 0xf6, 0x6f, 0xb9, 0x56, 0x48, 0x10,
1470  		0x62, 0xf6, 0x6f, 0xd9, 0x56, 0x48, 0x10,
1471  
1472  		// vfmaddcph
1473  		0x62, 0xf6, 0x6e, 0x08, 0x56, 0x48, 0x04,
1474  		0x62, 0xf6, 0x6e, 0xb9, 0x56, 0x48, 0x10,
1475  		0x62, 0xf6, 0x6e, 0x58, 0x56, 0x48, 0x10,
1476  
1477  		// vfcmulcph
1478  		0x62, 0xf6, 0x6f, 0x08, 0xd6, 0x48, 0x04,
1479  		0x62, 0xf6, 0x6f, 0xb9, 0xd6, 0x48, 0x10,
1480  		0x62, 0xf6, 0x6f, 0x58, 0xd6, 0x48, 0x10,
1481  
1482  		// vfmulcph
1483  		0x62, 0xf6, 0x6e, 0x08, 0xd6, 0x48, 0x04,
1484  		0x62, 0xf6, 0x6e, 0xb9, 0xd6, 0x48, 0x10,
1485  		0x62, 0xf6, 0x6e, 0x58, 0xd6, 0x48, 0x10,
1486  
1487  		// vrcpph
1488  		0x62, 0xf6, 0x7d, 0x08, 0x4c, 0x48, 0x04,
1489  		0x62, 0xf6, 0x7d, 0x18, 0x4c, 0x48, 0x20,
1490  		0x62, 0xf6, 0x7d, 0x28, 0x4c, 0x48, 0x02,
1491  		0x62, 0xf6, 0x7d, 0x38, 0x4c, 0x48, 0x20,
1492  		0x62, 0xf6, 0x7d, 0x48, 0x4c, 0x48, 0x01,
1493  		0x62, 0xf6, 0x7d, 0x58, 0x4c, 0x48, 0x20,
1494  
1495  		// vrcpsh
1496  		0x62, 0xf6, 0x65, 0x08, 0x4d, 0x48, 0x20,
1497  
1498  		// vrsqrtph
1499  		0x62, 0xf6, 0x7d, 0x08, 0x4e, 0x48, 0x04,
1500  		0x62, 0xf6, 0x7d, 0x18, 0x4e, 0x48, 0x20,
1501  		0x62, 0xf6, 0x7d, 0x28, 0x4e, 0x50, 0x02,
1502  		0x62, 0xf6, 0x7d, 0x38, 0x4e, 0x50, 0x20,
1503  		0x62, 0xf6, 0x7d, 0x48, 0x4e, 0x50, 0x01,
1504  		0x62, 0xf6, 0x7d, 0x58, 0x4e, 0x50, 0x20,
1505  
1506  		// vrsqrtsh
1507  		0x62, 0xf6, 0x45, 0x8d, 0x4f, 0x48, 0x20,
1508  
1509  		// vsqrtph
1510  		0x62, 0xf5, 0x7c, 0x8c, 0x51, 0x48, 0x04,
1511  		0x62, 0xf5, 0x7c, 0x9c, 0x51, 0x48, 0x20,
1512  		0x62, 0xf5, 0x7c, 0xbc, 0x51, 0x48, 0x20,
1513  		0x62, 0xf5, 0x7c, 0xcc, 0x51, 0x48, 0x01,
1514  		0x62, 0xf5, 0x7c, 0xdc, 0x51, 0x48, 0x20,
1515  
1516  		// vsqrtsh
1517  		0x62, 0xf5, 0x56, 0x8c, 0x51, 0x48, 0x20,
1518  		0x62, 0xf5, 0x56, 0xbc, 0x51, 0xcf,
1519  
1520  		// vscalefph
1521  		0x62, 0xf6, 0x55, 0x08, 0x2c, 0x48, 0x04,
1522  		0x62, 0xf6, 0x55, 0x18, 0x2c, 0x48, 0x20,
1523  		0x62, 0xf6, 0x55, 0x28, 0x2c, 0x48, 0x02,
1524  		0x62, 0xf6, 0x55, 0x38, 0x2c, 0x48, 0x20,
1525  		0x62, 0xf6, 0x55, 0x48, 0x2c, 0x48, 0x01,
1526  		0x62, 0xf6, 0x55, 0x58, 0x2c, 0x48, 0x20,
1527  		0x62, 0xf6, 0x55, 0xb9, 0x2c, 0xcf,
1528  
1529  		// vscalefsh
1530  		0x62, 0xf6, 0x55, 0x08, 0x2d, 0x48, 0x20,
1531  		0x62, 0xf6, 0x55, 0xb9, 0x2d, 0xcf,
1532  
1533  		// vreduceph
1534  		0x62, 0xf3, 0x7c, 0x08, 0x56, 0x48, 0x04, 0x01,
1535  		0x62, 0xf3, 0x7c, 0x18, 0x56, 0x48, 0x20, 0x02,
1536  		0x62, 0xf3, 0x7c, 0x28, 0x56, 0x48, 0x02, 0x03,
1537  		0x62, 0xf3, 0x7c, 0x38, 0x56, 0x48, 0x20, 0x04,
1538  		0x62, 0xf3, 0x7c, 0x48, 0x56, 0x48, 0x01, 0x05,
1539  		0x62, 0xf3, 0x7c, 0x58, 0x56, 0x48, 0x20, 0x06,
1540  		0x62, 0xf3, 0x7c, 0x99, 0x56, 0xcd, 0x07,
1541  
1542  		// vreducesh
1543  		0x62, 0xf3, 0x64, 0x08, 0x57, 0x48, 0x20, 0x01,
1544  		0x62, 0xf3, 0x54, 0x99, 0x57, 0xcc, 0x02,
1545  
1546  		// vrndscaleph
1547  		0x62, 0xf3, 0x7c, 0x08, 0x08, 0x48, 0x04, 0x01,
1548  		0x62, 0xf3, 0x7c, 0x18, 0x08, 0x48, 0x20, 0x02,
1549  		0x62, 0xf3, 0x7c, 0x28, 0x08, 0x48, 0x02, 0x03,
1550  		0x62, 0xf3, 0x7c, 0x38, 0x08, 0x48, 0x20, 0x04,
1551  		0x62, 0xf3, 0x7c, 0x48, 0x08, 0x48, 0x01, 0x05,
1552  		0x62, 0xf3, 0x7c, 0x58, 0x08, 0x48, 0x20, 0x06,
1553  		0x62, 0xf3, 0x7c, 0x99, 0x08, 0xcd, 0x07,
1554  
1555  		// vrndscalesh
1556  		0x62, 0xf3, 0x64, 0x08, 0x0a, 0x48, 0x20, 0x01,
1557  		0x62, 0xf3, 0x54, 0x99, 0x0a, 0xcc, 0x02,
1558  
1559  		// vfpclassph
1560  		0x62, 0xf3, 0x7c, 0x08, 0x66, 0x48, 0x04, 0x01,
1561  		0x62, 0xf3, 0x7c, 0x18, 0x66, 0x48, 0x20, 0x02,
1562  		0x62, 0xf3, 0x7c, 0x28, 0x66, 0x48, 0x02, 0x03,
1563  		0x62, 0xf3, 0x7c, 0x38, 0x66, 0x48, 0x20, 0x04,
1564  		0x62, 0xf3, 0x7c, 0x48, 0x66, 0x48, 0x01, 0x05,
1565  		0x62, 0xf3, 0x7c, 0x58, 0x66, 0x48, 0x20, 0x06,
1566  
1567  		// vfpclasssh
1568  		0x62, 0xf3, 0x7c, 0x0a, 0x67, 0xcb, 0x05,
1569  		0x62, 0xf3, 0x7c, 0x0a, 0x67, 0x48, 0x20, 0x05,
1570  
1571  		// vgetexpph
1572  		0x62, 0xf6, 0x7d, 0x08, 0x42, 0x48, 0x04,
1573  		0x62, 0xf6, 0x7d, 0x38, 0x42, 0x48, 0x20,
1574  		0x62, 0xf6, 0x7d, 0x48, 0x42, 0x48, 0x01,
1575  		0x62, 0xf6, 0x7d, 0x99, 0x42, 0xcd,
1576  
1577  		// vgetexpsh
1578  		0x62, 0xf6, 0x55, 0x08, 0x43, 0x48, 0x20,
1579  		0x62, 0xf6, 0x65, 0x99, 0x43, 0xcd,
1580  
1581  		// vgetmantph
1582  		0x62, 0xf3, 0x7c, 0x08, 0x26, 0x48, 0x04, 0x01,
1583  		0x62, 0xf3, 0x7c, 0x38, 0x26, 0x48, 0x20, 0x02,
1584  		0x62, 0xf3, 0x7c, 0x48, 0x26, 0x48, 0x01, 0x03,
1585  		0x62, 0xf3, 0x7c, 0x99, 0x26, 0xcd, 0x04,
1586  
1587  		// vgetmantsh
1588  		0x62, 0xf3, 0x54, 0x08, 0x27, 0x48, 0x20, 0x05,
1589  		0x62, 0xf3, 0x64, 0x99, 0x27, 0xcd, 0x06,
1590  
1591  		// vmovsh
1592  		0x62, 0xf5, 0x7e, 0x89, 0x10, 0x48, 0x20,
1593  		0x62, 0xf5, 0x7e, 0x09, 0x11, 0x48, 0x20,
1594  		0x62, 0xf5, 0x66, 0x8a, 0x10, 0xcd,
1595  
1596  		// vmovw
1597  		0x62, 0xd5, 0x7d, 0x08, 0x6e, 0xcd,
1598  		0x62, 0xf5, 0x7d, 0x08, 0x6e, 0x58, 0x20,
1599  		0x62, 0xd5, 0x7d, 0x08, 0x7e, 0xc9,
1600  		0x62, 0xf5, 0x7d, 0x08, 0x7e, 0x78, 0x20,
1601  
1602  		// vcvtsd2sh
1603  		0x62, 0xf5, 0xef, 0xb9, 0x5a, 0xcb,
1604  		0x62, 0xf5, 0xef, 0x08, 0x5a, 0x48, 0x08,
1605  
1606  		// vcvtsh2sd
1607  		0x62, 0xf5, 0x6e, 0x99, 0x5a, 0xcb,
1608  		0x62, 0xf5, 0x6e, 0x08, 0x5a, 0x48, 0x20,
1609  
1610  		// vcvtsh2ss
1611  		0x62, 0xf6, 0x6c, 0x99, 0x13, 0xcb,
1612  		0x62, 0xf6, 0x6c, 0x08, 0x13, 0x48, 0x20,
1613  
1614  		// vcvtss2sh
1615  		0x62, 0xf5, 0x6c, 0xb9, 0x1d, 0xcb,
1616  		0x62, 0xf5, 0x6c, 0x08, 0x1d, 0x48, 0x10,
1617  
1618  		// vcvtsh2si
1619  		0x62, 0xf5, 0x7e, 0x38, 0x2d, 0xd1,
1620  		0x62, 0xf5, 0x7e, 0x08, 0x2d, 0x50, 0x20,
1621  		0x62, 0xf5, 0xfe, 0x38, 0x2d, 0xd1,
1622  		0x62, 0x75, 0xfe, 0x08, 0x2d, 0x40, 0x20,
1623  
1624  		// vcvtph2dq
1625  		0x62, 0xf5, 0x7d, 0x08, 0x5b, 0xcd,
1626  		0x62, 0xf5, 0x7d, 0x08, 0x5b, 0x48, 0x08,
1627  		0x62, 0xf5, 0x7d, 0x18, 0x5b, 0x48, 0x20,
1628  		0x62, 0xf5, 0x7d, 0xaa, 0x5b, 0xcd,
1629  		0x62, 0xf5, 0x7d, 0x28, 0x5b, 0x48, 0x04,
1630  		0x62, 0xf5, 0x7d, 0x38, 0x5b, 0x48, 0x20,
1631  		0x62, 0xf5, 0x7d, 0xbd, 0x5b, 0xcb,
1632  		0x62, 0xf5, 0x7d, 0xcd, 0x5b, 0x48, 0x02,
1633  		0x62, 0xf5, 0x7d, 0xdd, 0x5b, 0x48, 0x20,
1634  
1635  		// vcvtph2psx
1636  		0x62, 0xf6, 0x7d, 0x08, 0x13, 0xcd,
1637  		0x62, 0xf6, 0x7d, 0x08, 0x13, 0x48, 0x08,
1638  		0x62, 0xf6, 0x7d, 0x18, 0x13, 0x48, 0x20,
1639  		0x62, 0xf6, 0x7d, 0xaa, 0x13, 0xcd,
1640  		0x62, 0xf6, 0x7d, 0x28, 0x13, 0x48, 0x04,
1641  		0x62, 0xf6, 0x7d, 0x38, 0x13, 0x48, 0x20,
1642  		0x62, 0xf6, 0x7d, 0x9d, 0x13, 0xcb,
1643  		0x62, 0xf6, 0x7d, 0xcd, 0x13, 0x48, 0x02,
1644  		0x62, 0xf6, 0x7d, 0xdd, 0x13, 0x48, 0x20,
1645  
1646  		// vcvtph2udq
1647  		0x62, 0xf5, 0x7c, 0x08, 0x79, 0xcd,
1648  		0x62, 0xf5, 0x7c, 0x08, 0x79, 0x48, 0x08,
1649  		0x62, 0xf5, 0x7c, 0x18, 0x79, 0x48, 0x20,
1650  		0x62, 0xf5, 0x7c, 0xaa, 0x79, 0xcd,
1651  		0x62, 0xf5, 0x7c, 0x28, 0x79, 0x48, 0x04,
1652  		0x62, 0xf5, 0x7c, 0x38, 0x79, 0x48, 0x20,
1653  		0x62, 0xf5, 0x7c, 0xbd, 0x79, 0xcb,
1654  		0x62, 0xf5, 0x7c, 0xcd, 0x79, 0x48, 0x02,
1655  		0x62, 0xf5, 0x7c, 0xdd, 0x79, 0x48, 0x20,
1656  
1657  		// vcvttph2dq
1658  		0x62, 0xf5, 0x7e, 0x08, 0x5b, 0xcd,
1659  		0x62, 0xf5, 0x7e, 0x08, 0x5b, 0x48, 0x08,
1660  		0x62, 0xf5, 0x7e, 0x18, 0x5b, 0x48, 0x20,
1661  		0x62, 0xf5, 0x7e, 0xaa, 0x5b, 0xcd,
1662  		0x62, 0xf5, 0x7e, 0x28, 0x5b, 0x48, 0x04,
1663  		0x62, 0xf5, 0x7e, 0x38, 0x5b, 0x48, 0x20,
1664  		0x62, 0xf5, 0x7e, 0x9d, 0x5b, 0xcb,
1665  		0x62, 0xf5, 0x7e, 0xcd, 0x5b, 0x48, 0x02,
1666  		0x62, 0xf5, 0x7e, 0xdd, 0x5b, 0x48, 0x20,
1667  
1668  		// vcvttph2udq
1669  		0x62, 0xf5, 0x7c, 0x08, 0x78, 0xcd,
1670  		0x62, 0xf5, 0x7c, 0x08, 0x78, 0x48, 0x08,
1671  		0x62, 0xf5, 0x7c, 0x18, 0x78, 0x48, 0x20,
1672  		0x62, 0xf5, 0x7c, 0xaa, 0x78, 0xcd,
1673  		0x62, 0xf5, 0x7c, 0x28, 0x78, 0x48, 0x04,
1674  		0x62, 0xf5, 0x7c, 0x38, 0x78, 0x48, 0x20,
1675  		0x62, 0xf5, 0x7c, 0x9d, 0x78, 0xcb,
1676  		0x62, 0xf5, 0x7c, 0xcd, 0x78, 0x48, 0x02,
1677  		0x62, 0xf5, 0x7c, 0xdd, 0x78, 0x48, 0x20,
1678  
1679  		// vcvtph2pd
1680  		0x62, 0xf5, 0x7c, 0x08, 0x5a, 0xcd,
1681  		0x62, 0xf5, 0x7c, 0x08, 0x5a, 0x48, 0x10,
1682  		0x62, 0xf5, 0x7c, 0x18, 0x5a, 0x48, 0x20,
1683  		0x62, 0xf5, 0x7c, 0xaa, 0x5a, 0xcd,
1684  		0x62, 0xf5, 0x7c, 0x28, 0x5a, 0x48, 0x08,
1685  		0x62, 0xf5, 0x7c, 0x38, 0x5a, 0x48, 0x20,
1686  		0x62, 0xf5, 0x7c, 0x9d, 0x5a, 0xcb,
1687  		0x62, 0xf5, 0x7c, 0xcd, 0x5a, 0x48, 0x04,
1688  		0x62, 0xf5, 0x7c, 0xdd, 0x5a, 0x48, 0x20,
1689  
1690  		// vcvtph2qq
1691  		0x62, 0xf5, 0x7d, 0x08, 0x7b, 0xcd,
1692  		0x62, 0xf5, 0x7d, 0x08, 0x7b, 0x48, 0x10,
1693  		0x62, 0xf5, 0x7d, 0x18, 0x7b, 0x48, 0x20,
1694  		0x62, 0xf5, 0x7d, 0xaa, 0x7b, 0xcd,
1695  		0x62, 0xf5, 0x7d, 0x28, 0x7b, 0x48, 0x08,
1696  		0x62, 0xf5, 0x7d, 0x38, 0x7b, 0x48, 0x20,
1697  		0x62, 0xf5, 0x7d, 0xbd, 0x7b, 0xcb,
1698  		0x62, 0xf5, 0x7d, 0xcd, 0x7b, 0x48, 0x04,
1699  		0x62, 0xf5, 0x7d, 0xdd, 0x7b, 0x48, 0x20,
1700  
1701  		// vcvtph2uqq
1702  		0x62, 0xf5, 0x7d, 0x08, 0x79, 0xcd,
1703  		0x62, 0xf5, 0x7d, 0x08, 0x79, 0x48, 0x10,
1704  		0x62, 0xf5, 0x7d, 0x18, 0x79, 0x48, 0x20,
1705  		0x62, 0xf5, 0x7d, 0xaa, 0x79, 0xcd,
1706  		0x62, 0xf5, 0x7d, 0x28, 0x79, 0x48, 0x08,
1707  		0x62, 0xf5, 0x7d, 0x38, 0x79, 0x48, 0x20,
1708  		0x62, 0xf5, 0x7d, 0xbd, 0x79, 0xcb,
1709  		0x62, 0xf5, 0x7d, 0xcd, 0x79, 0x48, 0x04,
1710  		0x62, 0xf5, 0x7d, 0xdd, 0x79, 0x48, 0x20,
1711  
1712  		// vcvttph2uqq
1713  		0x62, 0xf5, 0x7d, 0x08, 0x78, 0xcd,
1714  		0x62, 0xf5, 0x7d, 0x08, 0x78, 0x48, 0x10,
1715  		0x62, 0xf5, 0x7d, 0x18, 0x78, 0x48, 0x20,
1716  		0x62, 0xf5, 0x7d, 0xaa, 0x78, 0xcd,
1717  		0x62, 0xf5, 0x7d, 0x28, 0x78, 0x48, 0x08,
1718  		0x62, 0xf5, 0x7d, 0x38, 0x78, 0x48, 0x20,
1719  		0x62, 0xf5, 0x7d, 0x9d, 0x78, 0xcb,
1720  		0x62, 0xf5, 0x7d, 0xcd, 0x78, 0x48, 0x04,
1721  		0x62, 0xf5, 0x7d, 0xdd, 0x78, 0x48, 0x20,
1722  
1723  		// vcvtdq2ph
1724  		0x62, 0xf5, 0x7c, 0x08, 0x5b, 0xcd,
1725  		0x62, 0xf5, 0x7c, 0x08, 0x5b, 0x48, 0x04,
1726  		0x62, 0xf5, 0x7c, 0x18, 0x5b, 0x48, 0x10,
1727  		0x62, 0xf5, 0x7c, 0x28, 0x5b, 0x48, 0x02,
1728  		0x62, 0xf5, 0x7c, 0x38, 0x5b, 0x48, 0x10,
1729  		0x62, 0xf5, 0x7c, 0xba, 0x5b, 0xcd,
1730  		0x62, 0xf5, 0x7c, 0x48, 0x5b, 0x48, 0x01,
1731  		0x62, 0xf5, 0x7c, 0x58, 0x5b, 0x48, 0x10,
1732  
1733  		// vcvtps2phx
1734  		0x62, 0xf5, 0x7d, 0x08, 0x1d, 0xcd,
1735  		0x62, 0xf5, 0x7d, 0x08, 0x1d, 0x48, 0x04,
1736  		0x62, 0xf5, 0x7d, 0x18, 0x1d, 0x48, 0x10,
1737  		0x62, 0xf5, 0x7d, 0x28, 0x1d, 0x48, 0x02,
1738  		0x62, 0xf5, 0x7d, 0x38, 0x1d, 0x48, 0x10,
1739  		0x62, 0xf5, 0x7d, 0xba, 0x1d, 0xcd,
1740  		0x62, 0xf5, 0x7d, 0x48, 0x1d, 0x48, 0x01,
1741  		0x62, 0xf5, 0x7d, 0x58, 0x1d, 0x48, 0x10,
1742  
1743  		// vcvtudq2ph
1744  		0x62, 0xf5, 0x7f, 0x08, 0x7a, 0xcd,
1745  		0x62, 0xf5, 0x7f, 0x08, 0x7a, 0x48, 0x04,
1746  		0x62, 0xf5, 0x7f, 0x18, 0x7a, 0x48, 0x10,
1747  		0x62, 0xf5, 0x7f, 0x28, 0x7a, 0x48, 0x02,
1748  		0x62, 0xf5, 0x7f, 0x38, 0x7a, 0x48, 0x10,
1749  		0x62, 0xf5, 0x7f, 0xba, 0x7a, 0xcd,
1750  		0x62, 0xf5, 0x7f, 0x48, 0x7a, 0x48, 0x01,
1751  		0x62, 0xf5, 0x7f, 0x58, 0x7a, 0x48, 0x10,
1752  
1753  		// vcvtpd2ph
1754  		0x62, 0xf5, 0xfd, 0x08, 0x5a, 0xcd,
1755  		0x62, 0xf5, 0xfd, 0x28, 0x5a, 0xcd,
1756  		0x62, 0xf5, 0xfd, 0xba, 0x5a, 0xcd,
1757  		0x62, 0xf5, 0xfd, 0x08, 0x5a, 0x48, 0x04,
1758  		0x62, 0xf5, 0xfd, 0x18, 0x5a, 0x48, 0x08,
1759  		0x62, 0xf5, 0xfd, 0x28, 0x5a, 0x48, 0x02,
1760  		0x62, 0xf5, 0xfd, 0x38, 0x5a, 0x48, 0x08,
1761  		0x62, 0xf5, 0xfd, 0x48, 0x5a, 0x48, 0x01,
1762  		0x62, 0xf5, 0xfd, 0x58, 0x5a, 0x48, 0x08,
1763  
1764  		// vcvtqq2ph
1765  		0x62, 0xf5, 0xfc, 0x08, 0x5b, 0xcd,
1766  		0x62, 0xf5, 0xfc, 0x28, 0x5b, 0xcd,
1767  		0x62, 0xf5, 0xfc, 0xba, 0x5b, 0xcd,
1768  		0x62, 0xf5, 0xfc, 0x08, 0x5b, 0x48, 0x04,
1769  		0x62, 0xf5, 0xfc, 0x18, 0x5b, 0x48, 0x08,
1770  		0x62, 0xf5, 0xfc, 0x28, 0x5b, 0x48, 0x02,
1771  		0x62, 0xf5, 0xfc, 0x38, 0x5b, 0x48, 0x08,
1772  		0x62, 0xf5, 0xfc, 0x48, 0x5b, 0x48, 0x01,
1773  		0x62, 0xf5, 0xfc, 0x58, 0x5b, 0x48, 0x08,
1774  
1775  		// vcvtuqq2ph
1776  		0x62, 0xf5, 0xff, 0x08, 0x7a, 0xcd,
1777  		0x62, 0xf5, 0xff, 0x28, 0x7a, 0xcd,
1778  		0x62, 0xf5, 0xff, 0xba, 0x7a, 0xcd,
1779  		0x62, 0xf5, 0xff, 0x08, 0x7a, 0x48, 0x04,
1780  		0x62, 0xf5, 0xff, 0x18, 0x7a, 0x48, 0x08,
1781  		0x62, 0xf5, 0xff, 0x28, 0x7a, 0x48, 0x02,
1782  		0x62, 0xf5, 0xff, 0x38, 0x7a, 0x48, 0x08,
1783  		0x62, 0xf5, 0xff, 0x48, 0x7a, 0x48, 0x01,
1784  		0x62, 0xf5, 0xff, 0x58, 0x7a, 0x48, 0x08,
1785  
1786  		// vcvtph2uw
1787  		0x62, 0xf5, 0x7c, 0x08, 0x7d, 0xcd,
1788  		0x62, 0xf5, 0x7c, 0x08, 0x7d, 0x48, 0x04,
1789  		0x62, 0xf5, 0x7c, 0x18, 0x7d, 0x48, 0x20,
1790  		0x62, 0xf5, 0x7c, 0x28, 0x7d, 0x48, 0x02,
1791  		0x62, 0xf5, 0x7c, 0x38, 0x7d, 0x48, 0x20,
1792  		0x62, 0xf5, 0x7c, 0xba, 0x7d, 0xcd,
1793  		0x62, 0xf5, 0x7c, 0x48, 0x7d, 0x48, 0x01,
1794  		0x62, 0xf5, 0x7c, 0x58, 0x7d, 0x48, 0x20,
1795  
1796  		// vcvtph2w
1797  		0x62, 0xf5, 0x7d, 0x08, 0x7d, 0xcd,
1798  		0x62, 0xf5, 0x7d, 0x08, 0x7d, 0x48, 0x04,
1799  		0x62, 0xf5, 0x7d, 0x18, 0x7d, 0x48, 0x20,
1800  		0x62, 0xf5, 0x7d, 0x28, 0x7d, 0x48, 0x02,
1801  		0x62, 0xf5, 0x7d, 0x38, 0x7d, 0x48, 0x20,
1802  		0x62, 0xf5, 0x7d, 0xba, 0x7d, 0xcd,
1803  		0x62, 0xf5, 0x7d, 0x48, 0x7d, 0x48, 0x01,
1804  		0x62, 0xf5, 0x7d, 0x58, 0x7d, 0x48, 0x20,
1805  
1806  		// vcvttph2uw
1807  		0x62, 0xf5, 0x7c, 0x08, 0x7c, 0xcd,
1808  		0x62, 0xf5, 0x7c, 0x08, 0x7c, 0x48, 0x04,
1809  		0x62, 0xf5, 0x7c, 0x18, 0x7c, 0x48, 0x20,
1810  		0x62, 0xf5, 0x7c, 0x28, 0x7c, 0x48, 0x02,
1811  		0x62, 0xf5, 0x7c, 0x38, 0x7c, 0x48, 0x20,
1812  		0x62, 0xf5, 0x7c, 0x9a, 0x7c, 0xcd,
1813  		0x62, 0xf5, 0x7c, 0x48, 0x7c, 0x48, 0x01,
1814  		0x62, 0xf5, 0x7c, 0x58, 0x7c, 0x48, 0x20,
1815  
1816  		// vcvttph2w
1817  		0x62, 0xf5, 0x7d, 0x08, 0x7c, 0xcd,
1818  		0x62, 0xf5, 0x7d, 0x08, 0x7c, 0x48, 0x04,
1819  		0x62, 0xf5, 0x7d, 0x18, 0x7c, 0x48, 0x20,
1820  		0x62, 0xf5, 0x7d, 0x28, 0x7c, 0x48, 0x02,
1821  		0x62, 0xf5, 0x7d, 0x38, 0x7c, 0x48, 0x20,
1822  		0x62, 0xf5, 0x7d, 0x9a, 0x7c, 0xcd,
1823  		0x62, 0xf5, 0x7d, 0x48, 0x7c, 0x48, 0x01,
1824  		0x62, 0xf5, 0x7d, 0x58, 0x7c, 0x48, 0x20,
1825  
1826  		// vcvtuw2ph
1827  		0x62, 0xf5, 0x7f, 0x08, 0x7d, 0xcd,
1828  		0x62, 0xf5, 0x7f, 0x08, 0x7d, 0x48, 0x04,
1829  		0x62, 0xf5, 0x7f, 0x18, 0x7d, 0x48, 0x20,
1830  		0x62, 0xf5, 0x7f, 0x28, 0x7d, 0x48, 0x02,
1831  		0x62, 0xf5, 0x7f, 0x38, 0x7d, 0x48, 0x20,
1832  		0x62, 0xf5, 0x7f, 0xba, 0x7d, 0xcd,
1833  		0x62, 0xf5, 0x7f, 0x48, 0x7d, 0x48, 0x01,
1834  		0x62, 0xf5, 0x7f, 0x58, 0x7d, 0x48, 0x20,
1835  
1836  		// vcvtw2ph
1837  		0x62, 0xf5, 0x7e, 0x08, 0x7d, 0xcd,
1838  		0x62, 0xf5, 0x7e, 0x08, 0x7d, 0x48, 0x04,
1839  		0x62, 0xf5, 0x7e, 0x18, 0x7d, 0x48, 0x20,
1840  		0x62, 0xf5, 0x7e, 0x28, 0x7d, 0x48, 0x02,
1841  		0x62, 0xf5, 0x7e, 0x38, 0x7d, 0x48, 0x20,
1842  		0x62, 0xf5, 0x7e, 0xba, 0x7d, 0xcd,
1843  		0x62, 0xf5, 0x7e, 0x48, 0x7d, 0x48, 0x01,
1844  		0x62, 0xf5, 0x7e, 0x58, 0x7d, 0x48, 0x20,
1845  
1846  		// vcvtps2ph
1847  		0xc4, 0xe3, 0x79, 0x1d, 0xd1, 0x01,
1848  		0xc4, 0xe3, 0x79, 0x1d, 0x50, 0x40, 0x02,
1849  		0xc4, 0xe3, 0x7d, 0x1d, 0xd1, 0x03,
1850  		0xc4, 0xe3, 0x7d, 0x1d, 0x50, 0x40, 0x04,
1851  		0x62, 0xf3, 0x7d, 0x89, 0x1d, 0xd1, 0x05,
1852  		0x62, 0xf3, 0x7d, 0x09, 0x1d, 0x58, 0x08, 0x06,
1853  		0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0xe1, 0x07,
1854  		0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0x68, 0x04, 0x08,
1855  		0x62, 0xf3, 0x7d, 0x1a, 0x1d, 0xe9, 0x09,
1856  		0x62, 0xf3, 0x7d, 0x4d, 0x1d, 0x60, 0x02, 0x0a,
1857  
1858  		// vcvtsh2usi
1859  		0x62, 0xf5, 0x7e, 0x38, 0x79, 0xc9,
1860  		0x62, 0xf5, 0x7e, 0x08, 0x79, 0x40, 0x20,
1861  		0x62, 0x75, 0xfe, 0x38, 0x79, 0xc9,
1862  		0x62, 0x75, 0xfe, 0x08, 0x79, 0x68, 0x20,
1863  
1864  		// vcvttsh2si
1865  		0x62, 0xf5, 0x7e, 0x18, 0x2c, 0xc9,
1866  		0x62, 0xf5, 0x7e, 0x08, 0x2c, 0x40, 0x20,
1867  		0x62, 0x75, 0xfe, 0x18, 0x2c, 0xc9,
1868  		0x62, 0x75, 0xfe, 0x08, 0x2c, 0x68, 0x20,
1869  
1870  		// vcvttsh2usi
1871  		0x62, 0xf5, 0x7e, 0x18, 0x78, 0xc9,
1872  		0x62, 0xf5, 0x7e, 0x08, 0x78, 0x40, 0x20,
1873  		0x62, 0x75, 0xfe, 0x18, 0x78, 0xc9,
1874  		0x62, 0x75, 0xfe, 0x08, 0x78, 0x68, 0x20,
1875  
1876  		// vcvttph2qq
1877  		0x62, 0xf5, 0x7d, 0x08, 0x7a, 0xcd,
1878  		0x62, 0xf5, 0x7d, 0x08, 0x7a, 0x48, 0x10,
1879  		0x62, 0xf5, 0x7d, 0x18, 0x7a, 0x48, 0x20,
1880  		0x62, 0xf5, 0x7d, 0xaa, 0x7a, 0xcd,
1881  		0x62, 0xf5, 0x7d, 0x28, 0x7a, 0x48, 0x08,
1882  		0x62, 0xf5, 0x7d, 0x38, 0x7a, 0x48, 0x20,
1883  		0x62, 0xf5, 0x7d, 0x9d, 0x7a, 0xcb,
1884  		0x62, 0xf5, 0x7d, 0xcd, 0x7a, 0x48, 0x04,
1885  		0x62, 0xf5, 0x7d, 0xdd, 0x7a, 0x48, 0x20,
1886  
1887  		// vcvtsi2sh
1888  		0x62, 0xf5, 0x6e, 0x38, 0x2a, 0xc8,
1889  		0x62, 0xf5, 0x6e, 0x08, 0x2a, 0x48, 0x10,
1890  		0x62, 0xd5, 0xee, 0x38, 0x2a, 0xc9,
1891  		0x62, 0xf5, 0xee, 0x08, 0x2a, 0x48, 0x08,
1892  
1893  		// vcvtusi2sh
1894  		0x62, 0xf5, 0x6e, 0x38, 0x7b, 0xc8,
1895  		0x62, 0xf5, 0x6e, 0x08, 0x7b, 0x48, 0x10,
1896  		0x62, 0xd5, 0xee, 0x38, 0x7b, 0xc9,
1897  		0x62, 0xf5, 0xee, 0x08, 0x7b, 0x48, 0x08,
1898  	};
1899  	const size_t n = sizeof(tbl) / sizeof(tbl[0]);
1900  	CYBOZU_TEST_EQUAL(c.getSize(), n);
1901  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
1902  }
1903  #endif
1904  
1905  CYBOZU_TEST_AUTO(waitpkg)
1906  {
      	// Generates tpause, umonitor and umwait, then compares the emitted bytes with tbl.
1907  	struct Code : Xbyak::CodeGenerator {
1908  		Code()
1909  		{
1910  			tpause(eax);
1911  			tpause(ebx);
      			// Each build emits umonitor with two register widths; tbl is shared by both
      			// builds and expects a 0x67 prefix on the first form only.
1912  #ifdef XBYAK32
1913  			umonitor(cx);
1914  			umonitor(ecx);
1915  #else
1916  			umonitor(ecx);
1917  			umonitor(rcx);
1918  #endif
1919  			umwait(eax);
1920  			umwait(ebx);
1921  		}
1922  	} c;
      	// Expected machine code, listed in the order the instructions are emitted above.
1923  	const uint8_t tbl[] = {
1924  		// tpause
1925  		0x66, 0x0f, 0xae, 0xf0,
1926  		0x66, 0x0f, 0xae, 0xf3,
1927  		// umonitor
1928  		0x67, 0xf3, 0x0f, 0xae, 0xf1,
1929  		0xf3, 0x0f, 0xae, 0xf1,
1930  		// umwait
1931  		0xf2, 0x0f, 0xae, 0xf0,
1932  		0xf2, 0x0f, 0xae, 0xf3,
1933  	};
1934  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
1935  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
1936  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
1937  }
1938  
1939  CYBOZU_TEST_AUTO(misc)
1940  {
      	// Generates cldemote, movdiri and movdir64b (plus, under XBYAK64, 64-bit forms and
      	// clui/senduipi/stui/testui/uiret) and compares the emitted bytes with tbl.
1941  	struct Code : Xbyak::CodeGenerator {
1942  		Code()
1943  		{
      			// Forms using 32-bit address registers; emitted in every build.
1944  			cldemote(ptr[eax+esi*4+0x12]);
1945  			movdiri(ptr[edx+esi*2+4], eax);
1946  			movdir64b(eax, ptr[edx]);
1947  #ifdef XBYAK64
1948  			cldemote(ptr[rax+rdi*8+0x123]);
1949  			movdiri(ptr[rax+r12], r9);
1950  			movdiri(ptr[rax+r12*2+4], r9d);
1951  			movdir64b(r10, ptr[r8]);
1952  			clui();
1953  			senduipi(rax);
1954  			senduipi(r10);
1955  			stui();
1956  			testui();
1957  			uiret();
1958  #endif
1959  		}
1960  	} c;
      	// Expected machine code in emission order. For the first three instructions a 0x67
      	// byte is expected only when XBYAK64 is defined.
1961  	const uint8_t tbl[] = {
1962  #ifdef XBYAK64
1963  		0x67,
1964  #endif
1965  		0x0f, 0x1c, 0x44, 0xb0, 0x12, // cldemote
1966  #ifdef XBYAK64
1967  		0x67,
1968  #endif
1969  		0x0f, 0x38, 0xf9, 0x44, 0x72, 0x04, // movdiri
1970  
1971  		0x66,
1972  #ifdef XBYAK64
1973  		0x67,
1974  #endif
1975  		0x0f, 0x38, 0xf8, 0x02, // movdir64b
1976  #ifdef XBYAK64
1977  		0x0f, 0x1c, 0x84, 0xf8, 0x23, 0x01, 0x00, 0x00, // cldemote
1978  		0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri
1979  		0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri
1980  		0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b
1981  		0xf3, 0x0f, 0x01, 0xee, // clui
1982  		0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax
1983  		0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10
1984  		0xf3, 0x0f, 0x01, 0xef, // stui
1985  		0xf3, 0x0f, 0x01, 0xed, // testui
1986  		0xf3, 0x0f, 0x01, 0xec, // uiret
1987  #endif
1988  	};
1989  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
1990  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
1991  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
1992  }
1993  
1994  CYBOZU_TEST_AUTO(cpu)
1995  {
      	// Checks that Cpu::has() called with two OR-ed flags gives the same answer as
      	// requiring each flag individually (see the issue linked below).
1996  	// https://github.com/herumi/xbyak/issues/148
1997  	using namespace Xbyak::util;
1998  	Cpu cpu;
1999  	CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD));
2000  }
2001  
2002  CYBOZU_TEST_AUTO(minmax)
2003  {
      	// Checks that local::min_ and local::max_ agree with std::min and std::max for (3, 4).
      	// NOTE(review): the parentheses around std::min/std::max presumably keep a
      	// function-like min/max macro from expanding -- confirm.
2004  	using namespace Xbyak::util;
2005  	CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4));
2006  	CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4));
2007  }
2008  
2009  CYBOZU_TEST_AUTO(rao_int)
2010  {
      	// Generates aadd, aand, aor and axor with a memory destination and compares the
      	// emitted bytes with tbl.
2011  	struct Code : Xbyak::CodeGenerator {
2012  		Code()
2013  		{
2014  #ifdef XBYAK64
      			// Three forms per instruction: [rax] with ecx, [eax] with ecx, [rax] with r10.
2015  			aadd(ptr[rax], ecx);
2016  			aadd(ptr[eax], ecx);
2017  			aadd(ptr[rax], r10);
2018  			aand(ptr[rax], ecx);
2019  			aand(ptr[eax], ecx);
2020  			aand(ptr[rax], r10);
2021  			aor(ptr[rax], ecx);
2022  			aor(ptr[eax], ecx);
2023  			aor(ptr[rax], r10);
2024  			axor(ptr[rax], ecx);
2025  			axor(ptr[eax], ecx);
2026  			axor(ptr[rax], r10);
2027  #else
      			// Only the [eax] with ecx form is emitted in this build.
2028  			aadd(ptr[eax], ecx);
2029  			aand(ptr[eax], ecx);
2030  			aor(ptr[eax], ecx);
2031  			axor(ptr[eax], ecx);
2032  #endif
2033  		}
2034  	} c;
      	// Expected machine code in emission order. All four instructions share the
      	// 0x0f 0x38 0xfc opcode bytes; aand, aor and axor are expected to add a leading
      	// 0x66, 0xf2 and 0xf3 byte respectively.
2035  	const uint8_t tbl[] = {
2036  #ifdef XBYAK64
2037  		// aadd
2038  		0x0f, 0x38, 0xfc, 0x08,
2039  		0x67, 0x0f, 0x38, 0xfc, 0x08,
2040  		0x4c, 0x0f, 0x38, 0xfc, 0x10,
2041  
2042  		// aand
2043  		0x66, 0x0f, 0x38, 0xfc, 0x08,
2044  		0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08,
2045  		0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
2046  
2047  		// aor
2048  		0xf2, 0x0f, 0x38, 0xfc, 0x08,
2049  		0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
2050  		0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
2051  
2052  		// axor
2053  		0xf3, 0x0f, 0x38, 0xfc, 0x08,
2054  		0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08,
2055  		0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10,
2056  #else
2057  		// aadd
2058  		0x0f, 0x38, 0xfc, 0x08,
2059  		// aand
2060  		0x66, 0x0f, 0x38, 0xfc, 0x08,
2061  		// aor
2062  		0xf2, 0x0f, 0x38, 0xfc, 0x08,
2063  		// axor
2064  		0xf3, 0x0f, 0x38, 0xfc, 0x08,
2065  #endif
2066  	};
2067  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
2068  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
2069  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
2070  }
2071  
2072  #ifdef XBYAK64
2073  CYBOZU_TEST_AUTO(CMPccXADD)
2074  {
      	// Generates the sixteen cmp<cc>xadd instructions on [rax+r10*4], first with 32-bit
      	// and then with 64-bit register operands, and compares the emitted bytes with tbl.
2075  	struct Code : Xbyak::CodeGenerator {
2076  		Code()
2077  		{
2078  			// 32bit reg
2079  			cmpbexadd(ptr[rax+r10*4], ecx, edx);
2080  			cmpbxadd(ptr[rax+r10*4], ecx, edx);
2081  			cmplexadd(ptr[rax+r10*4], ecx, edx);
2082  			cmplxadd(ptr[rax+r10*4], ecx, edx);
2083  			cmpnbexadd(ptr[rax+r10*4], ecx, edx);
2084  			cmpnbxadd(ptr[rax+r10*4], ecx, edx);
2085  			cmpnlexadd(ptr[rax+r10*4], ecx, edx);
2086  			cmpnlxadd(ptr[rax+r10*4], ecx, edx);
2087  			cmpnoxadd(ptr[rax+r10*4], ecx, edx);
2088  			cmpnpxadd(ptr[rax+r10*4], ecx, edx);
2089  			cmpnsxadd(ptr[rax+r10*4], ecx, edx);
2090  			cmpnzxadd(ptr[rax+r10*4], ecx, edx);
2091  			cmpoxadd(ptr[rax+r10*4], ecx, edx);
2092  			cmppxadd(ptr[rax+r10*4], ecx, edx);
2093  			cmpsxadd(ptr[rax+r10*4], ecx, edx);
2094  			cmpzxadd(ptr[rax+r10*4], ecx, edx);
2095  			// 64bit reg
2096  			cmpbexadd(ptr[rax+r10*4], rcx, rdx);
2097  			cmpbxadd(ptr[rax+r10*4], rcx, rdx);
2098  			cmplexadd(ptr[rax+r10*4], rcx, rdx);
2099  			cmplxadd(ptr[rax+r10*4], rcx, rdx);
2100  			cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
2101  			cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
2102  			cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
2103  			cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
2104  			cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
2105  			cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
2106  			cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
2107  			cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
2108  			cmpoxadd(ptr[rax+r10*4], rcx, rdx);
2109  			cmppxadd(ptr[rax+r10*4], rcx, rdx);
2110  			cmpsxadd(ptr[rax+r10*4], rcx, rdx);
2111  			cmpzxadd(ptr[rax+r10*4], rcx, rdx);
2112  		}
2113  	} c;
      	// Expected machine code in emission order, six bytes per instruction. Within a group
      	// only the fourth byte varies; the two groups differ in the third byte (0x69 vs 0xe9).
2114  	const uint8_t tbl[] = {
2115  		// 32bit reg
2116  		0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90, // cmpbexadd
2117  		0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90, // cmpbxadd
2118  		0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90, // cmplexadd
2119  		0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90, // cmplxadd
2120  		0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90, // cmpnbexadd
2121  		0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90, // cmpnbxadd
2122  		0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90, // cmpnlexadd
2123  		0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90, // cmpnlxadd
2124  		0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90, // cmpnoxadd
2125  		0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90, // cmpnpxadd
2126  		0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90, // cmpnsxadd
2127  		0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90, // cmpnzxadd
2128  		0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90, // cmpoxadd
2129  		0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90, // cmppxadd
2130  		0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90, // cmpsxadd
2131  		0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90, // cmpzxadd
2132  		// 64bit reg
2133  		0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90, // cmpbexadd
2134  		0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90, // cmpbxadd
2135  		0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90, // cmplexadd
2136  		0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90, // cmplxadd
2137  		0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90, // cmpnbexadd
2138  		0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90, // cmpnbxadd
2139  		0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90, // cmpnlexadd
2140  		0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90, // cmpnlxadd
2141  		0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90, // cmpnoxadd
2142  		0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90, // cmpnpxadd
2143  		0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90, // cmpnsxadd
2144  		0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90, // cmpnzxadd
2145  		0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90, // cmpoxadd
2146  		0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90, // cmppxadd
2147  		0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90, // cmpsxadd
2148  		0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90, // cmpzxadd
2149  	};
2150  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
2151  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
2152  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
2153  }
2154  
2155  CYBOZU_TEST_AUTO(prefetchiti)
2156  {
      	// Generates prefetchit0 and prefetchit1 on [rax] and compares the emitted bytes with tbl.
2157  	struct Code : Xbyak::CodeGenerator {
2158  		Code()
2159  		{
2160  			prefetchit0(ptr[rax]);
2161  			prefetchit1(ptr[rax]);
2162  		}
2163  	} c;
      	// Expected machine code in emission order; the two entries differ only in the last byte.
2164  	const uint8_t tbl[] = {
2165  		0x0f, 0x18, 0x38, // prefetchit0
2166  		0x0f, 0x18, 0x30 // prefetchit1
2167  	};
2168  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
2169  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
2170  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
2171  }
2172  
2173  CYBOZU_TEST_AUTO(crypto)
2174  {
      	// Generates the vsha512*, vsm3* and vsm4* instructions in register and memory forms
      	// and compares the emitted bytes with tbl.
2175  	struct Code : Xbyak::CodeGenerator {
2176  		Code()
2177  		{
2178  			vsha512msg1(ymm3, xmm5);
2179  			vsha512msg2(ymm9, ymm10);
2180  			vsha512rnds2(ymm1, ymm3, xmm2);
2181  
2182  			vsm3msg1(xmm1, xmm2, xmm3);
2183  			vsm3msg1(xmm1, xmm2, ptr [rax]);
2184  			vsm3msg2(xmm5, xmm7, xmm3);
2185  			vsm3msg2(xmm5, xmm6, ptr [rax]);
2186  			vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
2187  			vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
2188  
2189  			vsm4key4(xmm1, xmm2, xmm3);
2190  			vsm4key4(xmm1, xmm2, ptr [rdx]);
2191  			vsm4rnds4(xmm1, xmm2, xmm3);
2192  			vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
2193  		}
2194  	} c;
      	// Expected machine code, one row per instruction in emission order.
2195  	const uint8_t tbl[] = {
2196  		// sha512
2197  		0xc4, 0xe2, 0x7f, 0xcc, 0xdd, // vsha512msg1
2198  		0xc4, 0x42, 0x7f, 0xcd, 0xca, // vsha512msg2
2199  		0xc4, 0xe2, 0x67, 0xcb, 0xca, // vsha512rnds2
2200  
2201  		// sm3
2202  		0xC4, 0xE2, 0x68, 0xDA, 0xCB, // vsm3msg1, register source
2203  		0xC4, 0xE2, 0x68, 0xDA, 0x08, // vsm3msg1, memory source
2204  		0xC4, 0xE2, 0x41, 0xDA, 0xEB, // vsm3msg2, register source
2205  		0xC4, 0xE2, 0x49, 0xDA, 0x28, // vsm3msg2, memory source
2206  		0xC4, 0xE3, 0x41, 0xDE, 0xEB, 0x12, // vsm3rnds2, register source; last byte is the immediate
2207  		0xC4, 0xE3, 0x41, 0xDE, 0x29, 0x34, // vsm3rnds2, memory source; last byte is the immediate
2208  
2209  		// sm4
2210  		0xc4, 0xe2, 0x6a, 0xda, 0xcb, // vsm4key4, register source
2211  		0xc4, 0xe2, 0x6a, 0xda, 0x0a, // vsm4key4, memory source
2212  		0xc4, 0xe2, 0x6b, 0xda, 0xcb, // vsm4rnds4, register source
2213  		0xc4, 0xe2, 0x4b, 0xda, 0x2c, 0x81, // vsm4rnds4, memory source
2214  	};
2215  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
2216  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
2217  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
2218  }
2219  
2220  CYBOZU_TEST_AUTO(avx_vnni_int)
2221  {
      	// Generates each vpdpb*/vpdpw* instruction twice (xmm registers, then ymm with a
      	// [rax] source) and compares the emitted bytes with tbl.
2222  	struct Code : Xbyak::CodeGenerator {
2223  		Code()
2224  		{
2225  			vpdpbssd(xmm1, xmm2, xmm3);
2226  			vpdpbssd(ymm1, ymm2, ptr [rax]);
2227  			vpdpbssds(xmm1, xmm2, xmm3);
2228  			vpdpbssds(ymm1, ymm2, ptr [rax]);
2229  			vpdpbsud(xmm1, xmm2, xmm3);
2230  			vpdpbsud(ymm1, ymm2, ptr [rax]);
2231  			vpdpbsuds(xmm1, xmm2, xmm3);
2232  			vpdpbsuds(ymm1, ymm2, ptr [rax]);
2233  			vpdpbuud(xmm1, xmm2, xmm3);
2234  			vpdpbuud(ymm1, ymm2, ptr [rax]);
2235  			vpdpbuuds(xmm1, xmm2, xmm3);
2236  			vpdpbuuds(ymm1, ymm2, ptr [rax]);
2237  
2238  			vpdpwsud(xmm1, xmm2, xmm3);
2239  			vpdpwsud(ymm1, ymm2, ptr [rax]);
2240  			vpdpwsuds(xmm1, xmm2, xmm3);
2241  			vpdpwsuds(ymm1, ymm2, ptr [rax]);
2242  			vpdpwusd(xmm1, xmm2, xmm3);
2243  			vpdpwusd(ymm1, ymm2, ptr [rax]);
2244  			vpdpwusds(xmm1, xmm2, xmm3);
2245  			vpdpwusds(ymm1, ymm2, ptr [rax]);
2246  			vpdpwuud(xmm1, xmm2, xmm3);
2247  			vpdpwuud(ymm1, ymm2, ptr [rax]);
2248  			vpdpwuuds(xmm1, xmm2, xmm3);
2249  			vpdpwuuds(ymm1, ymm2, ptr [rax]);
2250  		}
2251  	} c;
      	// Expected machine code, one five-byte row per instruction in emission order.
2252  	const uint8_t tbl[] = {
2253  		0xc4, 0xe2, 0x6b, 0x50, 0xcb, // vpdpbssd xmm
2254  		0xc4, 0xe2, 0x6f, 0x50, 0x08, // vpdpbssd ymm, [rax]
2255  		0xc4, 0xe2, 0x6b, 0x51, 0xcb, // vpdpbssds xmm
2256  		0xc4, 0xe2, 0x6f, 0x51, 0x08, // vpdpbssds ymm, [rax]
2257  		0xc4, 0xe2, 0x6a, 0x50, 0xcb, // vpdpbsud xmm
2258  		0xc4, 0xe2, 0x6e, 0x50, 0x08, // vpdpbsud ymm, [rax]
2259  		0xc4, 0xe2, 0x6a, 0x51, 0xcb, // vpdpbsuds xmm
2260  		0xc4, 0xe2, 0x6e, 0x51, 0x08, // vpdpbsuds ymm, [rax]
2261  		0xc4, 0xe2, 0x68, 0x50, 0xcb, // vpdpbuud xmm
2262  		0xc4, 0xe2, 0x6c, 0x50, 0x08, // vpdpbuud ymm, [rax]
2263  		0xc4, 0xe2, 0x68, 0x51, 0xcb, // vpdpbuuds xmm
2264  		0xc4, 0xe2, 0x6c, 0x51, 0x08, // vpdpbuuds ymm, [rax]
2265  		0xc4, 0xe2, 0x6a, 0xd2, 0xcb, // vpdpwsud xmm
2266  		0xc4, 0xe2, 0x6e, 0xd2, 0x08, // vpdpwsud ymm, [rax]
2267  		0xc4, 0xe2, 0x6a, 0xd3, 0xcb, // vpdpwsuds xmm
2268  		0xc4, 0xe2, 0x6e, 0xd3, 0x08, // vpdpwsuds ymm, [rax]
2269  		0xc4, 0xe2, 0x69, 0xd2, 0xcb, // vpdpwusd xmm
2270  		0xc4, 0xe2, 0x6d, 0xd2, 0x08, // vpdpwusd ymm, [rax]
2271  		0xc4, 0xe2, 0x69, 0xd3, 0xcb, // vpdpwusds xmm
2272  		0xc4, 0xe2, 0x6d, 0xd3, 0x08, // vpdpwusds ymm, [rax]
2273  		0xc4, 0xe2, 0x68, 0xd2, 0xcb, // vpdpwuud xmm
2274  		0xc4, 0xe2, 0x6c, 0xd2, 0x08, // vpdpwuud ymm, [rax]
2275  		0xc4, 0xe2, 0x68, 0xd3, 0xcb, // vpdpwuuds xmm
2276  		0xc4, 0xe2, 0x6c, 0xd3, 0x08, // vpdpwuuds ymm, [rax]
2277  	};
2278  	const size_t n = sizeof(tbl) / sizeof(tbl[0]); // number of expected bytes
2279  	CYBOZU_TEST_EQUAL(c.getSize(), n); // generated size must equal the table size
2280  	CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); // generated bytes must equal the table
2281  }
2282  
2283  
2284  #endif