// misc.cpp
1 #include <stdio.h> 2 #include <string.h> 3 #include <string> 4 #include <xbyak/xbyak.h> 5 #include <xbyak/xbyak_util.h> 6 #include <cybozu/inttype.hpp> 7 #include <cybozu/test.hpp> 8 #include <algorithm> 9 10 using namespace Xbyak; 11 12 CYBOZU_TEST_AUTO(setSize) 13 { 14 struct Code : Xbyak::CodeGenerator { 15 Code() : Xbyak::CodeGenerator(4096) 16 { 17 setSize(4095); 18 db(1); 19 size_t size = getSize(); 20 CYBOZU_TEST_EQUAL(size, 4096u); 21 CYBOZU_TEST_NO_EXCEPTION(setSize(size)); 22 CYBOZU_TEST_EXCEPTION(db(1), Xbyak::Error); 23 } 24 } code; 25 } 26 27 #ifdef XBYAK64 28 CYBOZU_TEST_AUTO(badSSE) 29 { 30 struct Code : Xbyak::CodeGenerator { 31 Code() 32 { 33 CYBOZU_TEST_EXCEPTION(paddd(xm16, xm1), Xbyak::Error); 34 CYBOZU_TEST_EXCEPTION(pslld(xm16, 1), Xbyak::Error); 35 CYBOZU_TEST_EXCEPTION(movapd(xm16, xm1), Xbyak::Error); 36 CYBOZU_TEST_EXCEPTION(movhpd(xm16, ptr[eax]), Xbyak::Error); 37 CYBOZU_TEST_EXCEPTION(pextrb(eax, xm16, 1), Xbyak::Error); 38 } 39 } code; 40 } 41 #endif 42 43 CYBOZU_TEST_AUTO(compOperand) 44 { 45 using namespace Xbyak::util; 46 CYBOZU_TEST_ASSERT(eax == eax); 47 CYBOZU_TEST_ASSERT(ecx != xmm0); 48 CYBOZU_TEST_ASSERT(ptr[eax] == ptr[eax]); 49 CYBOZU_TEST_ASSERT(dword[eax] != ptr[eax]); 50 CYBOZU_TEST_ASSERT(ptr[eax] != ptr[eax+3]); 51 } 52 53 CYBOZU_TEST_AUTO(mov_const) 54 { 55 struct Code : Xbyak::CodeGenerator { 56 Code() 57 { 58 const struct { 59 uint64_t v; 60 int bit; 61 bool error; 62 } tbl[] = { 63 { uint64_t(-1), 8, false }, 64 { 0x12, 8, false }, 65 { 0x80, 8, false }, 66 { 0xff, 8, false }, 67 { 0x100, 8, true }, 68 69 { 1, 16, false }, 70 { uint64_t(-1), 16, false }, 71 { 0x7fff, 16, false }, 72 { 0xffff, 16, false }, 73 { 0x10000, 16, true }, 74 75 { uint64_t(-1), 32, false }, 76 { 0x7fffffff, 32, false }, 77 { uint64_t(-0x7fffffff), 32, false }, 78 { 0xffffffff, 32, false }, 79 { 0x100000000ull, 32, true }, 80 81 #ifdef XBYAK64 82 { uint64_t(-1), 64, false }, 83 { 0x7fffffff, 64, false }, 84 { 0xffffffffffffffffull, 64, 
false }, 85 { 0x80000000, 64, true }, 86 { 0xffffffff, 64, true }, 87 #endif 88 }; 89 for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) { 90 const int bit = tbl[i].bit; 91 const uint64_t v = tbl[i].v; 92 const Xbyak::AddressFrame& af = bit == 8 ? byte : bit == 16 ? word : bit == 32 ? dword : qword; 93 if (tbl[i].error) { 94 CYBOZU_TEST_EXCEPTION(mov(af[eax], v), Xbyak::Error); 95 } else { 96 CYBOZU_TEST_NO_EXCEPTION(mov(af[eax], v)); 97 } 98 } 99 #ifdef XBYAK64 100 CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x7fffffff])); 101 if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 102 CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x17fffffff]), Xbyak::Error); 103 } 104 #ifdef XBYAK_OLD_DISP_CHECK 105 CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0x80000000])); 106 CYBOZU_TEST_NO_EXCEPTION(mov(rax, ptr[(void*)0xffffffff])); 107 #else 108 if (sizeof(void*) != 4) { // sizeof(void*) == 4 on x32 109 CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0x80000000ull]), Xbyak::Error); 110 CYBOZU_TEST_EXCEPTION(mov(rax, ptr[(void*)0xffffffffull]), Xbyak::Error); 111 } 112 #endif 113 #endif 114 } 115 } code; 116 } 117 118 CYBOZU_TEST_AUTO(align) 119 { 120 struct Code : Xbyak::CodeGenerator { 121 Code() 122 { 123 const size_t alignSize = 16; 124 for (int padding = 0; padding < 20; padding++) { 125 for (int i = 0; i < padding; i++) { 126 db(1); 127 } 128 align(alignSize); 129 CYBOZU_TEST_EQUAL(size_t(getCurr()) % alignSize, 0u); 130 } 131 align(alignSize); 132 const uint8_t *p = getCurr(); 133 // do nothing if aligned 134 align(alignSize); 135 CYBOZU_TEST_EQUAL(p, getCurr()); 136 } 137 } c; 138 } 139 CYBOZU_TEST_AUTO(kmask) 140 { 141 struct Code : Xbyak::CodeGenerator { 142 Code() 143 { 144 CYBOZU_TEST_EXCEPTION(kmovb(k1, ax), std::exception); 145 CYBOZU_TEST_EXCEPTION(kmovw(k1, ax), std::exception); 146 CYBOZU_TEST_EXCEPTION(kmovd(k1, ax), std::exception); 147 CYBOZU_TEST_EXCEPTION(kmovq(k1, eax), std::exception); 148 #ifdef XBYAK64 149 CYBOZU_TEST_EXCEPTION(kmovb(k1, rax), 
std::exception); 150 CYBOZU_TEST_EXCEPTION(kmovw(k1, rax), std::exception); 151 CYBOZU_TEST_EXCEPTION(kmovd(k1, rax), std::exception); 152 CYBOZU_TEST_NO_EXCEPTION(kmovq(k1, rax)); 153 #endif 154 CYBOZU_TEST_NO_EXCEPTION(vmovaps(xm0|k0, ptr[eax])); 155 checkT_z(); 156 } 157 void checkT_z() 158 { 159 const uint8_t *p1 = getCurr(); 160 vmovaps(zm0, ptr[eax]); 161 const uint8_t *p2 = getCurr(); 162 vmovaps(zm0|T_z, ptr[eax]); 163 const uint8_t *end = getCurr(); 164 CYBOZU_TEST_EQUAL(p2 - p1, end - p2); 165 CYBOZU_TEST_EQUAL_ARRAY(p1, p2, end - p2); 166 } 167 } c; 168 } 169 170 CYBOZU_TEST_AUTO(gather) 171 { 172 struct Code : Xbyak::CodeGenerator { 173 Code() 174 { 175 CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm3)); 176 CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm1], xmm2), std::exception); 177 CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2], xmm1), std::exception); 178 CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm2, ptr[eax+xmm1], xmm1), std::exception); 179 180 CYBOZU_TEST_NO_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm2])); 181 CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1, ptr[eax+xmm2]), std::exception); 182 CYBOZU_TEST_EXCEPTION(vgatherdpd(xmm1|k2, ptr[eax+xmm1]), std::exception); 183 184 CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k2, xmm1)); 185 CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1|k2)); 186 CYBOZU_TEST_NO_EXCEPTION(vpscatterdd(ptr[eax+xmm2]|k3, xmm2)); 187 188 CYBOZU_TEST_EXCEPTION(vpscatterdd(ptr[eax+xmm2], xmm1), std::exception); 189 } 190 } c; 191 } 192 193 #ifdef XBYAK64 194 CYBOZU_TEST_AUTO(vfmaddps) 195 { 196 struct Code : Xbyak::CodeGenerator { 197 Code() 198 { 199 v4fmaddps(zmm1, zmm8, ptr [rdx + 64]); 200 v4fmaddss(xmm15, xmm8, ptr [rax + 64]); 201 v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]); 202 v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]); 203 vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]); 204 vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]); 205 } 206 } c; 207 const uint8_t tbl[] = { 208 0x62, 0xf2, 0x3f, 
0x48, 0x9a, 0x4a, 0x04, 209 0x62, 0x72, 0x3f, 0x08, 0x9b, 0x78, 0x04, 210 0x62, 0xf2, 0x6f, 0x4d, 0xaa, 0x69, 0x08, 211 0x62, 0x62, 0x6f, 0x08, 0xab, 0x7c, 0x24, 0x08, 212 0x62, 0xe2, 0x77, 0xcf, 0x52, 0x78, 0x04, 213 0x62, 0x72, 0x67, 0x4c, 0x53, 0x54, 0x84, 0x04, 214 }; 215 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 216 CYBOZU_TEST_EQUAL(c.getSize(), n); 217 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 218 } 219 CYBOZU_TEST_AUTO(vaes) 220 { 221 struct Code : Xbyak::CodeGenerator { 222 Code() 223 { 224 vaesdec(xmm20, xmm30, ptr [rcx + 64]); 225 vaesdec(ymm1, ymm2, ptr [rcx + 64]); 226 vaesdec(zmm1, zmm2, ptr [rcx + 64]); 227 228 vaesdeclast(xmm20, xmm30, ptr [rax + 64]); 229 vaesdeclast(ymm20, ymm30, ptr [rax + 64]); 230 vaesdeclast(zmm20, zmm30, ptr [rax + 64]); 231 232 vaesenc(xmm20, xmm30, ptr [rcx + 64]); 233 vaesenc(ymm1, ymm2, ptr [rcx + 64]); 234 vaesenc(zmm1, zmm2, ptr [rcx + 64]); 235 236 vaesenclast(xmm20, xmm30, ptr [rax + 64]); 237 vaesenclast(ymm20, ymm30, ptr [rax + 64]); 238 vaesenclast(zmm20, zmm30, ptr [rax + 64]); 239 } 240 } c; 241 const uint8_t tbl[] = { 242 0x62, 0xE2, 0x0D, 0x00, 0xDE, 0x61, 0x04, 243 0xC4, 0xE2, 0x6D, 0xDE, 0x49, 0x40, 244 0x62, 0xF2, 0x6D, 0x48, 0xDE, 0x49, 0x01, 245 246 0x62, 0xE2, 0x0D, 0x00, 0xDF, 0x60, 0x04, 247 0x62, 0xE2, 0x0D, 0x20, 0xDF, 0x60, 0x02, 248 0x62, 0xE2, 0x0D, 0x40, 0xDF, 0x60, 0x01, 249 250 0x62, 0xE2, 0x0D, 0x00, 0xDC, 0x61, 0x04, 251 0xC4, 0xE2, 0x6D, 0xDC, 0x49, 0x40, 252 0x62, 0xF2, 0x6D, 0x48, 0xDC, 0x49, 0x01, 253 254 0x62, 0xE2, 0x0D, 0x00, 0xDD, 0x60, 0x04, 255 0x62, 0xE2, 0x0D, 0x20, 0xDD, 0x60, 0x02, 256 0x62, 0xE2, 0x0D, 0x40, 0xDD, 0x60, 0x01, 257 }; 258 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 259 CYBOZU_TEST_EQUAL(c.getSize(), n); 260 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 261 } 262 CYBOZU_TEST_AUTO(vpclmulqdq) 263 { 264 struct Code : Xbyak::CodeGenerator { 265 Code() 266 { 267 vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3); 268 vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3); 269 
vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3); 270 271 vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3); 272 vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3); 273 vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3); 274 } 275 } c; 276 const uint8_t tbl[] = { 277 0xc4, 0xe3, 0x61, 0x44, 0x50, 0x40, 0x03, 278 0xc4, 0xe3, 0x65, 0x44, 0x50, 0x40, 0x03, 279 0x62, 0xf3, 0x65, 0x48, 0x44, 0x50, 0x01, 0x03, 280 0x62, 0xe3, 0x65, 0x08, 0x44, 0x60, 0x04, 0x03, 281 0x62, 0xe3, 0x65, 0x28, 0x44, 0x60, 0x02, 0x03, 282 0x62, 0xe3, 0x65, 0x48, 0x44, 0x60, 0x01, 0x03, 283 }; 284 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 285 CYBOZU_TEST_EQUAL(c.getSize(), n); 286 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 287 } 288 CYBOZU_TEST_AUTO(vcompressb_w) 289 { 290 struct Code : Xbyak::CodeGenerator { 291 Code() 292 { 293 vcompressb(ptr[rax + 64], xmm1); 294 vcompressb(xmm30 | k5, xmm1); 295 vcompressb(ptr[rax + 64], ymm1); 296 vcompressb(ymm30 | k3 |T_z, ymm1); 297 vcompressb(ptr[rax + 64], zmm1); 298 vcompressb(zmm30 | k2 |T_z, zmm1); 299 300 vcompressw(ptr[rax + 64], xmm1); 301 vcompressw(xmm30 | k5, xmm1); 302 vcompressw(ptr[rax + 64], ymm1); 303 vcompressw(ymm30 | k3 |T_z, ymm1); 304 vcompressw(ptr[rax + 64], zmm1); 305 vcompressw(zmm30 | k2 |T_z, zmm1); 306 } 307 } c; 308 const uint8_t tbl[] = { 309 0x62, 0xf2, 0x7d, 0x08, 0x63, 0x48, 0x40, 310 0x62, 0x92, 0x7d, 0x0d, 0x63, 0xce, 311 0x62, 0xf2, 0x7d, 0x28, 0x63, 0x48, 0x40, 312 0x62, 0x92, 0x7d, 0xab, 0x63, 0xce, 313 0x62, 0xf2, 0x7d, 0x48, 0x63, 0x48, 0x40, 314 0x62, 0x92, 0x7d, 0xca, 0x63, 0xce, 315 316 0x62, 0xf2, 0xfd, 0x08, 0x63, 0x48, 0x20, 317 0x62, 0x92, 0xfd, 0x0d, 0x63, 0xce, 318 0x62, 0xf2, 0xfd, 0x28, 0x63, 0x48, 0x20, 319 0x62, 0x92, 0xfd, 0xab, 0x63, 0xce, 320 0x62, 0xf2, 0xfd, 0x48, 0x63, 0x48, 0x20, 321 0x62, 0x92, 0xfd, 0xca, 0x63, 0xce, 322 }; 323 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 324 CYBOZU_TEST_EQUAL(c.getSize(), n); 325 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 326 } 327 CYBOZU_TEST_AUTO(shld) 328 { 329 struct 
Code : Xbyak::CodeGenerator { 330 Code() 331 { 332 vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 333 vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 334 vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 335 336 vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 337 vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 338 vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 339 340 vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 341 vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 342 vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 343 344 vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 345 vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 346 vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 347 348 vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 349 vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 350 vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 351 352 vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 353 vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 354 vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 355 } 356 } c; 357 const uint8_t tbl[] = { 358 0x62, 0xf3, 0xed, 0x8b, 0x70, 0x68, 0x04, 0x05, 359 0x62, 0xf3, 0xed, 0xab, 0x70, 0x68, 0x02, 0x05, 360 0x62, 0xf3, 0xed, 0xcb, 0x70, 0x68, 0x01, 0x05, 361 362 0x62, 0xf3, 0x6d, 0x8b, 0x71, 0x68, 0x04, 0x05, 363 0x62, 0xf3, 0x6d, 0xab, 0x71, 0x68, 0x02, 0x05, 364 0x62, 0xf3, 0x6d, 0xcb, 0x71, 0x68, 0x01, 0x05, 365 366 0x62, 0xf3, 0xed, 0x8b, 0x71, 0x68, 0x04, 0x05, 367 0x62, 0xf3, 0xed, 0xab, 0x71, 0x68, 0x02, 0x05, 368 0x62, 0xf3, 0xed, 0xcb, 0x71, 0x68, 0x01, 0x05, 369 370 0x62, 0xf2, 0xed, 0x8b, 0x70, 0x68, 0x04, 371 0x62, 0xf2, 0xed, 0xab, 0x70, 0x68, 0x02, 372 0x62, 0xf2, 0xed, 0xcb, 0x70, 0x68, 0x01, 373 374 0x62, 0xf2, 0x6d, 0x8b, 0x71, 0x68, 0x04, 375 0x62, 0xf2, 0x6d, 0xab, 0x71, 0x68, 0x02, 376 0x62, 0xf2, 0x6d, 0xcb, 0x71, 0x68, 0x01, 377 378 0x62, 0xf2, 0xed, 0x8b, 0x71, 0x68, 0x04, 379 0x62, 0xf2, 0xed, 0xab, 0x71, 0x68, 0x02, 380 0x62, 0xf2, 0xed, 0xcb, 0x71, 0x68, 0x01, 381 }; 382 const size_t n = sizeof(tbl) / 
sizeof(tbl[0]); 383 CYBOZU_TEST_EQUAL(c.getSize(), n); 384 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 385 } 386 CYBOZU_TEST_AUTO(shrd) 387 { 388 struct Code : Xbyak::CodeGenerator { 389 Code() 390 { 391 vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 392 vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 393 vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 394 395 vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 396 vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 397 vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 398 399 vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5); 400 vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5); 401 vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5); 402 403 vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 404 vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 405 vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 406 407 vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 408 vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 409 vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 410 411 vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]); 412 vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]); 413 vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]); 414 415 vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); 416 vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); 417 vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); 418 419 vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5); 420 vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5); 421 vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5); 422 423 vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); 424 vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); 425 vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); 426 427 vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]); 428 vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]); 429 vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]); 430 } 431 } c; 432 const uint8_t tbl[] = { 433 0x62, 0xf3, 0xed, 0x8b, 0x72, 0x68, 0x04, 0x05, 434 0x62, 0xf3, 0xed, 0xab, 0x72, 0x68, 0x02, 0x05, 435 0x62, 0xf3, 0xed, 0xcb, 
0x72, 0x68, 0x01, 0x05, 436 437 0x62, 0xf3, 0x6d, 0x8b, 0x73, 0x68, 0x04, 0x05, 438 0x62, 0xf3, 0x6d, 0xab, 0x73, 0x68, 0x02, 0x05, 439 0x62, 0xf3, 0x6d, 0xcb, 0x73, 0x68, 0x01, 0x05, 440 441 0x62, 0xf3, 0xed, 0x8b, 0x73, 0x68, 0x04, 0x05, 442 0x62, 0xf3, 0xed, 0xab, 0x73, 0x68, 0x02, 0x05, 443 0x62, 0xf3, 0xed, 0xcb, 0x73, 0x68, 0x01, 0x05, 444 445 0x62, 0xf2, 0xed, 0x8b, 0x72, 0x68, 0x04, 446 0x62, 0xf2, 0xed, 0xab, 0x72, 0x68, 0x02, 447 0x62, 0xf2, 0xed, 0xcb, 0x72, 0x68, 0x01, 448 449 0x62, 0xf2, 0x6d, 0x8b, 0x73, 0x68, 0x04, 450 0x62, 0xf2, 0x6d, 0xab, 0x73, 0x68, 0x02, 451 0x62, 0xf2, 0x6d, 0xcb, 0x73, 0x68, 0x01, 452 453 0x62, 0xf2, 0xed, 0x8b, 0x73, 0x68, 0x04, 454 0x62, 0xf2, 0xed, 0xab, 0x73, 0x68, 0x02, 455 0x62, 0xf2, 0xed, 0xcb, 0x73, 0x68, 0x01, 456 457 0x62, 0xf3, 0x6d, 0x9b, 0x73, 0x68, 0x10, 0x05, 458 0x62, 0xf3, 0x6d, 0xbb, 0x73, 0x68, 0x10, 0x05, 459 0x62, 0xf3, 0x6d, 0xdb, 0x73, 0x68, 0x10, 0x05, 460 461 0x62, 0xf3, 0xed, 0x9b, 0x73, 0x68, 0x08, 0x05, 462 0x62, 0xf3, 0xed, 0xbb, 0x73, 0x68, 0x08, 0x05, 463 0x62, 0xf3, 0xed, 0xdb, 0x73, 0x68, 0x08, 0x05, 464 465 0x62, 0xf2, 0x6d, 0x9b, 0x73, 0x68, 0x10, 466 0x62, 0xf2, 0x6d, 0xbb, 0x73, 0x68, 0x10, 467 0x62, 0xf2, 0x6d, 0xdb, 0x73, 0x68, 0x10, 468 469 0x62, 0xf2, 0xed, 0x9b, 0x73, 0x68, 0x08, 470 0x62, 0xf2, 0xed, 0xbb, 0x73, 0x68, 0x08, 471 0x62, 0xf2, 0xed, 0xdb, 0x73, 0x68, 0x08, 472 }; 473 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 474 CYBOZU_TEST_EQUAL(c.getSize(), n); 475 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 476 } 477 CYBOZU_TEST_AUTO(vpopcnt) 478 { 479 struct Code : Xbyak::CodeGenerator { 480 Code() 481 { 482 vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]); 483 vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]); 484 vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]); 485 486 vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]); 487 vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]); 488 vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]); 489 490 vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]); 491 vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]); 492 
vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]); 493 494 vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]); 495 vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]); 496 vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]); 497 498 vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]); 499 vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]); 500 vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]); 501 502 vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]); 503 vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]); 504 vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]); 505 } 506 } c; 507 const uint8_t tbl[] = { 508 0x62, 0xf2, 0x7d, 0x8b, 0x54, 0x68, 0x04, 509 0x62, 0xf2, 0x7d, 0xab, 0x54, 0x68, 0x02, 510 0x62, 0xf2, 0x7d, 0xcb, 0x54, 0x68, 0x01, 511 512 0x62, 0xf2, 0xfd, 0x8b, 0x54, 0x68, 0x04, 513 0x62, 0xf2, 0xfd, 0xab, 0x54, 0x68, 0x02, 514 0x62, 0xf2, 0xfd, 0xcb, 0x54, 0x68, 0x01, 515 516 0x62, 0xf2, 0x7d, 0x8b, 0x55, 0x68, 0x04, 517 0x62, 0xf2, 0x7d, 0xab, 0x55, 0x68, 0x02, 518 0x62, 0xf2, 0x7d, 0xcb, 0x55, 0x68, 0x01, 519 520 0x62, 0xf2, 0x7d, 0x9b, 0x55, 0x68, 0x10, 521 0x62, 0xf2, 0x7d, 0xbb, 0x55, 0x68, 0x10, 522 0x62, 0xf2, 0x7d, 0xdb, 0x55, 0x68, 0x10, 523 524 0x62, 0xf2, 0xfd, 0x8b, 0x55, 0x68, 0x04, 525 0x62, 0xf2, 0xfd, 0xab, 0x55, 0x68, 0x02, 526 0x62, 0xf2, 0xfd, 0xcb, 0x55, 0x68, 0x01, 527 528 0x62, 0xf2, 0xfd, 0x9b, 0x55, 0x68, 0x08, 529 0x62, 0xf2, 0xfd, 0xbb, 0x55, 0x68, 0x08, 530 0x62, 0xf2, 0xfd, 0xdb, 0x55, 0x68, 0x08, 531 }; 532 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 533 CYBOZU_TEST_EQUAL(c.getSize(), n); 534 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 535 } 536 CYBOZU_TEST_AUTO(vpdpbus) 537 { 538 struct Code : Xbyak::CodeGenerator { 539 Code() 540 { 541 vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); 542 vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); 543 vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); 544 545 vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); 546 vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); 547 vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); 548 549 vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); 550 
vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); 551 vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); 552 553 vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); 554 vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); 555 vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); 556 557 vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); 558 vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); 559 vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); 560 561 vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); 562 vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); 563 vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); 564 565 vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]); 566 vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]); 567 vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]); 568 569 vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]); 570 vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]); 571 vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]); 572 } 573 } c; 574 const uint8_t tbl[] = { 575 0x62, 0xf2, 0x5d, 0x83, 0x50, 0x68, 0x04, 576 0x62, 0xf2, 0x5d, 0xa3, 0x50, 0x68, 0x02, 577 0x62, 0xf2, 0x5d, 0xc3, 0x50, 0x68, 0x01, 578 579 0x62, 0xf2, 0x5d, 0x93, 0x50, 0x68, 0x10, 580 0x62, 0xf2, 0x5d, 0xb3, 0x50, 0x68, 0x10, 581 0x62, 0xf2, 0x5d, 0xd3, 0x50, 0x68, 0x10, 582 583 0x62, 0xf2, 0x5d, 0x83, 0x51, 0x68, 0x04, 584 0x62, 0xf2, 0x5d, 0xa3, 0x51, 0x68, 0x02, 585 0x62, 0xf2, 0x5d, 0xc3, 0x51, 0x68, 0x01, 586 587 0x62, 0xf2, 0x5d, 0x93, 0x51, 0x68, 0x10, 588 0x62, 0xf2, 0x5d, 0xb3, 0x51, 0x68, 0x10, 589 0x62, 0xf2, 0x5d, 0xd3, 0x51, 0x68, 0x10, 590 591 0x62, 0xf2, 0x5d, 0x83, 0x52, 0x68, 0x04, 592 0x62, 0xf2, 0x5d, 0xa3, 0x52, 0x68, 0x02, 593 0x62, 0xf2, 0x5d, 0xc3, 0x52, 0x68, 0x01, 594 595 0x62, 0xf2, 0x5d, 0x93, 0x52, 0x68, 0x10, 596 0x62, 0xf2, 0x5d, 0xb3, 0x52, 0x68, 0x10, 597 0x62, 0xf2, 0x5d, 0xd3, 0x52, 0x68, 0x10, 598 599 0x62, 0xf2, 0x5d, 0x83, 0x53, 0x68, 0x04, 600 0x62, 0xf2, 0x5d, 0xa3, 0x53, 0x68, 0x02, 601 0x62, 0xf2, 0x5d, 0xc3, 0x53, 0x68, 0x01, 602 603 0x62, 0xf2, 0x5d, 0x93, 0x53, 
0x68, 0x10, 604 0x62, 0xf2, 0x5d, 0xb3, 0x53, 0x68, 0x10, 605 0x62, 0xf2, 0x5d, 0xd3, 0x53, 0x68, 0x10, 606 }; 607 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 608 CYBOZU_TEST_EQUAL(c.getSize(), n); 609 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 610 } 611 CYBOZU_TEST_AUTO(vexpand_vpshufbitqmb) 612 { 613 struct Code : Xbyak::CodeGenerator { 614 Code() 615 { 616 vpexpandb(xmm5|k3|T_z, xmm30); 617 vpexpandb(ymm5|k3|T_z, ymm30); 618 vpexpandb(zmm5|k3|T_z, zmm30); 619 vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]); 620 vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]); 621 vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]); 622 623 vpexpandw(xmm5|k3|T_z, xmm30); 624 vpexpandw(ymm5|k3|T_z, ymm30); 625 vpexpandw(zmm5|k3|T_z, zmm30); 626 vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]); 627 vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]); 628 vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]); 629 630 vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]); 631 vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]); 632 vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]); 633 } 634 } c; 635 const uint8_t tbl[] = { 636 0x62, 0x92, 0x7d, 0x8b, 0x62, 0xee, 637 0x62, 0x92, 0x7d, 0xab, 0x62, 0xee, 638 0x62, 0x92, 0x7d, 0xcb, 0x62, 0xee, 639 0x62, 0xf2, 0x7d, 0x8b, 0x62, 0x68, 0x40, 640 0x62, 0xf2, 0x7d, 0xab, 0x62, 0x68, 0x40, 641 0x62, 0xf2, 0x7d, 0xcb, 0x62, 0x68, 0x40, 642 643 0x62, 0x92, 0xfd, 0x8b, 0x62, 0xee, 644 0x62, 0x92, 0xfd, 0xab, 0x62, 0xee, 645 0x62, 0x92, 0xfd, 0xcb, 0x62, 0xee, 646 0x62, 0xf2, 0xfd, 0x8b, 0x62, 0x68, 0x20, 647 0x62, 0xf2, 0xfd, 0xab, 0x62, 0x68, 0x20, 648 0x62, 0xf2, 0xfd, 0xcb, 0x62, 0x68, 0x20, 649 650 0x62, 0xf2, 0x6d, 0x0a, 0x8f, 0x48, 0x04, 651 0x62, 0xf2, 0x6d, 0x2a, 0x8f, 0x48, 0x02, 652 0x62, 0xf2, 0x6d, 0x4a, 0x8f, 0x48, 0x01, 653 }; 654 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 655 CYBOZU_TEST_EQUAL(c.getSize(), n); 656 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 657 } 658 CYBOZU_TEST_AUTO(gf2) 659 { 660 struct Code : Xbyak::CodeGenerator { 661 Code() 662 { 663 /// 664 gf2p8affineinvqb(xmm1, xmm2, 3); 
665 gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3); 666 667 vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3); 668 vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3); 669 vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3); 670 vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3); 671 672 vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5); 673 vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5); 674 vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5); 675 676 vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); 677 vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); 678 vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); 679 680 vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); 681 vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); 682 vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); 683 /// 684 gf2p8affineqb(xmm1, xmm2, 3); 685 gf2p8affineqb(xmm1, ptr [rax + 0x40], 3); 686 687 vgf2p8affineqb(xmm1, xmm5, xmm2, 3); 688 vgf2p8affineqb(ymm1, ymm5, ymm2, 3); 689 vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3); 690 vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3); 691 692 vgf2p8affineqb(xmm30, xmm31, xmm4, 5); 693 vgf2p8affineqb(ymm30, ymm31, ymm4, 5); 694 vgf2p8affineqb(zmm30, zmm31, zmm4, 5); 695 696 vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5); 697 vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5); 698 vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5); 699 700 vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5); 701 vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5); 702 vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5); 703 /// 704 gf2p8mulb(xmm1, xmm2); 705 gf2p8mulb(xmm1, ptr [rax + 0x40]); 706 707 vgf2p8mulb(xmm1, xmm5, xmm2); 708 vgf2p8mulb(ymm1, ymm5, ymm2); 709 vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]); 710 vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]); 711 712 vgf2p8mulb(xmm30, xmm31, xmm4); 713 vgf2p8mulb(ymm30, ymm31, ymm4); 714 vgf2p8mulb(zmm30, zmm31, zmm4); 715 716 vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]); 717 
vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]); 718 vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]); 719 } 720 } c; 721 const uint8_t tbl[] = { 722 0x66, 0x0f, 0x3a, 0xcf, 0xca, 0x03, 723 0x66, 0x0f, 0x3a, 0xcf, 0x48, 0x40, 0x03, 724 0xc4, 0xe3, 0xd1, 0xcf, 0xca, 0x03, 725 0xc4, 0xe3, 0xd5, 0xcf, 0xca, 0x03, 726 0xc4, 0xe3, 0xd1, 0xcf, 0x48, 0x40, 0x03, 727 0xc4, 0xe3, 0xd5, 0xcf, 0x48, 0x40, 0x03, 728 0x62, 0x63, 0x85, 0x00, 0xcf, 0xf4, 0x05, 729 0x62, 0x63, 0x85, 0x20, 0xcf, 0xf4, 0x05, 730 0x62, 0x63, 0x85, 0x40, 0xcf, 0xf4, 0x05, 731 0x62, 0x63, 0xd5, 0x89, 0xcf, 0x70, 0x04, 0x05, 732 0x62, 0x63, 0xd5, 0xa9, 0xcf, 0x70, 0x02, 0x05, 733 0x62, 0x63, 0xd5, 0xc9, 0xcf, 0x70, 0x01, 0x05, 734 0x62, 0x63, 0xd5, 0x99, 0xcf, 0x70, 0x08, 0x05, 735 0x62, 0x63, 0xd5, 0xb9, 0xcf, 0x70, 0x08, 0x05, 736 0x62, 0x63, 0xd5, 0xd9, 0xcf, 0x70, 0x08, 0x05, 737 738 0x66, 0x0f, 0x3a, 0xce, 0xca, 0x03, 739 0x66, 0x0f, 0x3a, 0xce, 0x48, 0x40, 0x03, 740 0xc4, 0xe3, 0xd1, 0xce, 0xca, 0x03, 741 0xc4, 0xe3, 0xd5, 0xce, 0xca, 0x03, 742 0xc4, 0xe3, 0xd1, 0xce, 0x48, 0x40, 0x03, 743 0xc4, 0xe3, 0xd5, 0xce, 0x48, 0x40, 0x03, 744 0x62, 0x63, 0x85, 0x00, 0xce, 0xf4, 0x05, 745 0x62, 0x63, 0x85, 0x20, 0xce, 0xf4, 0x05, 746 0x62, 0x63, 0x85, 0x40, 0xce, 0xf4, 0x05, 747 0x62, 0x63, 0xd5, 0x89, 0xce, 0x70, 0x04, 0x05, 748 0x62, 0x63, 0xd5, 0xa9, 0xce, 0x70, 0x02, 0x05, 749 0x62, 0x63, 0xd5, 0xc9, 0xce, 0x70, 0x01, 0x05, 750 0x62, 0x63, 0xd5, 0x99, 0xce, 0x70, 0x08, 0x05, 751 0x62, 0x63, 0xd5, 0xb9, 0xce, 0x70, 0x08, 0x05, 752 0x62, 0x63, 0xd5, 0xd9, 0xce, 0x70, 0x08, 0x05, 753 754 0x66, 0x0f, 0x38, 0xcf, 0xca, 755 0x66, 0x0f, 0x38, 0xcf, 0x48, 0x40, 756 0xc4, 0xe2, 0x51, 0xcf, 0xca, 757 0xc4, 0xe2, 0x55, 0xcf, 0xca, 758 0xc4, 0xe2, 0x51, 0xcf, 0x48, 0x40, 759 0xc4, 0xe2, 0x55, 0xcf, 0x48, 0x40, 760 0x62, 0x62, 0x05, 0x00, 0xcf, 0xf4, 761 0x62, 0x62, 0x05, 0x20, 0xcf, 0xf4, 762 0x62, 0x62, 0x05, 0x40, 0xcf, 0xf4, 763 0x62, 0x62, 0x55, 0x89, 0xcf, 0x70, 0x04, 764 0x62, 0x62, 0x55, 0xa9, 0xcf, 0x70, 
0x02, 765 0x62, 0x62, 0x55, 0xc9, 0xcf, 0x70, 0x01, 766 }; 767 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 768 CYBOZU_TEST_EQUAL(c.getSize(), n); 769 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 770 } 771 772 CYBOZU_TEST_AUTO(bf16) 773 { 774 struct Code : Xbyak::CodeGenerator { 775 Code() 776 { 777 vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]); 778 vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]); 779 vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]); 780 781 vcvtneps2bf16(xmm0, xword [rax + 64]); 782 vcvtneps2bf16(xmm0 | k1, yword [rax + 64]); 783 vcvtneps2bf16(ymm0 | k1, zword [rax + 64]); 784 vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]); 785 786 vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]); 787 vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]); 788 vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]); 789 } 790 } c; 791 const uint8_t tbl[] = { 792 0x62, 0xf2, 0x77, 0x09, 0x72, 0x40, 0x04, 793 0x62, 0xf2, 0x7f, 0xa9, 0x72, 0x40, 0x02, 794 0x62, 0xf2, 0x77, 0x49, 0x72, 0x40, 0x01, 795 796 0x62, 0xf2, 0x7e, 0x08, 0x72, 0x40, 0x04, 797 0x62, 0xf2, 0x7e, 0x29, 0x72, 0x40, 0x02, 798 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, 799 0x62, 0xf2, 0x7e, 0x49, 0x72, 0x40, 0x01, 800 801 0x62, 0xf2, 0x76, 0x09, 0x52, 0x40, 0x04, 802 0x62, 0xf2, 0x76, 0x29, 0x52, 0x40, 0x02, 803 0x62, 0xf2, 0x76, 0x49, 0x52, 0x40, 0x01, 804 }; 805 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 806 CYBOZU_TEST_EQUAL(c.getSize(), n); 807 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 808 } 809 810 CYBOZU_TEST_AUTO(AMX) 811 { 812 struct Code : Xbyak::CodeGenerator { 813 Code() 814 { 815 ldtilecfg(ptr[rax + rcx * 4 + 64]); 816 sttilecfg(ptr[rsp + rax * 8 + 128]); 817 tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]); 818 tileloaddt1(tmm4, ptr[r8 + r9 + 32]); 819 tilerelease(); 820 tilestored(ptr[r10 + r11 * 2 + 32], tmm2); 821 tilezero(tmm7); 822 tdpbssd(tmm1, tmm2, tmm3); 823 tdpbsud(tmm2, tmm3, tmm4); 824 tdpbusd(tmm3, tmm4, tmm5); 825 tdpbuud(tmm4, tmm5, tmm6); 826 tdpbf16ps(tmm5, tmm6, tmm7); 827 } 828 } c; 829 // 
generated code by patch 830 const uint8_t tbl[] = { 831 0xc4, 0xe2, 0x78, 0x49, 0x44, 0x88, 0x40, 0xc4, 0xe2, 0x79, 0x49, 0x84, 0xc4, 0x80, 0x00, 0x00, 832 0x00, 0xc4, 0xe2, 0x7b, 0x4b, 0x5c, 0x57, 0x08, 0xc4, 0x82, 0x79, 0x4b, 0x64, 0x08, 0x20, 0xc4, 833 0xe2, 0x78, 0x49, 0xc0, 0xc4, 0x82, 0x7a, 0x4b, 0x54, 0x5a, 0x20, 0xc4, 0xe2, 0x7b, 0x49, 0xf8, 834 0xc4, 0xe2, 0x63, 0x5e, 0xca, 0xc4, 0xe2, 0x5a, 0x5e, 0xd3, 0xc4, 0xe2, 0x51, 0x5e, 0xdc, 0xc4, 835 0xe2, 0x48, 0x5e, 0xe5, 0xc4, 0xe2, 0x42, 0x5c, 0xee, 836 }; 837 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 838 CYBOZU_TEST_EQUAL(c.getSize(), n); 839 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 840 } 841 842 CYBOZU_TEST_AUTO(tileloadd) 843 { 844 struct Code : Xbyak::CodeGenerator { 845 Code() 846 { 847 tileloadd(tmm1, ptr[r8+r8]); 848 tileloadd(tmm1, ptr[rax+rcx*4]); 849 tileloadd(tmm1, ptr[r8+r9*1+0x40]); 850 } 851 void notSupported() 852 { 853 tileloadd(tmm1, ptr[r8]); 854 } 855 void notSupported2() 856 { 857 tileloadd(tmm1, ptr[r8*2]); 858 } 859 } c; 860 const uint8_t tbl[] = { 861 0xC4, 0x82, 0x7B, 0x4B, 0x0C, 0x00, 862 0xC4, 0xE2, 0x7B, 0x4B, 0x0C, 0x88, 863 0xC4, 0x82, 0x7B, 0x4B, 0x4C, 0x08, 0x40, 864 }; 865 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 866 CYBOZU_TEST_EQUAL(c.getSize(), n); 867 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 868 869 // current version does not support this sibmem format 870 CYBOZU_TEST_EXCEPTION(c.notSupported(), std::exception); 871 CYBOZU_TEST_EXCEPTION(c.notSupported2(), std::exception); 872 } 873 874 CYBOZU_TEST_AUTO(vnni) 875 { 876 struct Code : Xbyak::CodeGenerator { 877 Code() 878 { 879 // default encoding is EVEX 880 vpdpbusd(xm0, xm1, xm2); 881 vpdpbusd(xm0, xm1, xm2, EvexEncoding); // EVEX 882 vpdpbusd(xm0, xm1, xm2, VexEncoding); // VEX 883 setDefaultEncoding(VexEncoding); 884 vpdpbusd(xm0, xm1, xm2); // VEX 885 setDefaultEncoding(EvexEncoding); 886 vpdpbusd(xm0, xm1, xm2); // EVEX 887 } 888 void badVex() 889 { 890 vpdpbusd(xm0, xm1, xm31, VexEncoding); 891 } 
892 } c; 893 const uint8_t tbl[] = { 894 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 895 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 896 0xC4, 0xE2, 0x71, 0x50, 0xC2, 897 0xC4, 0xE2, 0x71, 0x50, 0xC2, 898 0x62, 0xF2, 0x75, 0x08, 0x50, 0xC2, 899 }; 900 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 901 CYBOZU_TEST_EQUAL(c.getSize(), n); 902 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 903 904 CYBOZU_TEST_EXCEPTION(c.badVex(), std::exception); 905 } 906 907 CYBOZU_TEST_AUTO(vaddph) 908 { 909 struct Code : Xbyak::CodeGenerator { 910 Code() 911 { 912 vaddph(zmm0, zmm1, ptr[rax+64]); 913 vaddph(ymm0, ymm1, ptr[rax+64]); 914 vaddph(xmm0, xmm1, ptr[rax+64]); 915 916 vaddph(zmm0, zmm1, ptr_b[rax+64]); 917 vaddph(ymm0, ymm1, ptr_b[rax+64]); 918 vaddph(xmm0, xmm1, ptr_b[rax+64]); 919 920 vaddsh(xmm0, xmm15, ptr[rax+64]); 921 vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3); 922 923 vcmpph(k1, xm15, ptr[rax+64], 1); 924 vcmpph(k2, ym15, ptr[rax+64], 2); 925 vcmpph(k3, zm15, ptr[rax+64], 3); 926 vcmpph(k1, xm15, ptr_b[rax+64], 1); 927 vcmpph(k2, ym15, ptr_b[rax+64], 2); 928 vcmpph(k3, zm15, ptr_b[rax+64], 3); 929 930 vcmpsh(k1, xm15, ptr[rax+64], 1); 931 vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4); 932 933 vcomish(xmm1, ptr[rax+64]); 934 vcomish(xmm1|T_sae, xmm15); 935 936 vucomish(xmm1, ptr [rax+0x40]); 937 vucomish(xmm1|T_sae, xmm15); 938 939 vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]); 940 vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]); 941 vfmaddsub213ph(xmm1|k3, xmm2, xmm5); 942 vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]); 943 vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]); 944 vfmaddsub213ph(ymm1|k3, ymm2, ymm5); 945 vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]); 946 vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]); 947 vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5); 948 949 vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]); 950 vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); 951 vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]); 952 vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); 953 vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]); 954 
vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); 955 vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5); 956 957 vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]); 958 vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]); 959 vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]); 960 vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]); 961 vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]); 962 vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]); 963 vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5); 964 965 vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]); 966 vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]); 967 vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]); 968 vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); 969 vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]); 970 vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); 971 vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); 972 973 vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]); 974 vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]); 975 vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]); 976 vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5); 977 978 vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 979 vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]); 980 981 vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 982 vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]); 983 984 vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 985 vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]); 986 vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 987 vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]); 988 989 vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]); 990 vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]); 991 vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]); 992 vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5); 993 vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]); 994 vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); 995 vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]); 996 997 vfmaddcph(xm1, xm2, ptr[rax+0x40]); 998 vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]); 999 vfmaddcph(zm1, zm2, ptr_b[rax+0x40]); 1000 1001 vfcmulcph(xmm1, xmm2, ptr [rax+0x40]); 1002 vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); 1003 vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]); 1004 1005 vfmulcph(xmm1, 
xmm2, ptr [rax+0x40]); 1006 vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]); 1007 vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]); 1008 1009 vrcpph(xmm1, ptr [rax+0x40]); 1010 vrcpph(xmm1, ptr_b [rax+0x40]); 1011 vrcpph(ymm1, ptr [rax+0x40]); 1012 vrcpph(ymm1, ptr_b [rax+0x40]); 1013 vrcpph(zmm1, ptr [rax+0x40]); 1014 vrcpph(zmm1, ptr_b [rax+0x40]); 1015 1016 vrcpsh(xmm1, xmm3, ptr [rax+0x40]); 1017 1018 vrsqrtph(xmm1, ptr [rax+0x40]); 1019 vrsqrtph(xmm1, ptr_b [rax+0x40]); 1020 vrsqrtph(ymm2, ptr [rax+0x40]); 1021 vrsqrtph(ymm2, ptr_b [rax+0x40]); 1022 vrsqrtph(zmm2, ptr [rax+0x40]); 1023 vrsqrtph(zmm2, ptr_b [rax+0x40]); 1024 1025 vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]); 1026 1027 vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]); 1028 vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]); 1029 vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]); 1030 vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]); 1031 vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]); 1032 1033 vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]); 1034 vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7); 1035 1036 vscalefph(xmm1, xmm5, ptr [rax+0x40]); 1037 vscalefph(xmm1, xmm5, ptr_b [rax+0x40]); 1038 vscalefph(ymm1, ymm5, ptr [rax+0x40]); 1039 vscalefph(ymm1, ymm5, ptr_b [rax+0x40]); 1040 vscalefph(zmm1, zmm5, ptr [rax+0x40]); 1041 vscalefph(zmm1, zmm5, ptr_b [rax+0x40]); 1042 vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7); 1043 1044 vscalefsh(xmm1, xmm5, ptr [rax+0x40]); 1045 vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7); 1046 1047 vreduceph(xmm1, ptr [rax+0x40], 0x1); 1048 vreduceph(xmm1, ptr_b [rax+0x40], 0x2); 1049 vreduceph(ymm1, ptr [rax+0x40], 0x3); 1050 vreduceph(ymm1, ptr_b [rax+0x40], 0x4); 1051 vreduceph(zmm1, ptr [rax+0x40], 0x5); 1052 vreduceph(zmm1, ptr_b [rax+0x40], 0x6); 1053 vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7); 1054 1055 vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1); 1056 vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); 1057 1058 vrndscaleph(xmm1, ptr [rax+0x40], 0x1); 1059 vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2); 1060 vrndscaleph(ymm1, ptr [rax+0x40], 0x3); 1061 
vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4); 1062 vrndscaleph(zmm1, ptr [rax+0x40], 0x5); 1063 vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6); 1064 vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7); 1065 1066 vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1); 1067 vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2); 1068 1069 vfpclassph(k1, xword [rax+0x40], 0x1); 1070 vfpclassph(k1, xword_b[rax+0x40], 0x2); 1071 vfpclassph(k1, yword [rax+0x40], 0x3); 1072 vfpclassph(k1, yword_b[rax+0x40], 0x4); 1073 vfpclassph(k1, zword [rax+0x40], 0x5); 1074 vfpclassph(k1, zword_b[rax+0x40], 0x6); 1075 1076 vfpclasssh(k1|k2, xmm3, 0x5); 1077 vfpclasssh(k1|k2, ptr [rax+0x40], 0x5); 1078 1079 vgetexpph(xmm1, ptr [rax+0x40]); 1080 vgetexpph(ymm1, ptr_b [rax+0x40]); 1081 vgetexpph(zmm1, ptr [rax+0x40]); 1082 vgetexpph(zmm1|k1|T_z|T_sae, zmm5); 1083 vgetexpsh(xmm1, xmm5, ptr [rax+0x40]); 1084 vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5); 1085 1086 vgetmantph(xmm1, ptr [rax+0x40], 0x1); 1087 vgetmantph(ymm1, ptr_b [rax+0x40], 0x2); 1088 vgetmantph(zmm1, ptr [rax+0x40], 0x3); 1089 vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4); 1090 1091 vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5); 1092 vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6); 1093 1094 vmovsh(xmm1|k1|T_z, ptr [rax+0x40]); 1095 vmovsh(ptr [rax+0x40]|k1, xmm1); 1096 vmovsh(xmm1|k2|T_z, xmm3, xmm5); 1097 1098 vmovw(xmm1, r13d); 1099 vmovw(xmm3, ptr [rax+0x40]); 1100 vmovw(r9d, xmm1); 1101 vmovw(ptr [rax+0x40], xmm7); 1102 1103 vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 1104 vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]); 1105 1106 vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3); 1107 vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]); 1108 1109 vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3); 1110 vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]); 1111 1112 vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3); 1113 vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]); 1114 1115 vcvtsh2si(edx|T_rd_sae, xmm1); 1116 vcvtsh2si(edx, ptr [rax+0x40]); 1117 vcvtsh2si(rdx|T_rd_sae, xmm1); 1118 vcvtsh2si(r8, ptr [rax+0x40]); 1119 1120 
vcvtph2dq(xmm1, xmm5); 1121 vcvtph2dq(xmm1, ptr [rax+0x40]); 1122 vcvtph2dq(xmm1, ptr_b [rax+0x40]); 1123 vcvtph2dq(ymm1|k2|T_z, xmm5); 1124 vcvtph2dq(ymm1, ptr [rax+0x40]); 1125 vcvtph2dq(ymm1, ptr_b [rax+0x40]); 1126 vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3); 1127 vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]); 1128 vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1129 1130 vcvtph2psx(xmm1, xmm5); 1131 vcvtph2psx(xmm1, ptr [rax+0x40]); 1132 vcvtph2psx(xmm1, ptr_b [rax+0x40]); 1133 vcvtph2psx(ymm1|k2|T_z, xmm5); 1134 vcvtph2psx(ymm1, ptr [rax+0x40]); 1135 vcvtph2psx(ymm1, ptr_b [rax+0x40]); 1136 vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3); 1137 vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]); 1138 vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]); 1139 1140 vcvtph2udq(xmm1, xmm5); 1141 vcvtph2udq(xmm1, ptr [rax+0x40]); 1142 vcvtph2udq(xmm1, ptr_b [rax+0x40]); 1143 vcvtph2udq(ymm1|k2|T_z, xmm5); 1144 vcvtph2udq(ymm1, ptr [rax+0x40]); 1145 vcvtph2udq(ymm1, ptr_b [rax+0x40]); 1146 vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3); 1147 vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]); 1148 vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1149 1150 vcvttph2dq(xmm1, xmm5); 1151 vcvttph2dq(xmm1, ptr [rax+0x40]); 1152 vcvttph2dq(xmm1, ptr_b [rax+0x40]); 1153 vcvttph2dq(ymm1|k2|T_z, xmm5); 1154 vcvttph2dq(ymm1, ptr [rax+0x40]); 1155 vcvttph2dq(ymm1, ptr_b [rax+0x40]); 1156 vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3); 1157 vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]); 1158 vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1159 1160 vcvttph2udq(xmm1, xmm5); 1161 vcvttph2udq(xmm1, ptr [rax+0x40]); 1162 vcvttph2udq(xmm1, ptr_b [rax+0x40]); 1163 vcvttph2udq(ymm1|k2|T_z, xmm5); 1164 vcvttph2udq(ymm1, ptr [rax+0x40]); 1165 vcvttph2udq(ymm1, ptr_b [rax+0x40]); 1166 vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3); 1167 vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]); 1168 vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1169 1170 1171 vcvtph2pd(xmm1, xmm5); 1172 vcvtph2pd(xmm1, ptr [rax+0x40]); 1173 vcvtph2pd(xmm1, ptr_b [rax+0x40]); 1174 vcvtph2pd(ymm1|k2|T_z, xmm5); 1175 vcvtph2pd(ymm1, 
ptr [rax+0x40]); 1176 vcvtph2pd(ymm1, ptr_b [rax+0x40]); 1177 vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3); 1178 vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]); 1179 vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]); 1180 1181 vcvtph2qq(xmm1, xmm5); 1182 vcvtph2qq(xmm1, ptr [rax+0x40]); 1183 vcvtph2qq(xmm1, ptr_b [rax+0x40]); 1184 vcvtph2qq(ymm1|k2|T_z, xmm5); 1185 vcvtph2qq(ymm1, ptr [rax+0x40]); 1186 vcvtph2qq(ymm1, ptr_b [rax+0x40]); 1187 vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3); 1188 vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]); 1189 vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1190 1191 vcvtph2uqq(xmm1, xmm5); 1192 vcvtph2uqq(xmm1, ptr [rax+0x40]); 1193 vcvtph2uqq(xmm1, ptr_b [rax+0x40]); 1194 vcvtph2uqq(ymm1|k2|T_z, xmm5); 1195 vcvtph2uqq(ymm1, ptr [rax+0x40]); 1196 vcvtph2uqq(ymm1, ptr_b [rax+0x40]); 1197 vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3); 1198 vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); 1199 vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1200 1201 vcvttph2uqq(xmm1, xmm5); 1202 vcvttph2uqq(xmm1, ptr [rax+0x40]); 1203 vcvttph2uqq(xmm1, ptr_b [rax+0x40]); 1204 vcvttph2uqq(ymm1|k2|T_z, xmm5); 1205 vcvttph2uqq(ymm1, ptr [rax+0x40]); 1206 vcvttph2uqq(ymm1, ptr_b [rax+0x40]); 1207 vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3); 1208 vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]); 1209 vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1210 1211 vcvtdq2ph(xmm1, xmm5); 1212 vcvtdq2ph(xmm1, xword [rax+0x40]); 1213 vcvtdq2ph(xmm1, xword_b [rax+0x40]); 1214 vcvtdq2ph(xmm1, yword [rax+0x40]); 1215 vcvtdq2ph(xmm1, yword_b [rax+0x40]); 1216 vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); 1217 vcvtdq2ph(ymm1, ptr [rax+0x40]); 1218 vcvtdq2ph(ymm1, ptr_b [rax+0x40]); 1219 1220 vcvtps2phx(xmm1, xmm5); 1221 vcvtps2phx(xmm1, xword [rax+0x40]); 1222 vcvtps2phx(xmm1, xword_b [rax+0x40]); 1223 vcvtps2phx(xmm1, yword [rax+0x40]); 1224 vcvtps2phx(xmm1, yword_b [rax+0x40]); 1225 vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5); 1226 vcvtps2phx(ymm1, ptr [rax+0x40]); 1227 vcvtps2phx(ymm1, ptr_b [rax+0x40]); 1228 1229 vcvtudq2ph(xmm1, xmm5); 1230 vcvtudq2ph(xmm1, 
xword [rax+0x40]); 1231 vcvtudq2ph(xmm1, xword_b [rax+0x40]); 1232 vcvtudq2ph(xmm1, yword [rax+0x40]); 1233 vcvtudq2ph(xmm1, yword_b [rax+0x40]); 1234 vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5); 1235 vcvtudq2ph(ymm1, ptr [rax+0x40]); 1236 vcvtudq2ph(ymm1, ptr_b [rax+0x40]); 1237 1238 vcvtpd2ph(xmm1, xmm5); 1239 vcvtpd2ph(xmm1, ymm5); 1240 vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5); 1241 vcvtpd2ph(xmm1, xword [rax+0x40]); 1242 vcvtpd2ph(xmm1, xword_b [rax+0x40]); 1243 vcvtpd2ph(xmm1, yword [rax+0x40]); 1244 vcvtpd2ph(xmm1, yword_b [rax+0x40]); 1245 vcvtpd2ph(xmm1, zword [rax+0x40]); 1246 vcvtpd2ph(xmm1, zword_b [rax+0x40]); 1247 1248 vcvtqq2ph(xmm1, xmm5); 1249 vcvtqq2ph(xmm1, ymm5); 1250 vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); 1251 vcvtqq2ph(xmm1, xword [rax+0x40]); 1252 vcvtqq2ph(xmm1, xword_b [rax+0x40]); 1253 vcvtqq2ph(xmm1, yword [rax+0x40]); 1254 vcvtqq2ph(xmm1, yword_b [rax+0x40]); 1255 vcvtqq2ph(xmm1, zword [rax+0x40]); 1256 vcvtqq2ph(xmm1, zword_b [rax+0x40]); 1257 1258 vcvtuqq2ph(xmm1, xmm5); 1259 vcvtuqq2ph(xmm1, ymm5); 1260 vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5); 1261 vcvtuqq2ph(xmm1, xword [rax+0x40]); 1262 vcvtuqq2ph(xmm1, xword_b [rax+0x40]); 1263 vcvtuqq2ph(xmm1, yword [rax+0x40]); 1264 vcvtuqq2ph(xmm1, yword_b [rax+0x40]); 1265 vcvtuqq2ph(xmm1, zword [rax+0x40]); 1266 vcvtuqq2ph(xmm1, zword_b [rax+0x40]); 1267 1268 vcvtph2uw(xmm1, xmm5); 1269 vcvtph2uw(xmm1, ptr [rax+0x40]); 1270 vcvtph2uw(xmm1, ptr_b [rax+0x40]); 1271 vcvtph2uw(ymm1, ptr [rax+0x40]); 1272 vcvtph2uw(ymm1, ptr_b [rax+0x40]); 1273 vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5); 1274 vcvtph2uw(zmm1, ptr [rax+0x40]); 1275 vcvtph2uw(zmm1, ptr_b [rax+0x40]); 1276 1277 vcvtph2w(xmm1, xmm5); 1278 vcvtph2w(xmm1, ptr [rax+0x40]); 1279 vcvtph2w(xmm1, ptr_b [rax+0x40]); 1280 vcvtph2w(ymm1, ptr [rax+0x40]); 1281 vcvtph2w(ymm1, ptr_b [rax+0x40]); 1282 vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5); 1283 vcvtph2w(zmm1, ptr [rax+0x40]); 1284 vcvtph2w(zmm1, ptr_b [rax+0x40]); 1285 1286 vcvttph2uw(xmm1, xmm5); 1287 
vcvttph2uw(xmm1, ptr [rax+0x40]); 1288 vcvttph2uw(xmm1, ptr_b [rax+0x40]); 1289 vcvttph2uw(ymm1, ptr [rax+0x40]); 1290 vcvttph2uw(ymm1, ptr_b [rax+0x40]); 1291 vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5); 1292 vcvttph2uw(zmm1, ptr [rax+0x40]); 1293 vcvttph2uw(zmm1, ptr_b [rax+0x40]); 1294 1295 vcvttph2w(xmm1, xmm5); 1296 vcvttph2w(xmm1, ptr [rax+0x40]); 1297 vcvttph2w(xmm1, ptr_b [rax+0x40]); 1298 vcvttph2w(ymm1, ptr [rax+0x40]); 1299 vcvttph2w(ymm1, ptr_b [rax+0x40]); 1300 vcvttph2w(zmm1|k2|T_z|T_sae, zmm5); 1301 vcvttph2w(zmm1, ptr [rax+0x40]); 1302 vcvttph2w(zmm1, ptr_b [rax+0x40]); 1303 1304 vcvtuw2ph(xmm1, xmm5); 1305 vcvtuw2ph(xmm1, ptr [rax+0x40]); 1306 vcvtuw2ph(xmm1, ptr_b [rax+0x40]); 1307 vcvtuw2ph(ymm1, ptr [rax+0x40]); 1308 vcvtuw2ph(ymm1, ptr_b [rax+0x40]); 1309 vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); 1310 vcvtuw2ph(zmm1, ptr [rax+0x40]); 1311 vcvtuw2ph(zmm1, ptr_b [rax+0x40]); 1312 1313 vcvtw2ph(xmm1, xmm5); 1314 vcvtw2ph(xmm1, ptr [rax+0x40]); 1315 vcvtw2ph(xmm1, ptr_b [rax+0x40]); 1316 vcvtw2ph(ymm1, ptr [rax+0x40]); 1317 vcvtw2ph(ymm1, ptr_b [rax+0x40]); 1318 vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5); 1319 vcvtw2ph(zmm1, ptr [rax+0x40]); 1320 vcvtw2ph(zmm1, ptr_b [rax+0x40]); 1321 1322 vcvtps2ph(xmm1, xmm2, 0x1); 1323 vcvtps2ph(ptr [rax+0x40], xmm2, 0x2); 1324 vcvtps2ph(xmm1, ymm2, 0x3); 1325 vcvtps2ph(ptr [rax+0x40], ymm2, 0x4); 1326 vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5); 1327 vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6); 1328 vcvtps2ph(xmm1|k2, ymm4, 0x7); 1329 vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8); 1330 vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9); 1331 vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa); 1332 1333 vcvtsh2usi(ecx|T_rd_sae, xmm1); 1334 vcvtsh2usi(eax, ptr [rax+0x40]); 1335 vcvtsh2usi(r9|T_rd_sae, xmm1); 1336 vcvtsh2usi(r13, ptr [rax+0x40]); 1337 1338 vcvttsh2si(ecx|T_sae, xmm1); 1339 vcvttsh2si(eax, ptr [rax+0x40]); 1340 vcvttsh2si(r9|T_sae, xmm1); 1341 vcvttsh2si(r13, ptr [rax+0x40]); 1342 1343 vcvttsh2usi(ecx|T_sae, xmm1); 1344 vcvttsh2usi(eax, ptr [rax+0x40]); 1345 
vcvttsh2usi(r9|T_sae, xmm1); 1346 vcvttsh2usi(r13, ptr [rax+0x40]); 1347 1348 vcvttph2qq(xmm1, xmm5); 1349 vcvttph2qq(xmm1, ptr [rax+0x40]); 1350 vcvttph2qq(xmm1, ptr_b [rax+0x40]); 1351 vcvttph2qq(ymm1|k2|T_z, xmm5); 1352 vcvttph2qq(ymm1, ptr [rax+0x40]); 1353 vcvttph2qq(ymm1, ptr_b [rax+0x40]); 1354 vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3); 1355 vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]); 1356 vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]); 1357 1358 vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax); 1359 vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]); 1360 vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9); 1361 vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]); 1362 1363 vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax); 1364 vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]); 1365 vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9); 1366 vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]); 1367 } 1368 } c; 1369 const uint8_t tbl[] = { 1370 // vaddph 1371 0x62, 0xF5, 0x74, 0x48, 0x58, 0x40, 0x01, 1372 0x62, 0xF5, 0x74, 0x28, 0x58, 0x40, 0x02, 1373 0x62, 0xF5, 0x74, 0x08, 0x58, 0x40, 0x04, 1374 1375 0x62, 0xF5, 0x74, 0x58, 0x58, 0x40, 0x20, 1376 0x62, 0xF5, 0x74, 0x38, 0x58, 0x40, 0x20, 1377 0x62, 0xF5, 0x74, 0x18, 0x58, 0x40, 0x20, 1378 1379 // vaddsh 1380 0x62, 0xF5, 0x06, 0x08, 0x58, 0x40, 0x20, 1381 0x62, 0xF5, 0x06, 0xBD, 0x58, 0xC3, 1382 1383 // vcmpph 1384 0x62, 0xf3, 0x04, 0x08, 0xc2, 0x48, 0x04, 0x01, 1385 0x62, 0xf3, 0x04, 0x28, 0xc2, 0x50, 0x02, 0x02, 1386 0x62, 0xf3, 0x04, 0x48, 0xc2, 0x58, 0x01, 0x03, 1387 0x62, 0xf3, 0x04, 0x18, 0xc2, 0x48, 0x20, 0x01, 1388 0x62, 0xf3, 0x04, 0x38, 0xc2, 0x50, 0x20, 0x02, 1389 0x62, 0xf3, 0x04, 0x58, 0xc2, 0x58, 0x20, 0x03, 1390 1391 // vcmpsh 1392 0x62, 0xf3, 0x06, 0x08, 0xc2, 0x48, 0x20, 0x01, 1393 0x62, 0x93, 0x76, 0x1d, 0xc2, 0xd9, 0x04, 1394 1395 // vcomish 1396 0x62, 0xf5, 0x7c, 0x08, 0x2f, 0x48, 0x20, 1397 0x62, 0xd5, 0x7c, 0x18, 0x2f, 0xcf, 1398 1399 // vucomish 1400 0x62, 0xf5, 0x7c, 0x08, 0x2e, 0x48, 0x20, 1401 0x62, 0xd5, 0x7c, 0x18, 0x2e, 0xcf, 1402 1403 // vfmaddsub213ph 1404 0x62, 0xf6, 0x6d, 0x08, 0xa6, 0x48, 
0x04, 1405 0x62, 0xf6, 0x6d, 0x18, 0xa6, 0x48, 0x20, 1406 0x62, 0xf6, 0x6d, 0x0b, 0xa6, 0xcd, 1407 0x62, 0xf6, 0x6d, 0x28, 0xa6, 0x48, 0x02, 1408 0x62, 0xf6, 0x6d, 0x38, 0xa6, 0x48, 0x20, 1409 0x62, 0xf6, 0x6d, 0x2b, 0xa6, 0xcd, 1410 0x62, 0xf6, 0x6d, 0x48, 0xa6, 0x48, 0x01, 1411 0x62, 0xf6, 0x6d, 0x58, 0xa6, 0x48, 0x20, 1412 0x62, 0xf6, 0x6d, 0x58, 0xa6, 0xcd, 1413 1414 // vfmsubadd132ph 1415 0x62, 0xf6, 0x6d, 0x08, 0x97, 0x48, 0x04, 1416 0x62, 0xf6, 0x6d, 0x18, 0x97, 0x48, 0x20, 1417 0x62, 0xf6, 0x6d, 0x28, 0x97, 0x48, 0x02, 1418 0x62, 0xf6, 0x6d, 0x38, 0x97, 0x48, 0x20, 1419 0x62, 0xf6, 0x6d, 0x48, 0x97, 0x48, 0x01, 1420 0x62, 0xf6, 0x6d, 0x58, 0x97, 0x48, 0x20, 1421 0x62, 0xf6, 0x6d, 0x58, 0x97, 0xcd, 1422 1423 // vfmadd132ph 1424 0x62, 0xf6, 0x6d, 0x08, 0x98, 0x48, 0x04, 1425 0x62, 0xf6, 0x6d, 0x18, 0x98, 0x48, 0x20, 1426 0x62, 0xf6, 0x6d, 0x28, 0x98, 0x48, 0x02, 1427 0x62, 0xf6, 0x6d, 0x38, 0x98, 0x48, 0x20, 1428 0x62, 0xf6, 0x6d, 0x48, 0x98, 0x48, 0x01, 1429 0x62, 0xf6, 0x6d, 0x58, 0x98, 0x48, 0x20, 1430 0x62, 0xf6, 0x6d, 0x38, 0x98, 0xcd, 1431 1432 // vfmsub231ph 1433 0x62, 0xf6, 0x6d, 0x08, 0xba, 0x48, 0x04, 1434 0x62, 0xf6, 0x6d, 0x18, 0xba, 0x48, 0x20, 1435 0x62, 0xf6, 0x6d, 0x28, 0xba, 0x48, 0x02, 1436 0x62, 0xf6, 0x6d, 0x38, 0xba, 0x48, 0x20, 1437 0x62, 0xf6, 0x6d, 0x48, 0xba, 0x48, 0x01, 1438 0x62, 0xf6, 0x6d, 0x58, 0xba, 0x48, 0x20, 1439 0x62, 0xf6, 0x6d, 0x38, 0xba, 0xcd, 1440 1441 // vfnmsub231ph 1442 0x62, 0xf6, 0x6d, 0x08, 0xbe, 0x48, 0x04, 1443 0x62, 0xf6, 0x6d, 0x38, 0xbe, 0x48, 0x20, 1444 0x62, 0xf6, 0x6d, 0x58, 0xbe, 0x48, 0x20, 1445 0x62, 0xf6, 0x6d, 0x38, 0xbe, 0xcd, 1446 1447 // vfmadd132sh 1448 0x62, 0xf6, 0x6d, 0xb9, 0x99, 0xcb, 1449 0x62, 0xf6, 0x6d, 0x08, 0x99, 0x48, 0x20, 1450 1451 // vfnmadd132sh 1452 0x62, 0xf6, 0x6d, 0xb9, 0x9d, 0xcb, 1453 0x62, 0xf6, 0x6d, 0x08, 0x9d, 0x48, 0x20, 1454 1455 // vfmsub132sh 1456 0x62, 0xf6, 0x6d, 0xb9, 0x9b, 0xcb, 1457 0x62, 0xf6, 0x6d, 0x08, 0x9b, 0x48, 0x20, 1458 1459 // vfnmsub132sh 1460 0x62, 
0xf6, 0x6d, 0xb9, 0x9f, 0xcb, 1461 0x62, 0xf6, 0x6d, 0x08, 0x9f, 0x48, 0x20, 1462 1463 // vfcmaddcph 1464 0x62, 0xf6, 0x6f, 0x89, 0x56, 0x48, 0x04, 1465 0x62, 0xf6, 0x6f, 0xa9, 0x56, 0x48, 0x02, 1466 0x62, 0xf6, 0x6f, 0x49, 0x56, 0x48, 0x01, 1467 0x62, 0xf6, 0x6f, 0x39, 0x56, 0xcd, 1468 0x62, 0xf6, 0x6f, 0x99, 0x56, 0x48, 0x10, 1469 0x62, 0xf6, 0x6f, 0xb9, 0x56, 0x48, 0x10, 1470 0x62, 0xf6, 0x6f, 0xd9, 0x56, 0x48, 0x10, 1471 1472 // vfmaddcph 1473 0x62, 0xf6, 0x6e, 0x08, 0x56, 0x48, 0x04, 1474 0x62, 0xf6, 0x6e, 0xb9, 0x56, 0x48, 0x10, 1475 0x62, 0xf6, 0x6e, 0x58, 0x56, 0x48, 0x10, 1476 1477 // vfcmulcph 1478 0x62, 0xf6, 0x6f, 0x08, 0xd6, 0x48, 0x04, 1479 0x62, 0xf6, 0x6f, 0xb9, 0xd6, 0x48, 0x10, 1480 0x62, 0xf6, 0x6f, 0x58, 0xd6, 0x48, 0x10, 1481 1482 // vfmulcph 1483 0x62, 0xf6, 0x6e, 0x08, 0xd6, 0x48, 0x04, 1484 0x62, 0xf6, 0x6e, 0xb9, 0xd6, 0x48, 0x10, 1485 0x62, 0xf6, 0x6e, 0x58, 0xd6, 0x48, 0x10, 1486 1487 // vrcpph 1488 0x62, 0xf6, 0x7d, 0x08, 0x4c, 0x48, 0x04, 1489 0x62, 0xf6, 0x7d, 0x18, 0x4c, 0x48, 0x20, 1490 0x62, 0xf6, 0x7d, 0x28, 0x4c, 0x48, 0x02, 1491 0x62, 0xf6, 0x7d, 0x38, 0x4c, 0x48, 0x20, 1492 0x62, 0xf6, 0x7d, 0x48, 0x4c, 0x48, 0x01, 1493 0x62, 0xf6, 0x7d, 0x58, 0x4c, 0x48, 0x20, 1494 1495 // vrcpsh 1496 0x62, 0xf6, 0x65, 0x08, 0x4d, 0x48, 0x20, 1497 1498 // vrsqrtph 1499 0x62, 0xf6, 0x7d, 0x08, 0x4e, 0x48, 0x04, 1500 0x62, 0xf6, 0x7d, 0x18, 0x4e, 0x48, 0x20, 1501 0x62, 0xf6, 0x7d, 0x28, 0x4e, 0x50, 0x02, 1502 0x62, 0xf6, 0x7d, 0x38, 0x4e, 0x50, 0x20, 1503 0x62, 0xf6, 0x7d, 0x48, 0x4e, 0x50, 0x01, 1504 0x62, 0xf6, 0x7d, 0x58, 0x4e, 0x50, 0x20, 1505 1506 // vrsqrtsh 1507 0x62, 0xf6, 0x45, 0x8d, 0x4f, 0x48, 0x20, 1508 1509 // vsqrtph 1510 0x62, 0xf5, 0x7c, 0x8c, 0x51, 0x48, 0x04, 1511 0x62, 0xf5, 0x7c, 0x9c, 0x51, 0x48, 0x20, 1512 0x62, 0xf5, 0x7c, 0xbc, 0x51, 0x48, 0x20, 1513 0x62, 0xf5, 0x7c, 0xcc, 0x51, 0x48, 0x01, 1514 0x62, 0xf5, 0x7c, 0xdc, 0x51, 0x48, 0x20, 1515 1516 // vsqrtsh 1517 0x62, 0xf5, 0x56, 0x8c, 0x51, 0x48, 0x20, 1518 0x62, 0xf5, 
0x56, 0xbc, 0x51, 0xcf, 1519 1520 // vscalefph 1521 0x62, 0xf6, 0x55, 0x08, 0x2c, 0x48, 0x04, 1522 0x62, 0xf6, 0x55, 0x18, 0x2c, 0x48, 0x20, 1523 0x62, 0xf6, 0x55, 0x28, 0x2c, 0x48, 0x02, 1524 0x62, 0xf6, 0x55, 0x38, 0x2c, 0x48, 0x20, 1525 0x62, 0xf6, 0x55, 0x48, 0x2c, 0x48, 0x01, 1526 0x62, 0xf6, 0x55, 0x58, 0x2c, 0x48, 0x20, 1527 0x62, 0xf6, 0x55, 0xb9, 0x2c, 0xcf, 1528 1529 // vscalefsh 1530 0x62, 0xf6, 0x55, 0x08, 0x2d, 0x48, 0x20, 1531 0x62, 0xf6, 0x55, 0xb9, 0x2d, 0xcf, 1532 1533 // vreduceph 1534 0x62, 0xf3, 0x7c, 0x08, 0x56, 0x48, 0x04, 0x01, 1535 0x62, 0xf3, 0x7c, 0x18, 0x56, 0x48, 0x20, 0x02, 1536 0x62, 0xf3, 0x7c, 0x28, 0x56, 0x48, 0x02, 0x03, 1537 0x62, 0xf3, 0x7c, 0x38, 0x56, 0x48, 0x20, 0x04, 1538 0x62, 0xf3, 0x7c, 0x48, 0x56, 0x48, 0x01, 0x05, 1539 0x62, 0xf3, 0x7c, 0x58, 0x56, 0x48, 0x20, 0x06, 1540 0x62, 0xf3, 0x7c, 0x99, 0x56, 0xcd, 0x07, 1541 1542 // vreducesh 1543 0x62, 0xf3, 0x64, 0x08, 0x57, 0x48, 0x20, 0x01, 1544 0x62, 0xf3, 0x54, 0x99, 0x57, 0xcc, 0x02, 1545 1546 // vrndscaleph 1547 0x62, 0xf3, 0x7c, 0x08, 0x08, 0x48, 0x04, 0x01, 1548 0x62, 0xf3, 0x7c, 0x18, 0x08, 0x48, 0x20, 0x02, 1549 0x62, 0xf3, 0x7c, 0x28, 0x08, 0x48, 0x02, 0x03, 1550 0x62, 0xf3, 0x7c, 0x38, 0x08, 0x48, 0x20, 0x04, 1551 0x62, 0xf3, 0x7c, 0x48, 0x08, 0x48, 0x01, 0x05, 1552 0x62, 0xf3, 0x7c, 0x58, 0x08, 0x48, 0x20, 0x06, 1553 0x62, 0xf3, 0x7c, 0x99, 0x08, 0xcd, 0x07, 1554 1555 // vrndscalesh 1556 0x62, 0xf3, 0x64, 0x08, 0x0a, 0x48, 0x20, 0x01, 1557 0x62, 0xf3, 0x54, 0x99, 0x0a, 0xcc, 0x02, 1558 1559 // vfpclassph 1560 0x62, 0xf3, 0x7c, 0x08, 0x66, 0x48, 0x04, 0x01, 1561 0x62, 0xf3, 0x7c, 0x18, 0x66, 0x48, 0x20, 0x02, 1562 0x62, 0xf3, 0x7c, 0x28, 0x66, 0x48, 0x02, 0x03, 1563 0x62, 0xf3, 0x7c, 0x38, 0x66, 0x48, 0x20, 0x04, 1564 0x62, 0xf3, 0x7c, 0x48, 0x66, 0x48, 0x01, 0x05, 1565 0x62, 0xf3, 0x7c, 0x58, 0x66, 0x48, 0x20, 0x06, 1566 1567 // vfpclasssh 1568 0x62, 0xf3, 0x7c, 0x0a, 0x67, 0xcb, 0x05, 1569 0x62, 0xf3, 0x7c, 0x0a, 0x67, 0x48, 0x20, 0x05, 1570 1571 // vgetexpph 
1572 0x62, 0xf6, 0x7d, 0x08, 0x42, 0x48, 0x04, 1573 0x62, 0xf6, 0x7d, 0x38, 0x42, 0x48, 0x20, 1574 0x62, 0xf6, 0x7d, 0x48, 0x42, 0x48, 0x01, 1575 0x62, 0xf6, 0x7d, 0x99, 0x42, 0xcd, 1576 1577 // vgetexpsh 1578 0x62, 0xf6, 0x55, 0x08, 0x43, 0x48, 0x20, 1579 0x62, 0xf6, 0x65, 0x99, 0x43, 0xcd, 1580 1581 // vgetmantph 1582 0x62, 0xf3, 0x7c, 0x08, 0x26, 0x48, 0x04, 0x01, 1583 0x62, 0xf3, 0x7c, 0x38, 0x26, 0x48, 0x20, 0x02, 1584 0x62, 0xf3, 0x7c, 0x48, 0x26, 0x48, 0x01, 0x03, 1585 0x62, 0xf3, 0x7c, 0x99, 0x26, 0xcd, 0x04, 1586 1587 // vgetmantsh 1588 0x62, 0xf3, 0x54, 0x08, 0x27, 0x48, 0x20, 0x05, 1589 0x62, 0xf3, 0x64, 0x99, 0x27, 0xcd, 0x06, 1590 1591 // vmovsh 1592 0x62, 0xf5, 0x7e, 0x89, 0x10, 0x48, 0x20, 1593 0x62, 0xf5, 0x7e, 0x09, 0x11, 0x48, 0x20, 1594 0x62, 0xf5, 0x66, 0x8a, 0x10, 0xcd, 1595 1596 // vmovw 1597 0x62, 0xd5, 0x7d, 0x08, 0x6e, 0xcd, 1598 0x62, 0xf5, 0x7d, 0x08, 0x6e, 0x58, 0x20, 1599 0x62, 0xd5, 0x7d, 0x08, 0x7e, 0xc9, 1600 0x62, 0xf5, 0x7d, 0x08, 0x7e, 0x78, 0x20, 1601 1602 // vcvtsd2sh 1603 0x62, 0xf5, 0xef, 0xb9, 0x5a, 0xcb, 1604 0x62, 0xf5, 0xef, 0x08, 0x5a, 0x48, 0x08, 1605 1606 // vcvtsh2sd 1607 0x62, 0xf5, 0x6e, 0x99, 0x5a, 0xcb, 1608 0x62, 0xf5, 0x6e, 0x08, 0x5a, 0x48, 0x20, 1609 1610 // vcvtsh2ss 1611 0x62, 0xf6, 0x6c, 0x99, 0x13, 0xcb, 1612 0x62, 0xf6, 0x6c, 0x08, 0x13, 0x48, 0x20, 1613 1614 // vcvtss2sh 1615 0x62, 0xf5, 0x6c, 0xb9, 0x1d, 0xcb, 1616 0x62, 0xf5, 0x6c, 0x08, 0x1d, 0x48, 0x10, 1617 1618 // vcvtsh2si 1619 0x62, 0xf5, 0x7e, 0x38, 0x2d, 0xd1, 1620 0x62, 0xf5, 0x7e, 0x08, 0x2d, 0x50, 0x20, 1621 0x62, 0xf5, 0xfe, 0x38, 0x2d, 0xd1, 1622 0x62, 0x75, 0xfe, 0x08, 0x2d, 0x40, 0x20, 1623 1624 // vcvtph2dq 1625 0x62, 0xf5, 0x7d, 0x08, 0x5b, 0xcd, 1626 0x62, 0xf5, 0x7d, 0x08, 0x5b, 0x48, 0x08, 1627 0x62, 0xf5, 0x7d, 0x18, 0x5b, 0x48, 0x20, 1628 0x62, 0xf5, 0x7d, 0xaa, 0x5b, 0xcd, 1629 0x62, 0xf5, 0x7d, 0x28, 0x5b, 0x48, 0x04, 1630 0x62, 0xf5, 0x7d, 0x38, 0x5b, 0x48, 0x20, 1631 0x62, 0xf5, 0x7d, 0xbd, 0x5b, 0xcb, 1632 0x62, 0xf5, 0x7d, 
0xcd, 0x5b, 0x48, 0x02, 1633 0x62, 0xf5, 0x7d, 0xdd, 0x5b, 0x48, 0x20, 1634 1635 // vcvtph2psx 1636 0x62, 0xf6, 0x7d, 0x08, 0x13, 0xcd, 1637 0x62, 0xf6, 0x7d, 0x08, 0x13, 0x48, 0x08, 1638 0x62, 0xf6, 0x7d, 0x18, 0x13, 0x48, 0x20, 1639 0x62, 0xf6, 0x7d, 0xaa, 0x13, 0xcd, 1640 0x62, 0xf6, 0x7d, 0x28, 0x13, 0x48, 0x04, 1641 0x62, 0xf6, 0x7d, 0x38, 0x13, 0x48, 0x20, 1642 0x62, 0xf6, 0x7d, 0x9d, 0x13, 0xcb, 1643 0x62, 0xf6, 0x7d, 0xcd, 0x13, 0x48, 0x02, 1644 0x62, 0xf6, 0x7d, 0xdd, 0x13, 0x48, 0x20, 1645 1646 // vcvtph2udq 1647 0x62, 0xf5, 0x7c, 0x08, 0x79, 0xcd, 1648 0x62, 0xf5, 0x7c, 0x08, 0x79, 0x48, 0x08, 1649 0x62, 0xf5, 0x7c, 0x18, 0x79, 0x48, 0x20, 1650 0x62, 0xf5, 0x7c, 0xaa, 0x79, 0xcd, 1651 0x62, 0xf5, 0x7c, 0x28, 0x79, 0x48, 0x04, 1652 0x62, 0xf5, 0x7c, 0x38, 0x79, 0x48, 0x20, 1653 0x62, 0xf5, 0x7c, 0xbd, 0x79, 0xcb, 1654 0x62, 0xf5, 0x7c, 0xcd, 0x79, 0x48, 0x02, 1655 0x62, 0xf5, 0x7c, 0xdd, 0x79, 0x48, 0x20, 1656 1657 // vcvttph2dq 1658 0x62, 0xf5, 0x7e, 0x08, 0x5b, 0xcd, 1659 0x62, 0xf5, 0x7e, 0x08, 0x5b, 0x48, 0x08, 1660 0x62, 0xf5, 0x7e, 0x18, 0x5b, 0x48, 0x20, 1661 0x62, 0xf5, 0x7e, 0xaa, 0x5b, 0xcd, 1662 0x62, 0xf5, 0x7e, 0x28, 0x5b, 0x48, 0x04, 1663 0x62, 0xf5, 0x7e, 0x38, 0x5b, 0x48, 0x20, 1664 0x62, 0xf5, 0x7e, 0x9d, 0x5b, 0xcb, 1665 0x62, 0xf5, 0x7e, 0xcd, 0x5b, 0x48, 0x02, 1666 0x62, 0xf5, 0x7e, 0xdd, 0x5b, 0x48, 0x20, 1667 1668 // vcvttph2udq 1669 0x62, 0xf5, 0x7c, 0x08, 0x78, 0xcd, 1670 0x62, 0xf5, 0x7c, 0x08, 0x78, 0x48, 0x08, 1671 0x62, 0xf5, 0x7c, 0x18, 0x78, 0x48, 0x20, 1672 0x62, 0xf5, 0x7c, 0xaa, 0x78, 0xcd, 1673 0x62, 0xf5, 0x7c, 0x28, 0x78, 0x48, 0x04, 1674 0x62, 0xf5, 0x7c, 0x38, 0x78, 0x48, 0x20, 1675 0x62, 0xf5, 0x7c, 0x9d, 0x78, 0xcb, 1676 0x62, 0xf5, 0x7c, 0xcd, 0x78, 0x48, 0x02, 1677 0x62, 0xf5, 0x7c, 0xdd, 0x78, 0x48, 0x20, 1678 1679 // vcvtph2pd 1680 0x62, 0xf5, 0x7c, 0x08, 0x5a, 0xcd, 1681 0x62, 0xf5, 0x7c, 0x08, 0x5a, 0x48, 0x10, 1682 0x62, 0xf5, 0x7c, 0x18, 0x5a, 0x48, 0x20, 1683 0x62, 0xf5, 0x7c, 0xaa, 0x5a, 0xcd, 1684 0x62, 
0xf5, 0x7c, 0x28, 0x5a, 0x48, 0x08, 1685 0x62, 0xf5, 0x7c, 0x38, 0x5a, 0x48, 0x20, 1686 0x62, 0xf5, 0x7c, 0x9d, 0x5a, 0xcb, 1687 0x62, 0xf5, 0x7c, 0xcd, 0x5a, 0x48, 0x04, 1688 0x62, 0xf5, 0x7c, 0xdd, 0x5a, 0x48, 0x20, 1689 1690 // vcvtph2qq 1691 0x62, 0xf5, 0x7d, 0x08, 0x7b, 0xcd, 1692 0x62, 0xf5, 0x7d, 0x08, 0x7b, 0x48, 0x10, 1693 0x62, 0xf5, 0x7d, 0x18, 0x7b, 0x48, 0x20, 1694 0x62, 0xf5, 0x7d, 0xaa, 0x7b, 0xcd, 1695 0x62, 0xf5, 0x7d, 0x28, 0x7b, 0x48, 0x08, 1696 0x62, 0xf5, 0x7d, 0x38, 0x7b, 0x48, 0x20, 1697 0x62, 0xf5, 0x7d, 0xbd, 0x7b, 0xcb, 1698 0x62, 0xf5, 0x7d, 0xcd, 0x7b, 0x48, 0x04, 1699 0x62, 0xf5, 0x7d, 0xdd, 0x7b, 0x48, 0x20, 1700 1701 // vcvtph2uqq 1702 0x62, 0xf5, 0x7d, 0x08, 0x79, 0xcd, 1703 0x62, 0xf5, 0x7d, 0x08, 0x79, 0x48, 0x10, 1704 0x62, 0xf5, 0x7d, 0x18, 0x79, 0x48, 0x20, 1705 0x62, 0xf5, 0x7d, 0xaa, 0x79, 0xcd, 1706 0x62, 0xf5, 0x7d, 0x28, 0x79, 0x48, 0x08, 1707 0x62, 0xf5, 0x7d, 0x38, 0x79, 0x48, 0x20, 1708 0x62, 0xf5, 0x7d, 0xbd, 0x79, 0xcb, 1709 0x62, 0xf5, 0x7d, 0xcd, 0x79, 0x48, 0x04, 1710 0x62, 0xf5, 0x7d, 0xdd, 0x79, 0x48, 0x20, 1711 1712 // vcvttph2uqq 1713 0x62, 0xf5, 0x7d, 0x08, 0x78, 0xcd, 1714 0x62, 0xf5, 0x7d, 0x08, 0x78, 0x48, 0x10, 1715 0x62, 0xf5, 0x7d, 0x18, 0x78, 0x48, 0x20, 1716 0x62, 0xf5, 0x7d, 0xaa, 0x78, 0xcd, 1717 0x62, 0xf5, 0x7d, 0x28, 0x78, 0x48, 0x08, 1718 0x62, 0xf5, 0x7d, 0x38, 0x78, 0x48, 0x20, 1719 0x62, 0xf5, 0x7d, 0x9d, 0x78, 0xcb, 1720 0x62, 0xf5, 0x7d, 0xcd, 0x78, 0x48, 0x04, 1721 0x62, 0xf5, 0x7d, 0xdd, 0x78, 0x48, 0x20, 1722 1723 // vcvtdq2ph 1724 0x62, 0xf5, 0x7c, 0x08, 0x5b, 0xcd, 1725 0x62, 0xf5, 0x7c, 0x08, 0x5b, 0x48, 0x04, 1726 0x62, 0xf5, 0x7c, 0x18, 0x5b, 0x48, 0x10, 1727 0x62, 0xf5, 0x7c, 0x28, 0x5b, 0x48, 0x02, 1728 0x62, 0xf5, 0x7c, 0x38, 0x5b, 0x48, 0x10, 1729 0x62, 0xf5, 0x7c, 0xba, 0x5b, 0xcd, 1730 0x62, 0xf5, 0x7c, 0x48, 0x5b, 0x48, 0x01, 1731 0x62, 0xf5, 0x7c, 0x58, 0x5b, 0x48, 0x10, 1732 1733 // vcvtps2phx 1734 0x62, 0xf5, 0x7d, 0x08, 0x1d, 0xcd, 1735 0x62, 0xf5, 0x7d, 0x08, 0x1d, 0x48, 
0x04, 1736 0x62, 0xf5, 0x7d, 0x18, 0x1d, 0x48, 0x10, 1737 0x62, 0xf5, 0x7d, 0x28, 0x1d, 0x48, 0x02, 1738 0x62, 0xf5, 0x7d, 0x38, 0x1d, 0x48, 0x10, 1739 0x62, 0xf5, 0x7d, 0xba, 0x1d, 0xcd, 1740 0x62, 0xf5, 0x7d, 0x48, 0x1d, 0x48, 0x01, 1741 0x62, 0xf5, 0x7d, 0x58, 0x1d, 0x48, 0x10, 1742 1743 // vcvtudq2ph 1744 0x62, 0xf5, 0x7f, 0x08, 0x7a, 0xcd, 1745 0x62, 0xf5, 0x7f, 0x08, 0x7a, 0x48, 0x04, 1746 0x62, 0xf5, 0x7f, 0x18, 0x7a, 0x48, 0x10, 1747 0x62, 0xf5, 0x7f, 0x28, 0x7a, 0x48, 0x02, 1748 0x62, 0xf5, 0x7f, 0x38, 0x7a, 0x48, 0x10, 1749 0x62, 0xf5, 0x7f, 0xba, 0x7a, 0xcd, 1750 0x62, 0xf5, 0x7f, 0x48, 0x7a, 0x48, 0x01, 1751 0x62, 0xf5, 0x7f, 0x58, 0x7a, 0x48, 0x10, 1752 1753 // vcvtpd2ph 1754 0x62, 0xf5, 0xfd, 0x08, 0x5a, 0xcd, 1755 0x62, 0xf5, 0xfd, 0x28, 0x5a, 0xcd, 1756 0x62, 0xf5, 0xfd, 0xba, 0x5a, 0xcd, 1757 0x62, 0xf5, 0xfd, 0x08, 0x5a, 0x48, 0x04, 1758 0x62, 0xf5, 0xfd, 0x18, 0x5a, 0x48, 0x08, 1759 0x62, 0xf5, 0xfd, 0x28, 0x5a, 0x48, 0x02, 1760 0x62, 0xf5, 0xfd, 0x38, 0x5a, 0x48, 0x08, 1761 0x62, 0xf5, 0xfd, 0x48, 0x5a, 0x48, 0x01, 1762 0x62, 0xf5, 0xfd, 0x58, 0x5a, 0x48, 0x08, 1763 1764 // vcvtqq2ph 1765 0x62, 0xf5, 0xfc, 0x08, 0x5b, 0xcd, 1766 0x62, 0xf5, 0xfc, 0x28, 0x5b, 0xcd, 1767 0x62, 0xf5, 0xfc, 0xba, 0x5b, 0xcd, 1768 0x62, 0xf5, 0xfc, 0x08, 0x5b, 0x48, 0x04, 1769 0x62, 0xf5, 0xfc, 0x18, 0x5b, 0x48, 0x08, 1770 0x62, 0xf5, 0xfc, 0x28, 0x5b, 0x48, 0x02, 1771 0x62, 0xf5, 0xfc, 0x38, 0x5b, 0x48, 0x08, 1772 0x62, 0xf5, 0xfc, 0x48, 0x5b, 0x48, 0x01, 1773 0x62, 0xf5, 0xfc, 0x58, 0x5b, 0x48, 0x08, 1774 1775 // vcvtuqq2ph 1776 0x62, 0xf5, 0xff, 0x08, 0x7a, 0xcd, 1777 0x62, 0xf5, 0xff, 0x28, 0x7a, 0xcd, 1778 0x62, 0xf5, 0xff, 0xba, 0x7a, 0xcd, 1779 0x62, 0xf5, 0xff, 0x08, 0x7a, 0x48, 0x04, 1780 0x62, 0xf5, 0xff, 0x18, 0x7a, 0x48, 0x08, 1781 0x62, 0xf5, 0xff, 0x28, 0x7a, 0x48, 0x02, 1782 0x62, 0xf5, 0xff, 0x38, 0x7a, 0x48, 0x08, 1783 0x62, 0xf5, 0xff, 0x48, 0x7a, 0x48, 0x01, 1784 0x62, 0xf5, 0xff, 0x58, 0x7a, 0x48, 0x08, 1785 1786 // vcvtph2uw 1787 0x62, 0xf5, 
0x7c, 0x08, 0x7d, 0xcd, 1788 0x62, 0xf5, 0x7c, 0x08, 0x7d, 0x48, 0x04, 1789 0x62, 0xf5, 0x7c, 0x18, 0x7d, 0x48, 0x20, 1790 0x62, 0xf5, 0x7c, 0x28, 0x7d, 0x48, 0x02, 1791 0x62, 0xf5, 0x7c, 0x38, 0x7d, 0x48, 0x20, 1792 0x62, 0xf5, 0x7c, 0xba, 0x7d, 0xcd, 1793 0x62, 0xf5, 0x7c, 0x48, 0x7d, 0x48, 0x01, 1794 0x62, 0xf5, 0x7c, 0x58, 0x7d, 0x48, 0x20, 1795 1796 // vcvtph2w 1797 0x62, 0xf5, 0x7d, 0x08, 0x7d, 0xcd, 1798 0x62, 0xf5, 0x7d, 0x08, 0x7d, 0x48, 0x04, 1799 0x62, 0xf5, 0x7d, 0x18, 0x7d, 0x48, 0x20, 1800 0x62, 0xf5, 0x7d, 0x28, 0x7d, 0x48, 0x02, 1801 0x62, 0xf5, 0x7d, 0x38, 0x7d, 0x48, 0x20, 1802 0x62, 0xf5, 0x7d, 0xba, 0x7d, 0xcd, 1803 0x62, 0xf5, 0x7d, 0x48, 0x7d, 0x48, 0x01, 1804 0x62, 0xf5, 0x7d, 0x58, 0x7d, 0x48, 0x20, 1805 1806 // vcvttph2uw 1807 0x62, 0xf5, 0x7c, 0x08, 0x7c, 0xcd, 1808 0x62, 0xf5, 0x7c, 0x08, 0x7c, 0x48, 0x04, 1809 0x62, 0xf5, 0x7c, 0x18, 0x7c, 0x48, 0x20, 1810 0x62, 0xf5, 0x7c, 0x28, 0x7c, 0x48, 0x02, 1811 0x62, 0xf5, 0x7c, 0x38, 0x7c, 0x48, 0x20, 1812 0x62, 0xf5, 0x7c, 0x9a, 0x7c, 0xcd, 1813 0x62, 0xf5, 0x7c, 0x48, 0x7c, 0x48, 0x01, 1814 0x62, 0xf5, 0x7c, 0x58, 0x7c, 0x48, 0x20, 1815 1816 // vcvttph2w 1817 0x62, 0xf5, 0x7d, 0x08, 0x7c, 0xcd, 1818 0x62, 0xf5, 0x7d, 0x08, 0x7c, 0x48, 0x04, 1819 0x62, 0xf5, 0x7d, 0x18, 0x7c, 0x48, 0x20, 1820 0x62, 0xf5, 0x7d, 0x28, 0x7c, 0x48, 0x02, 1821 0x62, 0xf5, 0x7d, 0x38, 0x7c, 0x48, 0x20, 1822 0x62, 0xf5, 0x7d, 0x9a, 0x7c, 0xcd, 1823 0x62, 0xf5, 0x7d, 0x48, 0x7c, 0x48, 0x01, 1824 0x62, 0xf5, 0x7d, 0x58, 0x7c, 0x48, 0x20, 1825 1826 // vcvtuw2ph 1827 0x62, 0xf5, 0x7f, 0x08, 0x7d, 0xcd, 1828 0x62, 0xf5, 0x7f, 0x08, 0x7d, 0x48, 0x04, 1829 0x62, 0xf5, 0x7f, 0x18, 0x7d, 0x48, 0x20, 1830 0x62, 0xf5, 0x7f, 0x28, 0x7d, 0x48, 0x02, 1831 0x62, 0xf5, 0x7f, 0x38, 0x7d, 0x48, 0x20, 1832 0x62, 0xf5, 0x7f, 0xba, 0x7d, 0xcd, 1833 0x62, 0xf5, 0x7f, 0x48, 0x7d, 0x48, 0x01, 1834 0x62, 0xf5, 0x7f, 0x58, 0x7d, 0x48, 0x20, 1835 1836 // vcvtw2ph 1837 0x62, 0xf5, 0x7e, 0x08, 0x7d, 0xcd, 1838 0x62, 0xf5, 0x7e, 0x08, 0x7d, 0x48, 
0x04, 1839 0x62, 0xf5, 0x7e, 0x18, 0x7d, 0x48, 0x20, 1840 0x62, 0xf5, 0x7e, 0x28, 0x7d, 0x48, 0x02, 1841 0x62, 0xf5, 0x7e, 0x38, 0x7d, 0x48, 0x20, 1842 0x62, 0xf5, 0x7e, 0xba, 0x7d, 0xcd, 1843 0x62, 0xf5, 0x7e, 0x48, 0x7d, 0x48, 0x01, 1844 0x62, 0xf5, 0x7e, 0x58, 0x7d, 0x48, 0x20, 1845 1846 // vcvtps2ph 1847 0xc4, 0xe3, 0x79, 0x1d, 0xd1, 0x01, 1848 0xc4, 0xe3, 0x79, 0x1d, 0x50, 0x40, 0x02, 1849 0xc4, 0xe3, 0x7d, 0x1d, 0xd1, 0x03, 1850 0xc4, 0xe3, 0x7d, 0x1d, 0x50, 0x40, 0x04, 1851 0x62, 0xf3, 0x7d, 0x89, 0x1d, 0xd1, 0x05, 1852 0x62, 0xf3, 0x7d, 0x09, 0x1d, 0x58, 0x08, 0x06, 1853 0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0xe1, 0x07, 1854 0x62, 0xf3, 0x7d, 0x2a, 0x1d, 0x68, 0x04, 0x08, 1855 0x62, 0xf3, 0x7d, 0x1a, 0x1d, 0xe9, 0x09, 1856 0x62, 0xf3, 0x7d, 0x4d, 0x1d, 0x60, 0x02, 0x0a, 1857 1858 // vcvtsh2usi 1859 0x62, 0xf5, 0x7e, 0x38, 0x79, 0xc9, 1860 0x62, 0xf5, 0x7e, 0x08, 0x79, 0x40, 0x20, 1861 0x62, 0x75, 0xfe, 0x38, 0x79, 0xc9, 1862 0x62, 0x75, 0xfe, 0x08, 0x79, 0x68, 0x20, 1863 1864 // vcvttsh2si 1865 0x62, 0xf5, 0x7e, 0x18, 0x2c, 0xc9, 1866 0x62, 0xf5, 0x7e, 0x08, 0x2c, 0x40, 0x20, 1867 0x62, 0x75, 0xfe, 0x18, 0x2c, 0xc9, 1868 0x62, 0x75, 0xfe, 0x08, 0x2c, 0x68, 0x20, 1869 1870 // vcvttsh2usi 1871 0x62, 0xf5, 0x7e, 0x18, 0x78, 0xc9, 1872 0x62, 0xf5, 0x7e, 0x08, 0x78, 0x40, 0x20, 1873 0x62, 0x75, 0xfe, 0x18, 0x78, 0xc9, 1874 0x62, 0x75, 0xfe, 0x08, 0x78, 0x68, 0x20, 1875 1876 // vcvttph2qq 1877 0x62, 0xf5, 0x7d, 0x08, 0x7a, 0xcd, 1878 0x62, 0xf5, 0x7d, 0x08, 0x7a, 0x48, 0x10, 1879 0x62, 0xf5, 0x7d, 0x18, 0x7a, 0x48, 0x20, 1880 0x62, 0xf5, 0x7d, 0xaa, 0x7a, 0xcd, 1881 0x62, 0xf5, 0x7d, 0x28, 0x7a, 0x48, 0x08, 1882 0x62, 0xf5, 0x7d, 0x38, 0x7a, 0x48, 0x20, 1883 0x62, 0xf5, 0x7d, 0x9d, 0x7a, 0xcb, 1884 0x62, 0xf5, 0x7d, 0xcd, 0x7a, 0x48, 0x04, 1885 0x62, 0xf5, 0x7d, 0xdd, 0x7a, 0x48, 0x20, 1886 1887 // vcvtsi2sh 1888 0x62, 0xf5, 0x6e, 0x38, 0x2a, 0xc8, 1889 0x62, 0xf5, 0x6e, 0x08, 0x2a, 0x48, 0x10, 1890 0x62, 0xd5, 0xee, 0x38, 0x2a, 0xc9, 1891 0x62, 0xf5, 0xee, 0x08, 0x2a, 
0x48, 0x08, 1892 1893 // vcvtusi2sh 1894 0x62, 0xf5, 0x6e, 0x38, 0x7b, 0xc8, 1895 0x62, 0xf5, 0x6e, 0x08, 0x7b, 0x48, 0x10, 1896 0x62, 0xd5, 0xee, 0x38, 0x7b, 0xc9, 1897 0x62, 0xf5, 0xee, 0x08, 0x7b, 0x48, 0x08, 1898 }; 1899 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 1900 CYBOZU_TEST_EQUAL(c.getSize(), n); 1901 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 1902 } 1903 #endif 1904 1905 CYBOZU_TEST_AUTO(waitpkg) 1906 { 1907 struct Code : Xbyak::CodeGenerator { 1908 Code() 1909 { 1910 tpause(eax); 1911 tpause(ebx); 1912 #ifdef XBYAK32 1913 umonitor(cx); 1914 umonitor(ecx); 1915 #else 1916 umonitor(ecx); 1917 umonitor(rcx); 1918 #endif 1919 umwait(eax); 1920 umwait(ebx); 1921 } 1922 } c; 1923 const uint8_t tbl[] = { 1924 // tpause 1925 0x66, 0x0f, 0xae, 0xf0, 1926 0x66, 0x0f, 0xae, 0xf3, 1927 // umonitor 1928 0x67, 0xf3, 0x0f, 0xae, 0xf1, 1929 0xf3, 0x0f, 0xae, 0xf1, 1930 // tpause 1931 0xf2, 0x0f, 0xae, 0xf0, 1932 0xf2, 0x0f, 0xae, 0xf3, 1933 }; 1934 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 1935 CYBOZU_TEST_EQUAL(c.getSize(), n); 1936 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 1937 } 1938 1939 CYBOZU_TEST_AUTO(misc) 1940 { 1941 struct Code : Xbyak::CodeGenerator { 1942 Code() 1943 { 1944 cldemote(ptr[eax+esi*4+0x12]); 1945 movdiri(ptr[edx+esi*2+4], eax); 1946 movdir64b(eax, ptr[edx]); 1947 #ifdef XBYAK64 1948 cldemote(ptr[rax+rdi*8+0x123]); 1949 movdiri(ptr[rax+r12], r9); 1950 movdiri(ptr[rax+r12*2+4], r9d); 1951 movdir64b(r10, ptr[r8]); 1952 clui(); 1953 senduipi(rax); 1954 senduipi(r10); 1955 stui(); 1956 testui(); 1957 uiret(); 1958 #endif 1959 } 1960 } c; 1961 const uint8_t tbl[] = { 1962 #ifdef XBYAK64 1963 0x67, 1964 #endif 1965 0x0f, 0x1c, 0x44, 0xb0, 0x12, // cldemote 1966 #ifdef XBYAK64 1967 0x67, 1968 #endif 1969 0x0f, 0x38, 0xf9, 0x44, 0x72, 0x04, // movdiri 1970 1971 0x66, 1972 #ifdef XBYAK64 1973 0x67, 1974 #endif 1975 0x0f, 0x38, 0xf8, 0x02, // movdir64b 1976 #ifdef XBYAK64 1977 0x0f, 0x1c, 0x84, 0xf8, 0x23, 0x01, 0x00, 0x00, // cldemote 
1978 0x4e, 0x0f, 0x38, 0xf9, 0x0c, 0x20, // movdiri 1979 0x46, 0x0f, 0x38, 0xf9, 0x4c, 0x60, 0x04, // movdiri 1980 0x66, 0x45, 0x0f, 0x38, 0xf8, 0x10, // movdir64b 1981 0xf3, 0x0f, 0x01, 0xee, // clui 1982 0xf3, 0x0f, 0xc7, 0xf0, // senduipi rax 1983 0xf3, 0x41, 0x0f, 0xc7, 0xf2, // senduipi r10 1984 0xf3, 0x0f, 0x01, 0xef, // stui 1985 0xf3, 0x0f, 0x01, 0xed, // testui 1986 0xf3, 0x0f, 0x01, 0xec, // uiret 1987 #endif 1988 }; 1989 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 1990 CYBOZU_TEST_EQUAL(c.getSize(), n); 1991 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 1992 } 1993 /* cpu: Cpu::has() called with the combined mask tINTEL | tAMD must return the same result as has(tINTEL) && has(tAMD) */ 1994 CYBOZU_TEST_AUTO(cpu) 1995 { 1996 // https://github.com/herumi/xbyak/issues/148 1997 using namespace Xbyak::util; 1998 Cpu cpu; 1999 CYBOZU_TEST_EQUAL(cpu.has(Cpu::tINTEL) && cpu.has(Cpu::tAMD), cpu.has(Cpu::tINTEL | Cpu::tAMD)); 2000 } 2001 /* minmax: local::min_ and local::max_ must give the same results as std::min and std::max for the arguments (3, 4) */ 2002 CYBOZU_TEST_AUTO(minmax) 2003 { 2004 using namespace Xbyak::util; 2005 CYBOZU_TEST_EQUAL((std::min)(3, 4), local::min_(3, 4)); 2006 CYBOZU_TEST_EQUAL((std::max)(3, 4), local::max_(3, 4)); 2007 } 2008 /* rao_int: generates aadd, aand, aor and axor with a memory destination and checks the emitted size and bytes against tbl; the XBYAK64 build additionally covers ptr[eax] addressing and the r10 source register */ 2009 CYBOZU_TEST_AUTO(rao_int) 2010 { 2011 struct Code : Xbyak::CodeGenerator { 2012 Code() 2013 { 2014 #ifdef XBYAK64 2015 aadd(ptr[rax], ecx); 2016 aadd(ptr[eax], ecx); 2017 aadd(ptr[rax], r10); 2018 aand(ptr[rax], ecx); 2019 aand(ptr[eax], ecx); 2020 aand(ptr[rax], r10); 2021 aor(ptr[rax], ecx); 2022 aor(ptr[eax], ecx); 2023 aor(ptr[rax], r10); 2024 axor(ptr[rax], ecx); 2025 axor(ptr[eax], ecx); 2026 axor(ptr[rax], r10); 2027 #else 2028 aadd(ptr[eax], ecx); 2029 aand(ptr[eax], ecx); 2030 aor(ptr[eax], ecx); 2031 axor(ptr[eax], ecx); 2032 #endif 2033 } 2034 } c; 2035 const uint8_t tbl[] = { 2036 #ifdef XBYAK64 2037 // aadd 2038 0x0f, 0x38, 0xfc, 0x08, 2039 0x67, 0x0f, 0x38, 0xfc, 0x08, 2040 0x4c, 0x0f, 0x38, 0xfc, 0x10, 2041 2042 // aand 2043 0x66, 0x0f, 0x38, 0xfc, 0x08, 2044 0x66, 0x67, 0x0f, 0x38, 0xfc, 0x08, 2045 0x66, 0x4c, 0x0f, 0x38, 0xfc, 0x10, 2046 2047 // aor 2048 0xf2, 0x0f, 0x38, 0xfc, 0x08, 2049 0xf2, 0x67, 0x0f, 0x38, 0xfc, 0x08,
2050 0xf2, 0x4c, 0x0f, 0x38, 0xfc, 0x10, 2051 2052 // axor 2053 0xf3, 0x0f, 0x38, 0xfc, 0x08, 2054 0xf3, 0x67, 0x0f, 0x38, 0xfc, 0x08, 2055 0xf3, 0x4c, 0x0f, 0x38, 0xfc, 0x10, 2056 #else 2057 // aadd 2058 0x0f, 0x38, 0xfc, 0x08, 2059 // aand 2060 0x66, 0x0f, 0x38, 0xfc, 0x08, 2061 // aor 2062 0xf2, 0x0f, 0x38, 0xfc, 0x08, 2063 // axor 2064 0xf3, 0x0f, 0x38, 0xfc, 0x08, 2065 #endif 2066 }; 2067 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 2068 CYBOZU_TEST_EQUAL(c.getSize(), n); 2069 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 2070 } 2071 /* CMPccXADD (XBYAK64 only): generates the 16 cmp..xadd instructions on ptr[rax+r10*4], first with 32-bit registers (ecx, edx) and then with 64-bit registers (rcx, rdx), and checks the emitted size and bytes against tbl */ 2072 #ifdef XBYAK64 2073 CYBOZU_TEST_AUTO(CMPccXADD) 2074 { 2075 struct Code : Xbyak::CodeGenerator { 2076 Code() 2077 { 2078 // 32bit reg 2079 cmpbexadd(ptr[rax+r10*4], ecx, edx); 2080 cmpbxadd(ptr[rax+r10*4], ecx, edx); 2081 cmplexadd(ptr[rax+r10*4], ecx, edx); 2082 cmplxadd(ptr[rax+r10*4], ecx, edx); 2083 cmpnbexadd(ptr[rax+r10*4], ecx, edx); 2084 cmpnbxadd(ptr[rax+r10*4], ecx, edx); 2085 cmpnlexadd(ptr[rax+r10*4], ecx, edx); 2086 cmpnlxadd(ptr[rax+r10*4], ecx, edx); 2087 cmpnoxadd(ptr[rax+r10*4], ecx, edx); 2088 cmpnpxadd(ptr[rax+r10*4], ecx, edx); 2089 cmpnsxadd(ptr[rax+r10*4], ecx, edx); 2090 cmpnzxadd(ptr[rax+r10*4], ecx, edx); 2091 cmpoxadd(ptr[rax+r10*4], ecx, edx); 2092 cmppxadd(ptr[rax+r10*4], ecx, edx); 2093 cmpsxadd(ptr[rax+r10*4], ecx, edx); 2094 cmpzxadd(ptr[rax+r10*4], ecx, edx); 2095 // 64bit reg 2096 cmpbexadd(ptr[rax+r10*4], rcx, rdx); 2097 cmpbxadd(ptr[rax+r10*4], rcx, rdx); 2098 cmplexadd(ptr[rax+r10*4], rcx, rdx); 2099 cmplxadd(ptr[rax+r10*4], rcx, rdx); 2100 cmpnbexadd(ptr[rax+r10*4], rcx, rdx); 2101 cmpnbxadd(ptr[rax+r10*4], rcx, rdx); 2102 cmpnlexadd(ptr[rax+r10*4], rcx, rdx); 2103 cmpnlxadd(ptr[rax+r10*4], rcx, rdx); 2104 cmpnoxadd(ptr[rax+r10*4], rcx, rdx); 2105 cmpnpxadd(ptr[rax+r10*4], rcx, rdx); 2106 cmpnsxadd(ptr[rax+r10*4], rcx, rdx); 2107 cmpnzxadd(ptr[rax+r10*4], rcx, rdx); 2108 cmpoxadd(ptr[rax+r10*4], rcx, rdx); 2109 cmppxadd(ptr[rax+r10*4], rcx, rdx); 2110 cmpsxadd(ptr[rax+r10*4], rcx, rdx);
2111 cmpzxadd(ptr[rax+r10*4], rcx, rdx); 2112 } 2113 } c; 2114 const uint8_t tbl[] = { 2115 // 32bit reg 2116 0xc4, 0xa2, 0x69, 0xe6, 0x0c, 0x90, 2117 0xc4, 0xa2, 0x69, 0xe2, 0x0c, 0x90, 2118 0xc4, 0xa2, 0x69, 0xee, 0x0c, 0x90, 2119 0xc4, 0xa2, 0x69, 0xec, 0x0c, 0x90, 2120 0xc4, 0xa2, 0x69, 0xe7, 0x0c, 0x90, 2121 0xc4, 0xa2, 0x69, 0xe3, 0x0c, 0x90, 2122 0xc4, 0xa2, 0x69, 0xef, 0x0c, 0x90, 2123 0xc4, 0xa2, 0x69, 0xed, 0x0c, 0x90, 2124 0xc4, 0xa2, 0x69, 0xe1, 0x0c, 0x90, 2125 0xc4, 0xa2, 0x69, 0xeb, 0x0c, 0x90, 2126 0xc4, 0xa2, 0x69, 0xe9, 0x0c, 0x90, 2127 0xc4, 0xa2, 0x69, 0xe5, 0x0c, 0x90, 2128 0xc4, 0xa2, 0x69, 0xe0, 0x0c, 0x90, 2129 0xc4, 0xa2, 0x69, 0xea, 0x0c, 0x90, 2130 0xc4, 0xa2, 0x69, 0xe8, 0x0c, 0x90, 2131 0xc4, 0xa2, 0x69, 0xe4, 0x0c, 0x90, 2132 // 64bit reg 2133 0xc4, 0xa2, 0xe9, 0xe6, 0x0c, 0x90, 2134 0xc4, 0xa2, 0xe9, 0xe2, 0x0c, 0x90, 2135 0xc4, 0xa2, 0xe9, 0xee, 0x0c, 0x90, 2136 0xc4, 0xa2, 0xe9, 0xec, 0x0c, 0x90, 2137 0xc4, 0xa2, 0xe9, 0xe7, 0x0c, 0x90, 2138 0xc4, 0xa2, 0xe9, 0xe3, 0x0c, 0x90, 2139 0xc4, 0xa2, 0xe9, 0xef, 0x0c, 0x90, 2140 0xc4, 0xa2, 0xe9, 0xed, 0x0c, 0x90, 2141 0xc4, 0xa2, 0xe9, 0xe1, 0x0c, 0x90, 2142 0xc4, 0xa2, 0xe9, 0xeb, 0x0c, 0x90, 2143 0xc4, 0xa2, 0xe9, 0xe9, 0x0c, 0x90, 2144 0xc4, 0xa2, 0xe9, 0xe5, 0x0c, 0x90, 2145 0xc4, 0xa2, 0xe9, 0xe0, 0x0c, 0x90, 2146 0xc4, 0xa2, 0xe9, 0xea, 0x0c, 0x90, 2147 0xc4, 0xa2, 0xe9, 0xe8, 0x0c, 0x90, 2148 0xc4, 0xa2, 0xe9, 0xe4, 0x0c, 0x90, 2149 }; 2150 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 2151 CYBOZU_TEST_EQUAL(c.getSize(), n); 2152 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 2153 } 2154 /* prefetchiti: generates prefetchit0 and prefetchit1 on ptr[rax] and checks the emitted size and bytes against tbl */ 2155 CYBOZU_TEST_AUTO(prefetchiti) 2156 { 2157 struct Code : Xbyak::CodeGenerator { 2158 Code() 2159 { 2160 prefetchit0(ptr[rax]); 2161 prefetchit1(ptr[rax]); 2162 } 2163 } c; 2164 const uint8_t tbl[] = { 2165 0x0f, 0x18, 0x38, 2166 0x0f, 0x18, 0x30 2167 }; 2168 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 2169 CYBOZU_TEST_EQUAL(c.getSize(), n); 2170 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n);
2171 } 2172 /* crypto: generates vsha512msg1, vsha512msg2 and vsha512rnds2 with register operands, and vsm3msg1, vsm3msg2, vsm3rnds2, vsm4key4 and vsm4rnds4 with both register and memory last operands, then checks the emitted size and bytes against tbl */ 2173 CYBOZU_TEST_AUTO(crypto) 2174 { 2175 struct Code : Xbyak::CodeGenerator { 2176 Code() 2177 { 2178 vsha512msg1(ymm3, xmm5); 2179 vsha512msg2(ymm9, ymm10); 2180 vsha512rnds2(ymm1, ymm3, xmm2); 2181 2182 vsm3msg1(xmm1, xmm2, xmm3); 2183 vsm3msg1(xmm1, xmm2, ptr [rax]); 2184 vsm3msg2(xmm5, xmm7, xmm3); 2185 vsm3msg2(xmm5, xmm6, ptr [rax]); 2186 vsm3rnds2(xmm5, xmm7, xmm3, 0x12); 2187 vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34); 2188 2189 vsm4key4(xmm1, xmm2, xmm3); 2190 vsm4key4(xmm1, xmm2, ptr [rdx]); 2191 vsm4rnds4(xmm1, xmm2, xmm3); 2192 vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]); 2193 } 2194 } c; 2195 const uint8_t tbl[] = { 2196 // sha512 2197 0xc4, 0xe2, 0x7f, 0xcc, 0xdd, 2198 0xc4, 0x42, 0x7f, 0xcd, 0xca, 2199 0xc4, 0xe2, 0x67, 0xcb, 0xca, 2200 2201 // sm3 2202 0xC4, 0xE2, 0x68, 0xDA, 0xCB, 2203 0xC4, 0xE2, 0x68, 0xDA, 0x08, 2204 0xC4, 0xE2, 0x41, 0xDA, 0xEB, 2205 0xC4, 0xE2, 0x49, 0xDA, 0x28, 2206 0xC4, 0xE3, 0x41, 0xDE, 0xEB, 0x12, 2207 0xC4, 0xE3, 0x41, 0xDE, 0x29, 0x34, 2208 2209 // sm4 2210 0xc4, 0xe2, 0x6a, 0xda, 0xcb, 2211 0xc4, 0xe2, 0x6a, 0xda, 0x0a, 2212 0xc4, 0xe2, 0x6b, 0xda, 0xcb, 2213 0xc4, 0xe2, 0x4b, 0xda, 0x2c, 0x81, 2214 }; 2215 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 2216 CYBOZU_TEST_EQUAL(c.getSize(), n); 2217 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 2218 } 2219 /* avx_vnni_int: generates the vpdpb{ss,su,uu}d[s] and vpdpw{su,us,uu}d[s] instructions, each once with xmm registers and once with ymm registers and a ptr [rax] operand, and checks the emitted size and bytes against tbl */ 2220 CYBOZU_TEST_AUTO(avx_vnni_int) 2221 { 2222 struct Code : Xbyak::CodeGenerator { 2223 Code() 2224 { 2225 vpdpbssd(xmm1, xmm2, xmm3); 2226 vpdpbssd(ymm1, ymm2, ptr [rax]); 2227 vpdpbssds(xmm1, xmm2, xmm3); 2228 vpdpbssds(ymm1, ymm2, ptr [rax]); 2229 vpdpbsud(xmm1, xmm2, xmm3); 2230 vpdpbsud(ymm1, ymm2, ptr [rax]); 2231 vpdpbsuds(xmm1, xmm2, xmm3); 2232 vpdpbsuds(ymm1, ymm2, ptr [rax]); 2233 vpdpbuud(xmm1, xmm2, xmm3); 2234 vpdpbuud(ymm1, ymm2, ptr [rax]); 2235 vpdpbuuds(xmm1, xmm2, xmm3); 2236 vpdpbuuds(ymm1, ymm2, ptr [rax]); 2237 2238 vpdpwsud(xmm1, xmm2, xmm3); 2239 vpdpwsud(ymm1, ymm2, ptr [rax]); 2240 vpdpwsuds(xmm1, xmm2, xmm3); 2241 vpdpwsuds(ymm1, ymm2,
ptr [rax]); 2242 vpdpwusd(xmm1, xmm2, xmm3); 2243 vpdpwusd(ymm1, ymm2, ptr [rax]); 2244 vpdpwusds(xmm1, xmm2, xmm3); 2245 vpdpwusds(ymm1, ymm2, ptr [rax]); 2246 vpdpwuud(xmm1, xmm2, xmm3); 2247 vpdpwuud(ymm1, ymm2, ptr [rax]); 2248 vpdpwuuds(xmm1, xmm2, xmm3); 2249 vpdpwuuds(ymm1, ymm2, ptr [rax]); 2250 } 2251 } c; 2252 const uint8_t tbl[] = { 2253 0xc4, 0xe2, 0x6b, 0x50, 0xcb, 2254 0xc4, 0xe2, 0x6f, 0x50, 0x08, 2255 0xc4, 0xe2, 0x6b, 0x51, 0xcb, 2256 0xc4, 0xe2, 0x6f, 0x51, 0x08, 2257 0xc4, 0xe2, 0x6a, 0x50, 0xcb, 2258 0xc4, 0xe2, 0x6e, 0x50, 0x08, 2259 0xc4, 0xe2, 0x6a, 0x51, 0xcb, 2260 0xc4, 0xe2, 0x6e, 0x51, 0x08, 2261 0xc4, 0xe2, 0x68, 0x50, 0xcb, 2262 0xc4, 0xe2, 0x6c, 0x50, 0x08, 2263 0xc4, 0xe2, 0x68, 0x51, 0xcb, 2264 0xc4, 0xe2, 0x6c, 0x51, 0x08, 2265 0xc4, 0xe2, 0x6a, 0xd2, 0xcb, 2266 0xc4, 0xe2, 0x6e, 0xd2, 0x08, 2267 0xc4, 0xe2, 0x6a, 0xd3, 0xcb, 2268 0xc4, 0xe2, 0x6e, 0xd3, 0x08, 2269 0xc4, 0xe2, 0x69, 0xd2, 0xcb, 2270 0xc4, 0xe2, 0x6d, 0xd2, 0x08, 2271 0xc4, 0xe2, 0x69, 0xd3, 0xcb, 2272 0xc4, 0xe2, 0x6d, 0xd3, 0x08, 2273 0xc4, 0xe2, 0x68, 0xd2, 0xcb, 2274 0xc4, 0xe2, 0x6c, 0xd2, 0x08, 2275 0xc4, 0xe2, 0x68, 0xd3, 0xcb, 2276 0xc4, 0xe2, 0x6c, 0xd3, 0x08, 2277 }; 2278 const size_t n = sizeof(tbl) / sizeof(tbl[0]); 2279 CYBOZU_TEST_EQUAL(c.getSize(), n); 2280 CYBOZU_TEST_EQUAL_ARRAY(c.getCode(), tbl, n); 2281 } 2282 2283 2284 #endif // XBYAK64