make_512.cpp
1 #include <stdio.h> 2 #include "xbyak/xbyak.h" 3 #include <stdlib.h> 4 #include <string.h> 5 #include "cybozu/inttype.hpp" 6 #define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0])) 7 8 using namespace Xbyak; 9 10 const int bitEnd = 64; 11 12 const uint64_t YMM_SAE = 1ULL << 0; 13 const uint64_t _XMM = 1ULL << 1; 14 const uint64_t _MEM = 1ULL << 2; 15 const uint64_t _REG32 = 1ULL << 3; 16 const uint64_t EAX = 1ULL << 4; 17 const uint64_t IMM32 = 1ULL << 5; 18 const uint64_t IMM8 = 1ULL << 6; 19 const uint64_t _REG8 = 1ULL << 7; 20 const uint64_t _REG16 = 1ULL << 8; 21 const uint64_t XMM_K = 1ULL << 9; 22 const uint64_t YMM_K = 1ULL << 10; 23 const uint64_t ZMM_K = 1ULL << 11; 24 const uint64_t AX = 1ULL << 12; 25 const uint64_t AL = 1ULL << 13; 26 const uint64_t IMM_1 = 1ULL << 14; 27 const uint64_t MEM8 = 1ULL << 15; 28 const uint64_t MEM16 = 1ULL << 16; 29 const uint64_t MEM32 = 1ULL << 17; 30 const uint64_t VM32Z = 1ULL << 19; 31 const uint64_t K_K = 1ULL << 20; 32 const uint64_t MEM_ONLY_DISP = 1ULL << 21; 33 const uint64_t VM32X_K = 1ULL << 23; 34 const uint64_t _YMM = 1ULL << 24; 35 const uint64_t VM32X_32 = 1ULL << 39; 36 const uint64_t VM32X_64 = 1ULL << 40; 37 const uint64_t VM32Y_32 = 1ULL << 41; 38 const uint64_t VM32Y_64 = 1ULL << 42; 39 const uint64_t VM32Z_K = 1ULL << 32; 40 #ifdef XBYAK64 41 const uint64_t _MEMe = 1ULL << 25; 42 const uint64_t REG32_2 = 1ULL << 26; // r8d, ... 43 const uint64_t REG16_2 = 1ULL << 27; // r8w, ... 44 const uint64_t REG8_2 = 1ULL << 28; // r8b, ... 45 const uint64_t REG8_3 = 1ULL << 29; // spl, ... 46 const uint64_t _REG64 = 1ULL << 30; // rax, ... 47 const uint64_t _REG64_2 = 1ULL << 31; // r8, ... 48 const uint64_t _XMM2 = 1ULL << 33; 49 const uint64_t _YMM2 = 1ULL << 34; 50 const uint64_t VM32X = VM32X_32 | VM32X_64; 51 const uint64_t VM32Y = VM32Y_32 | VM32Y_64; 52 #else 53 const uint64_t _MEMe = 0; 54 const uint64_t REG32_2 = 0; 55 const uint64_t REG16_2 = 0; 56 const uint64_t REG8_2 = 0; 57 const uint64_t REG8_3 = 0; 58 const uint64_t _REG64 = 0; 59 const uint64_t _REG64_2 = 0; 60 const uint64_t _XMM2 = 0; 61 const uint64_t _YMM2 = 0; 62 const uint64_t VM32X = VM32X_32; 63 const uint64_t VM32Y = VM32Y_32; 64 #endif 65 const uint64_t REG64 = _REG64 | _REG64_2; 66 const uint64_t REG32 = _REG32 | REG32_2 | EAX; 67 const uint64_t REG16 = _REG16 | REG16_2 | AX; 68 const uint64_t REG32e = REG32 | REG64; 69 const uint64_t REG8 = _REG8 | REG8_2|AL; 70 const uint64_t MEM = _MEM | _MEMe; 71 const uint64_t MEM64 = 1ULL << 35; 72 const uint64_t YMM_ER = 1ULL << 36; 73 const uint64_t VM32Y_K = 1ULL << 37; 74 const uint64_t IMM_2 = 1ULL << 38; 75 const uint64_t IMM = IMM_1 | IMM_2; 76 const uint64_t YMM = _YMM | _YMM2; 77 const uint64_t K = 1ULL << 43; 78 const uint64_t _ZMM = 1ULL << 44; 79 const uint64_t _ZMM2 = 1ULL << 45; 80 #ifdef XBYAK64 81 const uint64_t ZMM = _ZMM | _ZMM2; 82 const uint64_t _YMM3 = 1ULL << 46; 83 #else 84 const uint64_t ZMM = _ZMM; 85 const uint64_t _YMM3 = 0; 86 #endif 87 const uint64_t K2 = 1ULL << 47; 88 const uint64_t ZMM_SAE = 1ULL << 48; 89 const uint64_t ZMM_ER = 1ULL << 49; 90 #ifdef XBYAK64 91 const uint64_t _XMM3 = 1ULL << 50; 92 #else 93 const uint64_t _XMM3 = 0; 94 #endif 95 const uint64_t XMM = _XMM | _XMM2 | _XMM3; 96 const uint64_t XMM_SAE = 1ULL << 51; 97 #ifdef XBYAK64 98 const uint64_t XMM_KZ = 1ULL << 52; 99 const uint64_t YMM_KZ = 1ULL << 53; 100 const uint64_t ZMM_KZ = 1ULL << 54; 101 #else 102 const uint64_t XMM_KZ = 0; 103 const uint64_t YMM_KZ = 0; 104 const uint64_t ZMM_KZ = 0; 105 #endif 106 const uint64_t MEM_K = 1ULL << 55; 107 const uint64_t M_1to2 = 1ULL << 56; 108 const uint64_t M_1to4 = 1ULL << 57; 109 const uint64_t M_1to8 = 1ULL << 58; 110 const uint64_t M_1to16 = 1ULL << 59; 111 const uint64_t XMM_ER = 1ULL << 60; 112 const uint64_t M_xword = 1ULL << 61; 113 const uint64_t M_yword = 1ULL << 62; 114 const uint64_t MY_1to4 = 1ULL << 18; 115 116 const uint64_t NOPARA = 1ULL << (bitEnd - 1); 117 118 class Test { 119 Test(const Test&); 120 void operator=(const Test&); 121 const bool isXbyak_; 122 int funcNum_; 123 // check all op1, op2, op3 124 void put(const std::string& nm, uint64_t op1 = NOPARA, uint64_t op2 = NOPARA, uint64_t op3 = NOPARA, uint64_t op4 = NOPARA) const 125 { 126 for (int i = 0; i < bitEnd; i++) { 127 if ((op1 & (1ULL << i)) == 0) continue; 128 for (int j = 0; j < bitEnd; j++) { 129 if ((op2 & (1ULL << j)) == 0) continue; 130 for (int k = 0; k < bitEnd; k++) { 131 if ((op3 & (1ULL << k)) == 0) continue; 132 for (int s = 0; s < bitEnd; s++) { 133 if ((op4 & (1ULL << s)) == 0) continue; 134 printf("%s ", nm.c_str()); 135 if (isXbyak_) printf("("); 136 if (!(op1 & NOPARA)) printf("%s", get(1ULL << i)); 137 if (!(op2 & NOPARA)) printf(", %s", get(1ULL << j)); 138 if (!(op3 & NOPARA)) printf(", %s", get(1ULL << k)); 139 if (!(op4 & NOPARA)) printf(", %s", get(1ULL << s)); 140 if (isXbyak_) printf("); dump();"); 141 printf("\n"); 142 } 143 } 144 } 145 } 146 } 147 void put(const char *nm, uint64_t op, const char *xbyak, const char *nasm) const 148 { 149 for (int i = 0; i < bitEnd; i++) { 150 if ((op & (1ULL << i)) == 0) continue; 151 printf("%s ", nm); 152 if (isXbyak_) printf("("); 153 if (!(op & NOPARA)) printf("%s", get(1ULL << i)); 154 printf(", %s", isXbyak_ ? xbyak : nasm); 155 if (isXbyak_) printf("); dump();"); 156 printf("\n"); 157 } 158 } 159 void put(const char *nm, const char *xbyak, const char *nasm = 0, uint64_t op = NOPARA) const 160 { 161 if (nasm == 0) nasm = xbyak; 162 for (int i = 0; i < bitEnd; i++) { 163 if ((op & (1ULL << i)) == 0) continue; 164 printf("%s ", nm); 165 if (isXbyak_) printf("("); 166 printf("%s ", isXbyak_ ? xbyak : nasm); 167 if (!(op & NOPARA)) printf(", %s", get(1ULL << i)); 168 if (isXbyak_) printf("); dump();"); 169 printf("\n"); 170 } 171 } 172 const char *get(uint64_t type) const 173 { 174 int idx = (rand() / 31) & 7; 175 switch (type) { 176 case _XMM: 177 { 178 static const char tbl[][6] = { 179 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", 180 }; 181 return tbl[idx]; 182 } 183 case _YMM: 184 { 185 static const char tbl[][6] = { 186 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7" 187 }; 188 return tbl[idx]; 189 } 190 case _ZMM: 191 { 192 static const char tbl[][6] = { 193 "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7" 194 }; 195 return tbl[idx]; 196 } 197 #ifdef XBYAK64 198 case _XMM2: 199 { 200 static const char tbl[][6] = { 201 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" 202 }; 203 return tbl[idx]; 204 } 205 case _XMM3: 206 { 207 static const char tbl[][6] = { 208 "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23" 209 }; 210 return tbl[idx]; 211 } 212 case _YMM2: 213 { 214 static const char tbl[][6] = { 215 "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", 216 }; 217 return tbl[idx]; 218 } 219 case _YMM3: 220 { 221 static const char tbl[][6] = { 222 "ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23", 223 }; 224 return tbl[idx]; 225 } 226 case _ZMM2: 227 { 228 static const char tbl[][6] = { 229 "zmm8", "zmm9", "zmm10", "zmm11", "zmm28", "zmm29", "zmm30", "zmm31", 230 }; 231 return tbl[idx]; 232 } 233 #endif 234 case _MEM: 235 return isXbyak_ ? "ptr[eax+ecx+64]" : "[eax+ecx+64]"; // QQQ 236 // return isXbyak_ ? "ptr[eax+ecx+6]" : "[eax+ecx+6]"; 237 case _MEMe: 238 { 239 static int ccc = 1; 240 #ifdef USE_YASM 241 ccc++; 242 #endif 243 if (ccc & 1) { 244 return isXbyak_ ? "ptr[rdx+r15+0x12]" : "[rdx+r15+0x12]"; 245 } else { 246 return isXbyak_ ? "ptr[rip - 0x13456+1-3]" : "[rip - 0x13456+1-3]"; 247 } 248 } 249 case MEM8: 250 return "byte [eax+edx]"; 251 case MEM16: 252 return "word [esi]"; 253 case MEM32: 254 return "dword [eax+64]"; 255 case MEM64: 256 return "qword [rax+64]"; 257 case MEM_ONLY_DISP: 258 return isXbyak_ ? "ptr[(void*)0x123]" : "[0x123]"; 259 case _REG16: // not ax 260 { 261 static const char Reg16Tbl[][4] = { 262 "ax", "cx", "dx", "bx", "sp", "bp", "si", "di" 263 }; 264 return Reg16Tbl[(idx % 7) + 1]; 265 } 266 case _REG8: // not al 267 { 268 static const char Reg8Tbl[][4] = { 269 #ifdef XBYAK64 // QQQ 270 "al", "cl", "dl", "bl", "al", "cl", "dl", "bl" 271 #else 272 "al", "cl", "dl", "bl", "ah", "ch", "dh", "bh" 273 #endif 274 }; 275 return Reg8Tbl[(idx % 7) + 1]; 276 } 277 case _REG32: // not eax 278 { 279 static const char Reg32Tbl[][4] = { 280 "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi" 281 }; 282 return Reg32Tbl[(idx % 7) + 1]; 283 } 284 #ifdef XBYAK64 285 case _REG64: // not rax 286 { 287 static const char Reg64Tbl[][4] = { 288 "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi" 289 }; 290 return Reg64Tbl[(idx % 7) + 1]; 291 } 292 case _REG64_2: 293 { 294 static const char Reg64_2Tbl[][4] = { 295 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 296 }; 297 return Reg64_2Tbl[idx]; 298 } 299 case REG32_2: 300 { 301 static const char Reg32eTbl[][5] = { 302 "r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d" 303 }; 304 return Reg32eTbl[idx]; 305 } 306 case REG16_2: 307 { 308 static const char Reg16eTbl[][5] = { 309 "r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w" 310 }; 311 return Reg16eTbl[idx]; 312 } 313 case REG8_2: 314 { 315 static const char Reg8_2Tbl[][5] = { 316 "r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b" 317 }; 318 return Reg8_2Tbl[idx]; 319 } 320 case REG8_3: 321 { 322 static const char Reg8_3Tbl[][5] = { 323 "spl", "bpl", "sil", "dil", "spl", "bpl", "sil", "dil" 324 }; 325 return Reg8_3Tbl[idx]; 326 } 327 #endif 328 case EAX: 329 return "eax"; 330 case AX: 331 return "ax"; 332 case AL: 333 return "al"; 334 case K_K: 335 return isXbyak_ ? "k5 | k3" : "k5{k3}"; 336 case IMM32: 337 return isXbyak_ ? "12345678" : "dword 12345678"; 338 case IMM8: 339 return isXbyak_ ? "4" : "byte 4"; 340 case IMM_1: 341 return "4"; 342 case IMM_2: 343 return isXbyak_ ? "0xda" : "0xda"; 344 case VM32X_32: 345 return isXbyak_ ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]"; 346 case VM32X_64: 347 return isXbyak_ ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]"; 348 case VM32Y_32: 349 return isXbyak_ ? "ptr [ymm4]" : "[ymm4]"; 350 case VM32Y_64: 351 return isXbyak_ ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]"; 352 case VM32X_K: 353 return isXbyak_ ? "ptr [64+xmm13*2+r13] | k6" : "[64+xmm13*2+r13]{k6}"; 354 case VM32Y_K: 355 return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}"; 356 case VM32Z_K: 357 if (idx & 1) return isXbyak_ ? "ptr [64+zmm10*8+r9] | k6" : "[64+zmm10*8+r9]{k6}"; 358 return isXbyak_ ? "ptr [64+zmm30*2+r13] | k6" : "[64+zmm30*2+r13]{k6}"; 359 case VM32Z: 360 return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]"; 361 case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}"; 362 case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}"; 363 case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}"; 364 case M_1to16: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to16}"; 365 366 case M_xword: return isXbyak_ ? "ptr [eax+32]" : "oword [eax+32]"; 367 case M_yword: return isXbyak_ ? "yword [eax+32]" : "yword [eax+32]"; 368 case MY_1to4: return isXbyak_ ? "yword_b [eax+32]" : "[eax+32]{1to4}"; 369 case K: 370 { 371 static const char kTbl[][5] = { 372 "k1", "k2", "k3", "k4", "k5", "k6", "k7", 373 }; 374 return kTbl[idx % 7]; 375 } 376 case K2: 377 return isXbyak_ ? "k3 | k5" : "k3{k5}"; 378 #ifdef XBYAK64 379 case XMM_SAE: 380 return isXbyak_ ? "xmm25 | T_sae" : "xmm25, {sae}"; 381 case YMM_SAE: 382 return isXbyak_ ? "ymm25 | T_sae" : "ymm25, {sae}"; 383 case ZMM_SAE: 384 return isXbyak_ ? "zmm25 | T_sae" : "zmm25, {sae}"; 385 case XMM_ER: 386 return isXbyak_ ? "xmm4 | T_rd_sae" : "xmm4, {rd-sae}"; 387 case YMM_ER: 388 return isXbyak_ ? "ymm20 | T_rd_sae" : "ymm20, {rd-sae}"; 389 case ZMM_ER: 390 return isXbyak_ ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}"; 391 case XMM_KZ: 392 return isXbyak_ ? "xmm5 | k5" : "xmm5{k5}"; 393 case YMM_KZ: 394 return isXbyak_ ? "ymm2 |k3|T_z" : "ymm2{k3}{z}"; 395 case ZMM_KZ: 396 return isXbyak_ ? "zmm7|k1" : "zmm7{k1}"; 397 case MEM_K: 398 return isXbyak_ ? "ptr [rax] | k1" : "[rax]{k1}"; 399 #else 400 case XMM_SAE: 401 return isXbyak_ ? "xmm5 | T_sae" : "xmm5, {sae}"; 402 case YMM_SAE: 403 return isXbyak_ ? "ymm5 | T_sae" : "ymm5, {sae}"; 404 case ZMM_SAE: 405 return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}"; 406 case XMM_ER: 407 return isXbyak_ ? "xmm30 | T_rd_sae" : "xmm30, {rd-sae}"; 408 case YMM_ER: 409 return isXbyak_ ? "ymm2 | T_rd_sae" : "ymm2, {rd-sae}"; 410 case ZMM_ER: 411 return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}"; 412 case MEM_K: 413 return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}"; 414 #endif 415 case XMM_K: 416 return isXbyak_ ? "xmm5 | k7" : "xmm5{k7}"; 417 case YMM_K: 418 return isXbyak_ ? "ymm5 | k4" : "ymm5{k4}"; 419 case ZMM_K: 420 return isXbyak_ ? "zmm5 | k3" : "zmm5{k3}"; 421 } 422 return 0; 423 } 424 public: 425 Test(bool isXbyak) 426 : isXbyak_(isXbyak) 427 , funcNum_(1) 428 { 429 if (!isXbyak_) return; 430 printf("%s", 431 " void gen0()\n" 432 " {\n"); 433 } 434 /* 435 gcc and vc give up to compile this source, 436 so I split functions. 437 */ 438 void separateFunc() 439 { 440 if (!isXbyak_) return; 441 printf( 442 " }\n" 443 " void gen%d()\n" 444 " {\n", funcNum_++); 445 } 446 ~Test() 447 { 448 if (!isXbyak_) return; 449 printf("%s", 450 " }\n" 451 " void gen()\n" 452 " {\n"); 453 for (int i = 0; i < funcNum_; i++) { 454 printf( 455 " gen%d();\n", i); 456 } 457 printf( 458 " }\n"); 459 } 460 void put() 461 { 462 putAVX512(); 463 } 464 void putOpmask() 465 { 466 { 467 const char *tbl[] = { 468 "kadd", 469 "kand", 470 "kandn", 471 "kor", 472 "kxnor", 473 "kxor", 474 }; 475 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 476 std::string name = tbl[i]; 477 put(name + "b", K, K, K); 478 put(name + "w", K, K, K); 479 put(name + "q", K, K, K); 480 put(name + "d", K, K, K); 481 } 482 put("kunpckbw", K, K, K); 483 put("kunpckwd", K, K, K); 484 put("kunpckdq", K, K, K); 485 } 486 { 487 const char *tbl[] = { 488 "knot", 489 "kortest", 490 "ktest", 491 }; 492 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 493 std::string name = tbl[i]; 494 put(name + "b", K, K); 495 put(name + "w", K, K); 496 put(name + "q", K, K); 497 put(name + "d", K, K); 498 } 499 } 500 { 501 const char *tbl[] = { 502 "kshiftl", 503 "kshiftr", 504 }; 505 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 506 std::string name = tbl[i]; 507 put(name + "b", K, K, IMM8); 508 put(name + "w", K, K, IMM8); 509 put(name + "q", K, K, IMM8); 510 put(name + "d", K, K, IMM8); 511 } 512 } 513 put("kmovw", K, K | MEM | REG32); 514 put("kmovq", K, K | MEM); 515 put("kmovb", K, K | MEM | REG32); 516 put("kmovd", K, K | MEM | REG32); 517 518 put("kmovw", MEM | REG32, K); 519 put("kmovq", MEM, K); 520 put("kmovb", MEM | REG32, K); 521 put("kmovd", MEM | REG32, K); 522 #ifdef XBYAK64 523 put("kmovq", K, REG64); 524 put("kmovq", REG64, K); 525 #endif 526 } 527 void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0) 528 { 529 std::string modifier; 530 char pk[16] = ""; 531 const char *pz = ""; 532 const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" }; 533 const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" }; 534 if (isXbyak_) { 535 if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx); 536 if (z) pz = "|T_z"; 537 printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]); 538 } else { 539 if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx); 540 if (z && kIdx) pz = "{z}"; 541 printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]); 542 } 543 } 544 void putCombi() 545 { 546 const char *xTbl[] = { 547 "xmm2", 548 #ifdef XBYAK64 549 "xmm8", "xmm31" 550 #else 551 "xmm5", "xmm6" 552 #endif 553 }; 554 const char *yTbl[] = { 555 "ymm0", 556 #ifdef XBYAK64 557 "ymm15", "ymm31" 558 #else 559 "ymm4", "ymm2" 560 #endif 561 }; 562 const char *zTbl[] = { 563 "zmm1", 564 #ifdef XBYAK64 565 "zmm9", "zmm30" 566 #else 567 "zmm3", "zmm7" 568 #endif 569 }; 570 const size_t N = NUM_OF_ARRAY(zTbl); 571 for (size_t i = 0; i < N; i++) { 572 for (size_t j = 0; j < N; j++) { 573 separateFunc(); 574 for (size_t k = 0; k < N; k++) { 575 #ifdef XBYAK64 576 for (int kIdx = 0; kIdx < 8; kIdx++) { 577 put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx); 578 put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx); 579 for (int z = 0; z < 2; z++) { 580 for (int sae = 0; sae < 5; sae++) { 581 put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae); 582 } 583 } 584 } 585 #else 586 put_vaddpd(xTbl[i], xTbl[j], xTbl[k]); 587 put_vaddpd(yTbl[i], yTbl[j], yTbl[k]); 588 for (int sae = 0; sae < 5; sae++) { 589 put_vaddpd(zTbl[i], zTbl[j], zTbl[k], sae); 590 } 591 #endif 592 } 593 } 594 } 595 put("vaddpd", XMM, XMM, _MEM); 596 put("vaddpd", YMM, YMM, _MEM); 597 put("vaddpd", ZMM, ZMM, _MEM); 598 } 599 void putCmpK() 600 { 601 { 602 const struct Tbl { 603 const char *name; 604 bool supportYMM; 605 } tbl[] = { 606 { "vcmppd", true }, 607 { "vcmpps", true }, 608 { "vcmpsd", false }, 609 { "vcmpss", false }, 610 }; 611 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 612 const Tbl *p = &tbl[i]; 613 put(p->name, K, XMM, _XMM | MEM, IMM8); 614 if (!p->supportYMM) continue; 615 put(p->name, K, _YMM, _YMM | MEM, IMM8); 616 put(p->name, K, _ZMM, _ZMM | MEM, IMM8); 617 } 618 put("vcmppd", K, XMM, M_1to2, IMM8); 619 put("vcmppd", K, YMM, M_1to4, IMM8); 620 put("vcmppd", K, ZMM, M_1to8, IMM8); 621 622 put("vcmpps", K, XMM, M_1to4, IMM8); 623 put("vcmpps", K, YMM, M_1to8, IMM8); 624 put("vcmpps", K, ZMM, M_1to16, IMM8); 625 } 626 put("vcmppd", K2, ZMM, ZMM_SAE, IMM); 627 #ifdef XBYAK64 628 { 629 const struct Tbl { 630 const char *name; 631 } tbl[] = { 632 { "vcomisd" }, 633 { "vcomiss" }, 634 { "vucomisd" }, 635 { "vucomiss" }, 636 }; 637 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 638 const Tbl *p = &tbl[i]; 639 put(p->name, XMM, XMM_SAE | XMM | MEM); 640 } 641 } 642 put("vcomiss", XMM, _XMM3 | MEM); 643 put("vcomiss", XMM, XMM_SAE); 644 #endif 645 } 646 void putBroadcastSub(int idx, int disp) 647 { 648 #ifdef XBYAK64 649 const char *a = "rax"; 650 #else 651 const char *a = "eax"; 652 #endif 653 if (isXbyak_) { 654 printf("vaddpd(zmm%d, zmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp); 655 printf("vaddpd(ymm%d, ymm1, ptr_b[%s+%d]);dump();\n", idx, a, disp); 656 printf("vaddpd(xmm%d, xmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp); 657 } else { 658 printf("vaddpd zmm%d, zmm1, [%s+%d]{1to8}\n", idx, a, disp); 659 printf("vaddpd ymm%d, ymm1, [%s+%d]{1to4}\n", idx, a, disp); 660 printf("vaddpd xmm%d, xmm1, [%s+%d]{1to2}\n", idx, a, disp); 661 } 662 } 663 void putBroadcast() 664 { 665 for (int i = 0; i < 9; i++) { 666 putBroadcastSub(0, i); 667 #ifdef XBYAK64 668 putBroadcastSub(10, i); 669 putBroadcastSub(20, i); 670 #endif 671 } 672 put("vpbroadcastb", XMM_KZ | ZMM_KZ, REG8 | _MEM); 673 put("vpbroadcastw", XMM_KZ | ZMM_KZ, REG16 | _MEM); 674 put("vpbroadcastd", XMM_KZ | ZMM_KZ, REG32 | _MEM); 675 #ifdef XBYAK64 676 put("vpbroadcastq", XMM_KZ | ZMM_KZ, REG64 | _MEM); 677 #endif 678 { 679 const char *tbl[] = { 680 "vpbroadcastb", 681 "vpbroadcastw", 682 "vpbroadcastd", 683 "vpbroadcastq", 684 }; 685 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 686 put(tbl[i], XMM_KZ | ZMM_KZ, XMM | _MEM); 687 } 688 } 689 put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, XMM | _MEM); 690 put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM); 691 put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM); 692 put("vbroadcasti32x8", ZMM_KZ, _MEM); 693 put("vbroadcasti64x4", ZMM_KZ, _MEM); 694 } 695 void putMisc1() 696 { 697 put("vmaskmovps", _XMM, _XMM, MEM); 698 put("vmaskmovps", YMM, YMM, MEM); 699 700 put("vmaskmovpd", YMM, YMM, MEM); 701 put("vmaskmovpd", _XMM, _XMM, MEM); 702 703 put("vmaskmovps", MEM, _XMM, _XMM); 704 put("vmaskmovpd", MEM, _XMM, _XMM); 705 706 put("vbroadcastf128", YMM, MEM); 707 put("vbroadcasti128", YMM, MEM); 708 put("vbroadcastsd", YMM|_YMM3, XMM|MEM); 709 put("vbroadcastsd", ZMM, XMM|MEM); 710 { 711 const char *tbl[] = { 712 "vbroadcastss", 713 "vpbroadcastb", 714 "vpbroadcastw", 715 "vpbroadcastd", 716 "vpbroadcastq", 717 }; 718 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 719 put(tbl[i], XMM | YMM | ZMM, XMM|MEM); 720 } 721 } 722 723 put("vinsertf128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8); 724 put("vinserti128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8); 725 put("vperm2f128", YMM, YMM, YMM | MEM, IMM8); 726 put("vperm2i128", YMM, YMM, YMM | MEM, IMM8); 727 728 { 729 const char *tbl[] = { 730 "vpmaskmovd", "vpmaskmovq" 731 }; 732 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 733 const char *name = tbl[i]; 734 put(name, _XMM, _XMM, MEM); 735 put(name, YMM, YMM, MEM); 736 put(name, MEM, _XMM, _XMM); 737 put(name, MEM, YMM, YMM); 738 } 739 } 740 { 741 const char *tbl[] = { 742 "vpermd", "vpermps", 743 }; 744 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 745 const char *name = tbl[i]; 746 put(name, YMM, YMM, YMM | MEM); 747 } 748 } 749 { 750 const char *tbl[] = { 751 "vpermq", "vpermpd", 752 }; 753 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 754 const char *name = tbl[i]; 755 put(name, YMM, YMM | MEM, IMM8); 756 } 757 } 758 put("vpextrw", REG32e | MEM, XMM, IMM); // nasm is ok, yasm generate redundant code 759 } 760 void putAVX512_M_X() 761 { 762 const char *tbl[] = { 763 "vmovapd", 764 "vmovaps", 765 "vmovupd", 766 "vmovups", 767 }; 768 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 769 const char *name = tbl[i]; 770 put(name, MEM, ZMM); 771 put(name, ZMM, MEM); 772 #ifdef XBYAK64 773 put(name, MEM, XMM); 774 put(name, XMM, MEM); 775 #endif 776 } 777 } 778 void put_vmov() 779 { 780 #ifdef XBYAK64 781 put("vmovd", XMM, MEM|REG32); 782 put("vmovd", MEM|REG32, XMM); 783 put("vmovq", XMM, MEM|REG64|XMM); 784 put("vmovq", MEM|REG64|XMM, XMM); 785 put("vmovhlps", XMM, _XMM3, _XMM3); 786 put("vmovlhps", XMM, _XMM3, _XMM3); 787 put("vmovntdqa", XMM|_YMM3|ZMM, MEM); 788 put("vmovntdq", MEM, XMM | _YMM3 | ZMM); 789 put("vmovntpd", MEM, XMM | _YMM3 | ZMM); 790 put("vmovntps", MEM, XMM | _YMM3 | ZMM); 791 792 put("vmovsd", XMM_KZ, XMM, _XMM3); 793 put("vmovsd", XMM_KZ, MEM); 794 put("vmovsd", MEM_K, XMM); 795 put("vmovss", XMM_KZ, XMM, _XMM3); 796 put("vmovss", XMM_KZ, MEM); 797 put("vmovss", MEM_K, XMM); 798 799 put("vmovshdup", _ZMM, _ZMM); 800 put("vmovsldup", _ZMM, _ZMM); 801 802 803 { 804 const char *tbl[] = { 805 "valignd", 806 "valignq", 807 }; 808 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 809 const char *name = tbl[i]; 810 put(name, XMM_KZ, XMM, _XMM | MEM, IMM); 811 put(name, _YMM3, _YMM3, _YMM3 | _MEM, IMM); 812 put(name, _ZMM, _ZMM, _ZMM | _MEM, IMM); 813 } 814 } 815 { 816 const char tbl[][16] = { 817 "vmovhpd", 818 "vmovhps", 819 "vmovlpd", 820 "vmovlps", 821 }; 822 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 823 put(tbl[i], XMM, _XMM3, MEM); 824 put(tbl[i], MEM, _XMM3); 825 } 826 } 827 #endif 828 } 829 void put512_X_XM() 830 { 831 const struct Tbl { 832 const char *name; 833 bool M_X; 834 } tbl[] = { 835 { "vmovddup", false }, 836 { "vmovdqa32", true }, 837 { "vmovdqa64", true }, 838 { "vmovdqu8", true }, 839 { "vmovdqu16", true }, 840 { "vmovdqu32", true }, 841 { "vmovdqu64", true }, 842 { "vpabsb", false }, 843 { "vpabsw", false }, 844 { "vpabsd", false }, 845 { "vpabsq", false }, 846 }; 847 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 848 const Tbl& p = tbl[i]; 849 put(p.name, XMM|XMM_KZ, _XMM|MEM); 850 put(p.name, _YMM|YMM_KZ, _YMM|MEM); 851 put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM); 852 if (!p.M_X) continue; 853 put(p.name, MEM|MEM_K, XMM); 854 put(p.name, MEM|MEM_K, _YMM); 855 put(p.name, MEM|MEM_K, _ZMM); 856 } 857 put("vsqrtpd", XMM_KZ, M_1to2 | _MEM); 858 put("vsqrtpd", YMM_KZ, M_1to4 | _MEM); 859 put("vsqrtpd", ZMM_KZ, M_1to8 | _MEM); 860 put("vsqrtpd", ZMM_KZ, ZMM_ER); 861 862 put("vsqrtps", XMM_KZ, M_1to4 | _MEM); 863 put("vsqrtps", YMM_KZ, M_1to8 | _MEM); 864 put("vsqrtps", ZMM_KZ, M_1to16 | _MEM); 865 put("vsqrtps", ZMM_KZ, ZMM_ER); 866 867 put("vpabsd", ZMM_KZ, M_1to16 | _MEM); 868 put("vpabsq", ZMM_KZ, M_1to8 | _MEM); 869 870 put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, XMM | _MEM); 871 put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM); 872 873 put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM); 874 put("vbroadcastf64x4", ZMM_KZ, _MEM); 875 put("vbroadcastf32x8", ZMM_KZ, _MEM); 876 } 877 void put512_X_X_XM() 878 { 879 const struct Tbl { 880 const char *name; 881 uint64_t mem; 882 } tbl[] = { 883 { "vsqrtsd", MEM }, 884 { "vsqrtss", MEM }, 885 { "vunpckhpd", M_1to2 }, 886 { "vunpckhps", M_1to4 }, 887 { "vunpcklpd", M_1to2 }, 888 { "vunpcklps", M_1to4 }, 889 }; 890 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 891 const Tbl& p = tbl[i]; 892 put(p.name, XMM_KZ, XMM, _XMM|p.mem); 893 } 894 } 895 void put512_X3() 896 { 897 #ifdef XBYAK64 898 const struct Tbl { 899 const char *name; 900 uint64_t x1; 901 uint64_t x2; 902 uint64_t xm; 903 } tbl[] = { 904 { "vpacksswb", XMM_KZ, XMM, _XMM | _MEM }, 905 { "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM }, 906 { "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 907 908 { "vpackssdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, 909 { "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, 910 { "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, 911 912 { "vpackusdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, 913 { "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, 914 { "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, 915 916 { "vpackuswb", XMM_KZ, XMM, _XMM | _MEM }, 917 { "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM }, 918 { "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 919 920 { "vpaddb", XMM_KZ, XMM, _XMM | _MEM }, 921 { "vpaddw", XMM_KZ, _XMM, _XMM | _MEM }, 922 { "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, 923 { "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 924 925 { "vpaddsb", XMM_KZ, XMM, _XMM | _MEM }, 926 { "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 927 928 { "vpaddsw", XMM_KZ, XMM, _XMM | _MEM }, 929 { "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 930 931 { "vpaddusb", XMM_KZ, XMM, _XMM | MEM }, 932 { "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM }, 933 934 { "vpaddusw", XMM_KZ, XMM, _XMM | MEM }, 935 { "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM }, 936 937 { "vpsubb", XMM_KZ, XMM, _XMM | _MEM }, 938 { "vpsubw", XMM_KZ, XMM, _XMM | _MEM }, 939 { "vpsubd", XMM_KZ, XMM, _XMM | M_1to4 | _MEM }, 940 { "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 941 942 { "vpsubsb", XMM_KZ, XMM, _XMM | _MEM }, 943 { "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 944 945 { "vpsubsw", XMM_KZ, XMM, _XMM | _MEM }, 946 { "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 947 948 { "vpsubusb", XMM_KZ, XMM, _XMM | MEM }, 949 { "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM }, 950 951 { "vpsubusw", XMM_KZ, XMM, _XMM | MEM }, 952 { "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM }, 953 954 { "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, 955 { "vpandq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM }, 956 957 { "vpandnd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, 958 { "vpandnq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM }, 959 960 { "vpavgb", ZMM_KZ, _ZMM, _ZMM }, 961 { "vpavgw", ZMM_KZ, _ZMM, _ZMM }, 962 963 { "vpcmpeqb", K2, _ZMM, _ZMM | _MEM }, 964 { "vpcmpeqw", K2, _ZMM, _ZMM | _MEM }, 965 { "vpcmpeqd", K2, _ZMM, _ZMM | M_1to16 | _MEM }, 966 { "vpcmpeqq", K2, _ZMM, _ZMM | M_1to8 | _MEM }, 967 968 { "vpcmpgtb", K2, _ZMM, _ZMM | _MEM }, 969 { "vpcmpgtw", K2, _ZMM, _ZMM | _MEM }, 970 { "vpcmpgtd", K2, _ZMM, _ZMM | M_1to16 | _MEM }, 971 { "vpcmpgtq", K2, _ZMM, _ZMM | M_1to8 | _MEM }, 972 973 { "vpmaddubsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 974 { "vpmaddwd", ZMM_KZ, _ZMM, _ZMM | _MEM }, 975 976 { "vpmaxsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 977 { "vpmaxsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 978 { "vpmaxsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, 979 { "vpmaxsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, 980 981 { "vpmaxub", ZMM_KZ, _ZMM, _ZMM | _MEM }, 982 { "vpmaxuw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 983 { "vpmaxud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, 984 { "vpmaxuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, 985 986 { "vpminsb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 987 { "vpminsw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 988 { "vpminsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, 989 { "vpminsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, 990 991 { "vpminub", ZMM_KZ, _ZMM, _ZMM | _MEM }, 992 { "vpminuw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 993 { "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 }, 994 { "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 }, 995 996 { "vpslldq", XMM, _XMM3 | _MEM, IMM8 }, 997 { "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 }, 998 { "vpslldq", _ZMM, _ZMM | _MEM, IMM8 }, 999 1000 { "vpsrldq", XMM, _XMM3 | _MEM, IMM8 }, 1001 { "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 }, 1002 { "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 }, 1003 1004 { "vpsraw", XMM_KZ, XMM | _MEM, IMM8 }, 1005 { "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 }, 1006 1007 { "vpsrad", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 }, 1008 { "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, 1009 1010 { "vpsraq", XMM, XMM, IMM8 }, 1011 { "vpsraq", XMM_KZ, XMM | M_1to2 | _MEM, IMM8 }, 1012 { "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 }, 1013 1014 { "vpsllw", XMM, _XMM3 | _MEM, IMM8 }, 1015 { "vpslld", XMM, _XMM3 | _MEM | M_1to4, IMM8 }, 1016 { "vpsllq", XMM, _XMM3 | _MEM | M_1to2, IMM8 }, 1017 1018 { "vpsrlw", XMM_KZ, XMM | _MEM, IMM8 }, 1019 { "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 }, 1020 1021 { "vpsrld", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 }, 1022 { "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, 1023 1024 { "vpsrlq", XMM, _XMM3 | _MEM | M_1to2, IMM8 }, 1025 { "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 }, 1026 1027 { "vpsravw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1028 { "vpsravw", _ZMM, _ZMM, _MEM }, 1029 1030 { "vpsravd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1031 { "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM }, 1032 1033 { "vpsravq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1034 { "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM }, 1035 1036 { "vpsllvw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1037 { "vpsllvw", _ZMM, _ZMM, _MEM }, 1038 1039 { "vpsllvd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1040 { "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM }, 1041 1042 { "vpsllvq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1043 { "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM }, 1044 1045 { "vpsrlvw", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1046 { "vpsrlvw", _ZMM, _ZMM, _MEM }, 1047 1048 { "vpsrlvd", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1049 { "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM }, 1050 1051 { "vpsrlvq", XMM_KZ | XMM, _XMM, _XMM | _MEM }, 1052 { "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM }, 1053 1054 { "vpshufb", XMM | XMM_KZ, _XMM, _XMM | _MEM }, 1055 { "vpshufb", ZMM_KZ, _ZMM, _MEM }, 1056 1057 { "vpshufhw", XMM | XMM_KZ, _XMM | _MEM, IMM8 }, 1058 { "vpshufhw", ZMM_KZ, _MEM, IMM8 }, 1059 1060 { "vpshuflw", XMM | XMM_KZ, _XMM | _MEM, IMM8 }, 1061 { "vpshuflw", ZMM_KZ, _MEM, IMM8 }, 1062 1063 { "vpshufd", XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 }, 1064 { "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 }, 1065 1066 { "vpord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, 1067 { "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, 1068 1069 { "vporq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, 1070 { "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1071 1072 { "vpxord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM }, 1073 { "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM }, 1074 1075 { "vpxorq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, 1076 { "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1077 1078 { "vpsadbw", XMM, _XMM, _XMM | _MEM }, 1079 { "vpsadbw", _ZMM, _ZMM, _MEM }, 1080 1081 { "vpmuldq", XMM, _XMM, _XMM | M_1to2 | _MEM }, 1082 { "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1083 1084 { "vpmulhrsw", XMM, _XMM, _XMM | _MEM }, 1085 { "vpmulhrsw", ZMM_KZ, _ZMM, _MEM }, 1086 1087 { "vpmulhuw", XMM, _XMM, _XMM | _MEM }, 1088 { "vpmulhuw", ZMM_KZ, _ZMM, _MEM }, 1089 1090 { "vpmulhw", XMM, _XMM, _XMM | _MEM }, 1091 { "vpmulhw", ZMM_KZ, _ZMM, _MEM }, 1092 1093 { "vpmullw", XMM, _XMM, _XMM | _MEM }, 1094 { "vpmullw", ZMM_KZ, _ZMM, _MEM }, 1095 1096 { "vpmulld", XMM, _XMM, M_1to4 | _MEM }, 1097 { "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM }, 1098 1099 { "vpmullq", XMM, _XMM, M_1to2 | _MEM }, 1100 { "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1101 1102 { "vpmuludq", XMM, _XMM, M_1to2 | _MEM }, 1103 { "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1104 1105 { "vpunpckhbw", XMM, _XMM, _XMM | _MEM }, 1106 { "vpunpckhbw", _ZMM, _ZMM, _MEM }, 1107 1108 { "vpunpckhwd", XMM, _XMM, _XMM | _MEM }, 1109 { "vpunpckhwd", _ZMM, _ZMM, _MEM }, 1110 1111 { "vpunpckhdq", XMM, _XMM, M_1to4 | _MEM }, 1112 { "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM }, 1113 1114 { "vpunpckhqdq", XMM, _XMM, M_1to2 | _MEM }, 1115 { "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM }, 1116 1117 { "vpunpcklbw", XMM, _XMM, _XMM | _MEM }, 1118 { "vpunpcklbw", _ZMM, _ZMM, _MEM }, 1119 1120 { "vpunpcklwd", XMM, _XMM, _XMM | _MEM }, 1121 { "vpunpcklwd", _ZMM, _ZMM, _MEM }, 1122 1123 { "vpunpckldq", XMM, _XMM, M_1to4 | _MEM }, 1124 { "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM }, 1125 1126 { "vpunpcklqdq", XMM, _XMM, M_1to2 | _MEM }, 1127 { "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM }, 1128 1129 { "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, 1130 { "vextractf64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, 1131 { "vextractf32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, 1132 { "vextractf64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, 1133 1134 { "vextracti32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, 1135 { "vextracti64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 }, 1136 { "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, 1137 { "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 }, 1138 1139 { "vextractps", REG32 | _MEM, XMM, IMM8 }, 1140 1141 { "vpermb", XMM_KZ, _XMM, _XMM | _MEM }, 1142 { "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM }, 1143 1144 { "vpermw", XMM_KZ, _XMM, _XMM | _MEM }, 1145 { "vpermw", ZMM_KZ, _ZMM, _ZMM | _MEM }, 1146 1147 { "vpermd", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM }, 1148 { "vpermd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM }, 1149 1150 { "vpermilpd", XMM_KZ, _XMM, _XMM | M_1to2 | _MEM }, 1151 { "vpermilpd", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1152 { "vpermilpd", XMM_KZ, M_1to2 | _MEM, IMM8 }, 1153 { "vpermilpd", ZMM_KZ, M_1to8 | _MEM, IMM8 }, 1154 1155 { "vpermilps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4 }, 1156 { "vpermilps", ZMM_KZ, _ZMM, _MEM | M_1to16 }, 1157 { "vpermilps", XMM_KZ, _MEM | M_1to4 | _MEM, IMM8 }, 1158 { "vpermilps", ZMM_KZ, _MEM | M_1to16 | _MEM, IMM8 }, 1159 1160 { "vpermpd", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 }, 1161 { "vpermpd", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 }, 1162 { "vpermpd", YMM_KZ, _YMM, M_1to4 | _MEM }, 1163 { "vpermpd", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1164 1165 { "vpermps", YMM_KZ, _YMM, M_1to8 | _MEM }, 1166 { "vpermps", ZMM_KZ, _ZMM, M_1to16 | _MEM }, 1167 1168 { "vpermq", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 }, 1169 { "vpermq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 }, 1170 { "vpermq", YMM_KZ, _YMM, M_1to4 | _MEM }, 1171 { "vpermq", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1172 }; 1173 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1174 const Tbl& p = tbl[i]; 1175 put(p.name, p.x1, p.x2, p.xm); 1176 } 1177 #endif 1178 } 1179 void put512_X3_I() 1180 { 1181 const struct Tbl { 1182 const char *name; 1183 uint64_t x1; 1184 uint64_t x2; 1185 uint64_t xm; 1186 } tbl[] = { 1187 #ifdef XBYAK64 1188 { "vinsertps", XMM, _XMM, _XMM3 | _MEM }, 1189 1190 { "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM }, 1191 { "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM }, 1192 1193 { "vshufps", XMM_KZ, _XMM, M_1to4 | _MEM }, 1194 { "vshufps", ZMM_KZ, _ZMM, M_1to16 | _MEM }, 1195 1196 { "vinsertf32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM }, 1197 { "vinsertf32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM }, 1198 1199 { "vinsertf64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM }, 1200 { "vinsertf64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM }, 1201 1202 { "vinsertf32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM }, 1203 { "vinsertf64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM }, 1204 1205 { "vinserti32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM }, 1206 { "vinserti32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM }, 1207 1208 { "vinserti64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM }, 1209 { "vinserti64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM }, 1210 1211 { "vinserti32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM }, 1212 { "vinserti64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM }, 1213 #endif 1214 { "vpalignr", ZMM_KZ, _ZMM, _ZMM | _MEM }, 1215 }; 1216 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1217 const Tbl& p = tbl[i]; 1218 put(p.name, p.x1, p.x2, p.xm, IMM8); 1219 } 1220 #ifdef XBYAK64 1221 put("vpextrb", _REG64 | _MEM, XMM, IMM8); 1222 put("vpextrw", _REG64 | _MEM, XMM, IMM8); 1223 put("vpextrd", _REG32 | _MEM, XMM, IMM8); 1224 put("vpextrq", _REG64 | _MEM, XMM, IMM8); 1225 put("vpinsrb", XMM, _XMM3, _REG32 | _MEM, IMM8); 1226 put("vpinsrw", XMM, _XMM3, _REG32 | _MEM, IMM8); 1227 put("vpinsrd", XMM, _XMM3, _REG32 | _MEM, IMM8); 1228 put("vpinsrq", XMM, _XMM3, _REG64 | _MEM, IMM8); 1229 #endif 1230 } 1231 void put512_FMA() 1232 { 1233 const struct Tbl { 1234 const char *name; 1235 bool supportYMM; 1236 } tbl[] = { 1237 { "vfmadd", true }, 1238 { "vfmadd", false }, 1239 { "vfmaddsub", true }, 1240 { "vfmsubadd", true }, 1241 { "vfmsub", true }, 1242 { "vfmsub", false }, 1243 { "vfnmadd", true }, 1244 { "vfnmadd", false }, 1245 { "vfnmsub", true }, 1246 { "vfnmsub", false }, 1247 }; 1248 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1249 const Tbl& p = tbl[i]; 1250 const struct Ord { 1251 const char *name; 1252 } ord[] = { 1253 { "132" }, 1254 { "213" }, 1255 { "231" }, 1256 }; 1257 for (size_t j = 0; j < NUM_OF_ARRAY(ord); j++) { 1258 const char sufTbl[][2][8] = { 1259 { "pd", "ps" }, 1260 { "sd", "ss" }, 1261 }; 1262 for (size_t k = 0; k < 2; k++) { 1263 const std::string suf = sufTbl[p.supportYMM ? 0 : 1][k]; 1264 uint64_t mem = 0; 1265 if (suf == "pd") { 1266 mem = M_1to2; 1267 } else if (suf == "ps") { 1268 mem = M_1to4; 1269 } else { 1270 mem = XMM_ER; 1271 } 1272 std::string name = std::string(p.name) + ord[j].name + suf; 1273 const char *q = name.c_str(); 1274 put(q, XMM_KZ, _XMM, mem | _MEM); 1275 if (!p.supportYMM) continue; 1276 if (suf == "pd") { 1277 mem = M_1to8; 1278 } else if (suf == "ps") { 1279 mem = M_1to16; 1280 } else { 1281 mem = XMM_ER; 1282 } 1283 put(q, _ZMM, _ZMM, mem | _MEM); 1284 } 1285 } 1286 } 1287 } 1288 void put512_Y_XM() 1289 { 1290 const struct Tbl { 1291 const char *name; 1292 bool all_xmm; // 2nd param 1293 } tbl[] = { 1294 { "vpmovsxbw", false }, 1295 { "vpmovsxbd", true }, 1296 { "vpmovsxbq", true }, 1297 { "vpmovsxwd", false }, 1298 { "vpmovsxwq", true }, 1299 { "vpmovsxdq", false }, 1300 1301 { "vpmovzxbw", false }, 1302 { "vpmovzxbd", true }, 1303 { "vpmovzxbq", true }, 1304 { "vpmovzxwd", false }, 1305 { "vpmovzxwq", true }, 1306 { "vpmovzxdq", false }, 1307 }; 1308 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1309 const Tbl& p = tbl[i]; 1310 const char *name = p.name; 1311 put(name, XMM_KZ | YMM, _XMM | _MEM); 1312 if (p.all_xmm) { 1313 put(name, ZMM, _XMM | _MEM); 1314 } else { 1315 put(name, ZMM, YMM | _MEM); 1316 } 1317 } 1318 } 1319 void put512_AVX1() 1320 { 1321 #ifdef XBYAK64 1322 const struct Tbl { 1323 std::string name; 1324 bool only_pd_ps; 1325 } tbl[] = { 1326 { "vadd", false }, 1327 { "vsub", false }, 1328 { "vmul", false }, 1329 { "vdiv", false }, 1330 { "vmax", false }, 1331 { "vmin", false }, 1332 { "vand", true }, 1333 { "vandn", true }, 1334 { "vor", true }, 1335 { "vxor", true }, 1336 }; 1337 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1338 const struct Suf { 1339 const char *suf; 1340 bool supportYMM; 1341 } sufTbl[] = { 1342 { "pd", true }, 1343 { "ps", true }, 1344 { "sd", false }, 1345 { "ss", false }, 1346 }; 1347 for (size_t j = 0; j < NUM_OF_ARRAY(sufTbl); j++) { 1348 if (tbl[i].only_pd_ps && j == 2) break; 1349 std::string suf = sufTbl[j].suf; 1350 std::string name = tbl[i].name + suf; 1351 const char *p = name.c_str(); 1352 uint64_t mem = 0; 1353 if (suf == "pd") { 1354 mem = M_1to2; 1355 } else if (suf == "ps") { 1356 mem = M_1to4; 1357 } 1358 put(p, XMM | XMM_KZ, _XMM, mem | _MEM); 1359 if (!sufTbl[j].supportYMM) continue; 1360 mem = 0; 1361 if (suf == "pd") { 1362 mem = M_1to8; 1363 } else if (suf == "ps") { 1364 mem = M_1to16; 1365 } 1366 put(p, _ZMM, _ZMM, mem | _MEM); 1367 } 1368 } 1369 put("vaddss", XMM, _XMM, XMM_ER); 1370 put("vaddsd", XMM, _XMM, XMM_ER); 1371 #endif 1372 } 1373 void putAVX1() 1374 { 1375 const struct Tbl { 1376 const char *name; 1377 bool only_pd_ps; 1378 } tbl[] = { 1379 { "add", false }, 1380 { "sub", false }, 1381 { "mul", false }, 1382 { "div", false }, 1383 { "max", false }, 1384 { "min", false }, 1385 { "and", true }, 1386 { "andn", true }, 1387 { "or", true }, 1388 { "xor", true }, 1389 }; 1390 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1391 const struct Suf { 1392 const char *suf; 1393 bool supportYMM; 1394 } suf[] = { 1395 { "pd", true }, 1396 { "ps", true }, 1397 { "sd", false }, 1398 { "ss", false }, 1399 }; 1400 for (size_t j = 0; j < NUM_OF_ARRAY(suf); j++) { 1401 if (tbl[i].only_pd_ps && j == 2) break; 1402 std::string name = std::string("v") + tbl[i].name + suf[j].suf; 1403 const char *p = name.c_str(); 1404 put(p, XMM, XMM | MEM); 1405 put(p, XMM, XMM, XMM | MEM); 1406 if (!suf[j].supportYMM) continue; 1407 put(p, YMM, YMM | MEM); 1408 put(p, YMM, YMM, YMM | MEM); 1409 put(p, ZMM, ZMM, ZMM | MEM); 1410 } 1411 } 1412 } 1413 void put512_cvt() 1414 { 1415 #ifdef XBYAK64 1416 put("vcvtdq2pd", XMM_KZ, _XMM | _MEM | M_1to2); 1417 put("vcvtdq2pd", YMM_KZ, _XMM | _MEM | M_1to4); 1418 put("vcvtdq2pd", ZMM_KZ, _YMM | _MEM | M_1to8); 1419 1420 put("vcvtdq2ps", XMM_KZ, _XMM | _MEM | M_1to4); 1421 put("vcvtdq2ps", YMM_KZ, _YMM | _MEM | M_1to8); 1422 put("vcvtdq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER); 1423 1424 put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2); 1425 put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4); 1426 put("vcvtpd2dq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); 1427 1428 put("vcvtpd2ps", XMM_KZ, _XMM | M_xword | M_1to2); 1429 put("vcvtpd2ps", XMM_KZ, _YMM | M_yword | MY_1to4); 1430 put("vcvtpd2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); 1431 1432 put("vcvtpd2qq", XMM_KZ, _XMM | _MEM | M_1to2); 1433 put("vcvtpd2qq", YMM_KZ, _YMM | _MEM | M_1to4); 1434 put("vcvtpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER); 1435 1436 put("vcvtpd2udq", XMM_KZ, _XMM | M_xword | M_1to2); 1437 put("vcvtpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4); 1438 put("vcvtpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); 1439 1440 put("vcvtpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2); 1441 put("vcvtpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4); 1442 put("vcvtpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER); 1443 1444 put("vcvtph2ps", XMM_KZ, _XMM | _MEM); 1445 put("vcvtph2ps", YMM_KZ, _XMM | _MEM); 1446 put("vcvtph2ps", ZMM_KZ, _YMM | _MEM | YMM_SAE); 1447 1448 put("vcvtps2ph", XMM_KZ | _MEM, _XMM, IMM8); 1449 put("vcvtps2ph", XMM_KZ | _MEM, _YMM, IMM8); 1450 put("vcvtps2ph", YMM_KZ | _MEM, _ZMM, IMM8); 1451 put("vcvtps2ph", YMM_KZ, ZMM_SAE, IMM8); 1452 1453 put("vcvtps2dq", XMM_KZ, _XMM | _MEM | M_1to4); 1454 put("vcvtps2dq", YMM_KZ, _YMM | _MEM | M_1to8); 1455 put("vcvtps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER); 1456 1457 put("vcvtps2udq", XMM_KZ, _XMM | M_1to4); 1458 put("vcvtps2udq", YMM_KZ, _YMM | M_1to8); 1459 put("vcvtps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER); 1460 1461 put("vcvtps2qq", XMM_KZ, _XMM | _MEM | M_1to2); 1462 put("vcvtps2qq", YMM_KZ, _XMM | _MEM | M_1to4); 1463 put("vcvtps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER); 1464 1465 put("vcvtps2uqq", XMM_KZ, _XMM | _MEM | M_1to2); 1466 put("vcvtps2uqq", YMM_KZ, _XMM | _MEM | M_1to4); 1467 put("vcvtps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER); 1468 1469 put("vcvtps2pd", XMM_KZ, _XMM | _MEM | M_1to2); 1470 put("vcvtps2pd", YMM_KZ, _XMM | _MEM | M_1to4); 1471 put("vcvtps2pd", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE); 1472 1473 put("vcvtqq2pd", XMM_KZ, _XMM | _MEM | M_1to2); 1474 put("vcvtqq2pd", YMM_KZ, _YMM | _MEM | M_1to4); 1475 put("vcvtqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER); 1476 1477 put("vcvtqq2ps", XMM_KZ, _XMM | M_xword | M_1to2); 1478 put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); 1479 put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); 1480 1481 put("vcvtsd2si", REG32 | REG64, XMM | _MEM | XMM_ER); 1482 1483 put("vcvtsd2usi", REG32 | REG64, XMM | _MEM | XMM_ER); 1484 1485 put("vcvtsd2ss", XMM_KZ, XMM, _XMM3 | _MEM | XMM_ER); 1486 1487 put("vcvtsi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); 1488 put("vcvtsi2sd", XMM, XMM_ER, REG64); 1489 1490 put("vcvtsi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); 1491 put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64); 1492 1493 put("vcvtss2sd", XMM_KZ, XMM, _XMM3 | _MEM | XMM_SAE); 1494 1495 put("vcvtss2si", REG32 | REG64, XMM | _MEM | XMM_ER); 1496 1497 put("vcvtss2usi", REG32 | REG64, XMM | _MEM | XMM_ER); 1498 1499 put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2); 1500 put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4); 1501 put("vcvtpd2dq", YMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER); 1502 1503 put("vcvttpd2qq", XMM_KZ, _XMM | _MEM | M_1to2); 1504 put("vcvttpd2qq", YMM_KZ, _YMM | _MEM | M_1to4); 1505 put("vcvttpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE); 1506 1507 put("vcvttpd2udq", XMM_KZ, _XMM | M_xword | M_1to2); 1508 put("vcvttpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4); 1509 put("vcvttpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_SAE); 1510 1511 put("vcvttpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2); 1512 put("vcvttpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4); 1513 put("vcvttpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE); 1514 1515 put("vcvttps2dq", XMM_KZ, _XMM | _MEM | M_1to4); 1516 put("vcvttps2dq", YMM_KZ, _YMM | _MEM | M_1to8); 1517 put("vcvttps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE); 1518 1519 put("vcvttps2udq", XMM_KZ, _XMM | M_1to4); 1520 put("vcvttps2udq", YMM_KZ, _YMM | M_1to8); 1521 put("vcvttps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE); 1522 1523 put("vcvttps2qq", XMM_KZ, _XMM | _MEM | M_1to2); 1524 put("vcvttps2qq", YMM_KZ, _XMM | _MEM | M_1to4); 1525 put("vcvttps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE); 1526 1527 put("vcvttps2uqq", XMM_KZ, _XMM | _MEM | M_1to2); 1528 put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4); 1529 put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE); 1530 1531 put("vcvttsd2si", REG32 | REG64, XMM | _MEM | XMM_SAE); 1532 1533 put("vcvttsd2usi", REG32 | REG64, XMM | _MEM | XMM_SAE); 1534 1535 put("vcvttss2si", REG32 | REG64, XMM | _MEM | XMM_SAE); 1536 1537 put("vcvttss2usi", REG32 | REG64, XMM | _MEM | XMM_SAE); 1538 1539 put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2); 1540 put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4); 1541 put("vcvtudq2pd", ZMM_KZ, _YMM | _MEM | M_1to8); 1542 1543 put("vcvtudq2ps", XMM_KZ, _XMM | _MEM | M_1to4); 1544 put("vcvtudq2ps", YMM_KZ, _YMM | _MEM | M_1to8); 1545 put("vcvtudq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER); 1546 1547 put("vcvtuqq2pd", XMM_KZ, _XMM | _MEM | M_1to2); 1548 put("vcvtuqq2pd", YMM_KZ, _YMM | _MEM | M_1to4); 1549 put("vcvtuqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER); 1550 1551 put("vcvtuqq2ps", XMM_KZ, _XMM | M_xword | M_1to2); 1552 put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4); 1553 put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER); 1554 1555 put("vcvtusi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); 1556 put("vcvtusi2sd", XMM, XMM_ER, REG64); 1557 1558 put("vcvtusi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64); 1559 put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64); 1560 #endif 1561 } 1562 enum { 1563 xx_yy_zz, 1564 xx_yx_zy, 1565 xx_xy_yz 1566 }; 1567 void putGather() 1568 { 1569 #ifdef XBYAK64 1570 const struct Tbl { 1571 const char *name; 1572 int mode; 1573 } tbl[] = { 1574 { "vpgatherdd", xx_yy_zz }, 1575 { "vpgatherdq", xx_yx_zy }, 1576 { "vpgatherqd", xx_xy_yz }, 1577 { "vpgatherqq", xx_yy_zz }, 1578 { "vgatherdps", xx_yy_zz }, 1579 { "vgatherdpd", xx_yx_zy }, 1580 { "vgatherqps", xx_xy_yz }, 1581 { "vgatherqpd", xx_yy_zz }, 1582 }; 1583 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1584 const Tbl& p = tbl[i]; 1585 switch (p.mode) { 1586 case xx_yy_zz: 1587 put(p.name, XMM_K, VM32X); 1588 put(p.name, YMM_K, VM32Y); 1589 put(p.name, ZMM_K, VM32Z); 1590 break; 1591 case xx_yx_zy: 1592 put(p.name, XMM_K, VM32X); 1593 put(p.name, YMM_K, VM32X); 1594 put(p.name, ZMM_K, VM32Y); 1595 break; 1596 case xx_xy_yz: 1597 put(p.name, XMM_K, VM32X); 1598 put(p.name, XMM_K, VM32Y); 1599 put(p.name, YMM_K, VM32Z); 1600 break; 1601 } 1602 } 1603 #endif 1604 } 1605 void putScatter() 1606 { 1607 #ifdef XBYAK64 1608 const struct Tbl { 1609 const char *name; 1610 int mode; 1611 } tbl[] = { 1612 { "vpscatterdd", xx_yy_zz }, 1613 { "vpscatterdq", xx_xy_yz }, 1614 { "vpscatterqd", xx_yx_zy }, 1615 { "vpscatterqq", xx_yy_zz }, 1616 1617 { "vscatterdps", xx_yy_zz }, 1618 { "vscatterdpd", xx_xy_yz }, 1619 { "vscatterqps", xx_yx_zy }, 1620 { "vscatterqpd", xx_yy_zz }, 1621 }; 1622 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1623 const Tbl& p = tbl[i]; 1624 switch (p.mode) { 1625 case xx_yy_zz: 1626 put(p.name, VM32X_K, _XMM); 1627 put(p.name, VM32Y_K, _YMM); 1628 put(p.name, VM32Z_K, _ZMM); 1629 break; 1630 case xx_yx_zy: 1631 put(p.name, VM32X_K, _XMM); 1632 put(p.name, VM32Y_K, _XMM); 1633 put(p.name, VM32Z_K, _YMM); 1634 break; 1635 case xx_xy_yz: 1636 put(p.name, VM32X_K, _XMM); 1637 put(p.name, VM32X_K, _YMM); 1638 put(p.name, VM32Y_K, _ZMM); 1639 break; 1640 } 1641 } 1642 #endif 1643 } 1644 void putBlend() 1645 { 1646 put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1647 put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1648 put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 1649 1650 put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); 1651 put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); 1652 put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); 1653 1654 put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM); 1655 put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM); 1656 put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM); 1657 1658 put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM); 1659 put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM); 1660 put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM); 1661 1662 put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); 1663 put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); 1664 put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); 1665 1666 put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1667 put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1668 put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 1669 } 1670 void putVpcmp() 1671 { 1672 const uint64_t b0Tbl[] = { 0, 0, 0 }; 1673 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 }; 1674 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 }; 1675 const struct Tbl { 1676 const char *name; 1677 uint64_t b; 1678 } tbl[] = { 1679 { "vpcmpb", 0 }, 1680 { "vpcmpub", 0 }, 1681 { "vpcmpw", 0 }, 1682 { "vpcmpuw", 0 }, 1683 { "vpcmpd", M_1to4 }, 1684 { "vpcmpud", M_1to4 }, 1685 { "vpcmpq", M_1to2 }, 1686 { "vpcmpuq", M_1to2 }, 1687 }; 1688 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1689 const Tbl& p = tbl[i]; 1690 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl; 1691 put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8); 1692 put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8); 1693 put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8); 1694 } 1695 } 1696 void putVtest() 1697 { 1698 const uint64_t b0Tbl[] = { 0, 0, 0 }; 1699 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 }; 1700 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 }; 1701 const struct Tbl { 1702 const char *name; 1703 uint64_t b; 1704 } tbl[] = { 1705 { "vptestmb", 0 }, 1706 { "vptestmw", 0 }, 1707 { "vptestmd", M_1to4 }, 1708 { "vptestmq", M_1to2 }, 1709 1710 { "vptestnmb", 0 }, 1711 { "vptestnmw", 0 }, 1712 { "vptestnmd", M_1to4 }, 1713 { "vptestnmq", M_1to2 }, 1714 }; 1715 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1716 const Tbl& p = tbl[i]; 1717 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl; 1718 put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0]); 1719 put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1]); 1720 put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2]); 1721 } 1722 } 1723 void putCompExp() 1724 { 1725 { 1726 const char *tbl[] = { 1727 "vcompresspd", 1728 "vcompressps", 1729 "vpcompressd", 1730 "vpcompressq", 1731 }; 1732 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1733 const char *name = tbl[i]; 1734 put(name, XMM_KZ | _MEM, _XMM); 1735 put(name, YMM_KZ | _MEM, _YMM); 1736 put(name, ZMM_KZ | _MEM, _ZMM); 1737 } 1738 } 1739 { 1740 const char *tbl[] = { 1741 "vexpandpd", 1742 "vexpandps", 1743 "vpexpandd", 1744 "vpexpandq", 1745 }; 1746 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1747 const char *name = tbl[i]; 1748 put(name, XMM_KZ, _XMM | _MEM); 1749 put(name, YMM_KZ, _YMM | _MEM); 1750 put(name, ZMM_KZ, _ZMM | _MEM); 1751 } 1752 } 1753 } 1754 void putPerm() 1755 { 1756 const uint64_t b0Tbl[] = { 0, 0, 0 }; 1757 const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 }; 1758 const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 }; 1759 const struct Tbl { 1760 const char *name; 1761 uint64_t b; 1762 } tbl[] = { 1763 { "vpermt2b", 0 }, 1764 { "vpermt2w", 0 }, 1765 { "vpermt2d", M_1to4 }, 1766 { "vpermt2q", M_1to2 }, 1767 { "vpermt2ps", M_1to4 }, 1768 { "vpermt2pd", M_1to2 }, 1769 1770 { "vpermi2b", 0 }, 1771 { "vpermi2w", 0 }, 1772 { "vpermi2d", M_1to4 }, 1773 { "vpermi2q", M_1to2 }, 1774 { "vpermi2ps", M_1to4 }, 1775 }; 1776 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 1777 const Tbl& p = tbl[i]; 1778 const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl; 1779 put(p.name, XMM_KZ, _XMM, _XMM | _MEM | bTbl[0]); 1780 put(p.name, YMM_KZ, _YMM, _YMM | _MEM | bTbl[1]); 1781 put(p.name, ZMM_KZ, _ZMM, _ZMM | _MEM | bTbl[2]); 1782 } 1783 } 1784 void putShuff() 1785 { 1786 put("vshuff32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8); 1787 put("vshuff32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8); 1788 1789 put("vshuff64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8); 1790 put("vshuff64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8); 1791 1792 put("vshufi32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8); 1793 put("vshufi32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8); 1794 1795 put("vshufi64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8); 1796 put("vshufi64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8); 1797 } 1798 void putMov() 1799 { 1800 put("vpmovm2b", _XMM | _YMM | _ZMM, K); 1801 put("vpmovm2w", _XMM | _YMM | _ZMM, K); 1802 put("vpmovm2d", _XMM | _YMM | _ZMM, K); 1803 put("vpmovm2q", _XMM | _YMM | _ZMM, K); 1804 1805 put("vpmovb2m", K, _XMM | _YMM | _ZMM); 1806 put("vpmovw2m", K, _XMM | _YMM | _ZMM); 1807 put("vpmovd2m", K, _XMM | _YMM | _ZMM); 1808 put("vpmovq2m", K, _XMM | _YMM | _ZMM); 1809 1810 put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1811 put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1812 put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1813 1814 put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1815 put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1816 put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1817 1818 put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1819 put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM); 1820 1821 put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1822 put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM); 1823 1824 put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1825 put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM); 1826 1827 put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1828 put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1829 put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM); 1830 1831 put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1832 put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM); 1833 1834 put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1835 put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM); 1836 1837 put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1838 put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM); 1839 1840 put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1841 put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM); 1842 1843 put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1844 put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM); 1845 1846 put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM); 1847 put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM); 1848 } 1849 void putRot() 1850 { 1851 put("vprolvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); 1852 put("vprolvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); 1853 put("vprolvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); 1854 1855 put("vprolvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1856 put("vprolvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1857 put("vprolvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 1858 1859 put("vprorvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); 1860 put("vprorvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); 1861 put("vprorvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); 1862 1863 put("vprorvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1864 put("vprorvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1865 put("vprorvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 1866 1867 put("vprold", XMM_KZ, _XMM | _MEM | M_1to4, IMM8); 1868 put("vprold", YMM_KZ, _YMM | _MEM | M_1to8, IMM8); 1869 put("vprold", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8); 1870 1871 put("vprolq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8); 1872 put("vprolq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8); 1873 put("vprolq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8); 1874 1875 put("vprord", XMM_KZ, _XMM | _MEM | M_1to4, IMM8); 1876 put("vprord", YMM_KZ, _YMM | _MEM | M_1to8, IMM8); 1877 put("vprord", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8); 1878 1879 put("vprorq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8); 1880 put("vprorq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8); 1881 put("vprorq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8); 1882 } 1883 void putMisc2() 1884 { 1885 #ifdef XBYAK64 1886 put("vpternlogd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8); 1887 put("vpternlogd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8); 1888 put("vpternlogd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8); 1889 1890 put("vpternlogq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8); 1891 put("vpternlogq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8); 1892 put("vpternlogq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8); 1893 1894 put("vgetexppd", XMM_KZ, _XMM | MEM | M_1to2); 1895 put("vgetexppd", YMM_KZ, _YMM | MEM | M_1to4); 1896 put("vgetexppd", ZMM_KZ, _ZMM | MEM | M_1to8 | ZMM_SAE); 1897 1898 put("vgetexpps", XMM_KZ, _XMM | MEM | M_1to4); 1899 put("vgetexpps", YMM_KZ, _YMM | MEM | M_1to8); 1900 put("vgetexpps", ZMM_KZ, _ZMM | MEM | M_1to16 | ZMM_SAE); 1901 1902 put("vgetexpsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 1903 put("vgetexpss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 1904 1905 put("vgetmantpd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8); 1906 put("vgetmantpd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8); 1907 put("vgetmantpd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8); 1908 1909 put("vgetmantps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8); 1910 put("vgetmantps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8); 1911 put("vgetmantps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8); 1912 1913 put("vgetmantsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 1914 put("vgetmantss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 1915 1916 put("vfixupimmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8); 1917 put("vfixupimmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8); 1918 put("vfixupimmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8); 1919 1920 put("vfixupimmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8); 1921 put("vfixupimmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8); 1922 put("vfixupimmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8); 1923 1924 put("vfixupimmsd", XMM_KZ, _XMM, _XMM | _MEM, IMM8); 1925 put("vfixupimmss", XMM_KZ, _XMM, _XMM | _MEM, IMM8); 1926 1927 put("vrcp14pd", XMM_KZ, _XMM | _MEM | M_1to2); 1928 put("vrcp14pd", YMM_KZ, _YMM | _MEM | M_1to4); 1929 put("vrcp14pd", ZMM_KZ, _ZMM | _MEM | M_1to8); 1930 1931 put("vrcp14ps", XMM_KZ, _XMM | _MEM | M_1to4); 1932 put("vrcp14ps", YMM_KZ, _YMM | _MEM | M_1to8); 1933 put("vrcp14ps", ZMM_KZ, _ZMM | _MEM | M_1to16); 1934 1935 put("vrcp14sd", XMM_KZ, _XMM, _XMM | _MEM); 1936 1937 put("vrcp14ss", XMM_KZ, _XMM, _XMM | _MEM); 1938 1939 put("vrsqrt14pd", XMM_KZ, _XMM | _MEM | M_1to2); 1940 put("vrsqrt14pd", YMM_KZ, _YMM | _MEM | M_1to4); 1941 put("vrsqrt14pd", ZMM_KZ, _ZMM | _MEM | M_1to8); 1942 1943 put("vrsqrt14ps", XMM_KZ, _XMM | _MEM | M_1to4); 1944 put("vrsqrt14ps", YMM_KZ, _YMM | _MEM | M_1to8); 1945 put("vrsqrt14ps", ZMM_KZ, _ZMM | _MEM | M_1to16); 1946 1947 put("vrsqrt14sd", XMM_KZ, _XMM, _XMM | _MEM); 1948 1949 put("vrsqrt14ss", XMM_KZ, _XMM, _XMM | _MEM); 1950 1951 put("vrndscalepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8); 1952 put("vrndscalepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8); 1953 put("vrndscalepd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8); 1954 put("vrndscalepd", ZMM_KZ, _ZMM | ZMM_SAE, IMM8); 1955 1956 put("vrndscaleps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8); 1957 put("vrndscaleps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8); 1958 put("vrndscaleps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8); 1959 put("vrndscaleps", ZMM_KZ, _ZMM | ZMM_SAE, IMM8); 1960 1961 put("vrndscalesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 1962 1963 put("vrndscaless", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 1964 1965 put("vscalefpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1966 put("vscalefpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1967 put("vscalefpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_ER); 1968 1969 put("vscalefps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); 1970 put("vscalefps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); 1971 put("vscalefps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_ER); 1972 1973 put("vscalefsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER); 1974 put("vscalefss", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER); 1975 1976 put("vdbpsadbw", XMM_KZ, _XMM, _XMM | _MEM, IMM8); 1977 put("vdbpsadbw", YMM_KZ, _YMM, _YMM | _MEM, IMM8); 1978 put("vdbpsadbw", ZMM_KZ, _ZMM, _ZMM | _MEM, IMM8); 1979 1980 put("vpmultishiftqb", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 1981 put("vpmultishiftqb", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 1982 put("vpmultishiftqb", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 1983 1984 put("vpconflictd", XMM_KZ, _XMM | _MEM | M_1to4); 1985 put("vpconflictd", YMM_KZ, _YMM | _MEM | M_1to8); 1986 put("vpconflictd", ZMM_KZ, _ZMM | _MEM | M_1to16); 1987 1988 put("vpconflictq", XMM_KZ, _XMM | _MEM | M_1to2); 1989 put("vpconflictq", YMM_KZ, _YMM | _MEM | M_1to4); 1990 put("vpconflictq", ZMM_KZ, _ZMM | _MEM | M_1to8); 1991 1992 put("vplzcntd", XMM_KZ, _XMM | _MEM | M_1to4); 1993 put("vplzcntd", YMM_KZ, _YMM | _MEM | M_1to8); 1994 put("vplzcntd", ZMM_KZ, _ZMM | _MEM | M_1to16); 1995 1996 put("vplzcntq", XMM_KZ, _XMM | _MEM | M_1to2); 1997 put("vplzcntq", YMM_KZ, _YMM | _MEM | M_1to4); 1998 put("vplzcntq", ZMM_KZ, _ZMM | _MEM | M_1to8); 1999 2000 put("vpbroadcastmb2q", _XMM | _YMM | _ZMM, K); 2001 put("vpbroadcastmw2d", _XMM | _YMM | _ZMM, K); 2002 2003 put("vexp2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE); 2004 put("vexp2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE); 2005 2006 put("vrcp28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE); 2007 put("vrcp28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE); 2008 put("vrcp28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 2009 put("vrcp28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 2010 2011 put("vrsqrt28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE); 2012 put("vrsqrt28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE); 2013 put("vrsqrt28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 2014 put("vrsqrt28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE); 2015 2016 put("vgatherpf0dps", VM32Z_K); 2017 put("vgatherpf0qps", VM32Z_K); 2018 put("vgatherpf0dpd", VM32Y_K); 2019 put("vgatherpf0qpd", VM32Z_K); 2020 2021 put("vgatherpf1dps", VM32Z_K); 2022 put("vgatherpf1qps", VM32Z_K); 2023 put("vgatherpf1dpd", VM32Y_K); 2024 put("vgatherpf1qpd", VM32Z_K); 2025 2026 put("vscatterpf0dps", VM32Z_K); 2027 put("vscatterpf0qps", VM32Z_K); 2028 put("vscatterpf0dpd", VM32Y_K); 2029 put("vscatterpf0qpd", VM32Z_K); 2030 2031 put("vscatterpf1dps", VM32Z_K); 2032 put("vscatterpf1qps", VM32Z_K); 2033 put("vscatterpf1dpd", VM32Y_K); 2034 put("vscatterpf1qpd", VM32Z_K); 2035 2036 put("vrangepd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8); 2037 put("vrangepd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8); 2038 put("vrangepd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8); 2039 2040 put("vrangeps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8); 2041 put("vrangeps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8); 2042 put("vrangeps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8); 2043 2044 put("vrangesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 2045 put("vrangess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 2046 2047 put("vreducepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8); 2048 put("vreducepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8); 2049 put("vreducepd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8); 2050 2051 put("vreduceps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8); 2052 put("vreduceps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8); 2053 put("vreduceps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8); 2054 2055 put("vreducesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 2056 put("vreducess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8); 2057 2058 put("vpmadd52luq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 2059 put("vpmadd52luq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 2060 put("vpmadd52luq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 2061 2062 put("vpmadd52huq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); 2063 put("vpmadd52huq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); 2064 put("vpmadd52huq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); 2065 #endif 2066 } 2067 void classSubMem(const char *nm, char x, bool broadcast, int size) 2068 { 2069 printf("%s ", nm); 2070 if (isXbyak_) { 2071 printf("(k5|k3, %cword%s [rax+64], 5);dump();\n", x, broadcast ? "_b" : ""); 2072 } else { 2073 if (broadcast) { 2074 int d = x == 'x' ? 128 / size : x == 'y' ? 256 / size : 512 / size; 2075 printf("k5{k3}, [rax+64]{1to%d}, 5\n", d); 2076 } else { 2077 if (x == 'x') x = 'o'; // nasm 2078 printf("k5{k3}, %cword [rax+64], 5\n", x); 2079 } 2080 } 2081 } 2082 void putClassSub(const char *name, int size) 2083 { 2084 put(name, K_K, _XMM | _YMM | _ZMM, IMM8); 2085 for (int i = 0; i < 2; i++) { 2086 classSubMem(name, 'x', i == 0, size); 2087 classSubMem(name, 'y', i == 0, size); 2088 classSubMem(name, 'z', i == 0, size); 2089 } 2090 } 2091 void putClass() 2092 { 2093 #ifdef XBYAK64 2094 putClassSub("vfpclasspd", 64); 2095 putClassSub("vfpclassps", 32); 2096 put("vfpclasssd", K_K, _XMM | _MEM, IMM8); 2097 put("vfpclassss", K_K, _XMM | _MEM, IMM8); 2098 #endif 2099 } 2100 void putMin() 2101 { 2102 #ifdef XBYAK64 2103 put("vextractf32x4", XMM_KZ, _YMM, IMM8); 2104 #endif 2105 } 2106 void putDisp8N() 2107 { 2108 { 2109 const int tbl[] = { 2110 -129, -128, -127, 0, 1, 64, 65, 127, 128 2111 }; 2112 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 2113 char xs[128], ns[128]; 2114 int v = tbl[i]; 2115 CYBOZU_SNPRINTF(xs, sizeof(xs), "xmm0, ptr[eax%+d]", v); 2116 CYBOZU_SNPRINTF(ns, sizeof(ns), "xmm0, [eax%+d]", v); 2117 put("vpbroadcastb", xs, ns); 2118 } 2119 } 2120 { 2121 const int tbl[] = { 2122 -1024, -512 -256, -128, -64, -32, -16, -8, -4, -2, -1, 2123 0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 2124 }; 2125 for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { 2126 char xs[128], ns[128]; 2127 int v = tbl[i]; 2128 CYBOZU_SNPRINTF(xs, sizeof(xs), "zmm0, zmm1, ptr_b[eax%+d]", v); 2129 CYBOZU_SNPRINTF(ns, sizeof(ns), "zmm0, zmm1, [eax%+d]{1to16}", v); 2130 put("vaddps", xs, ns); 2131 } 2132 } 2133 #ifdef XBYAK64 2134 put("vfmadd231ps", "zmm8, zmm31, ptr_b[r14+rbp-0x1e4]", "zmm8, zmm31, [r14+rbp-0x1e4]{1to16}"); 2135 #endif 2136 } 2137 void putAVX512() 2138 { 2139 #ifdef MIN_TEST 2140 putMin(); 2141 #else 2142 putOpmask(); 2143 separateFunc(); 2144 putCombi(); 2145 separateFunc(); 2146 putCmpK(); 2147 separateFunc(); 2148 putBroadcast(); 2149 separateFunc(); 2150 putAVX512_M_X(); 2151 separateFunc(); 2152 put_vmov(); 2153 separateFunc(); 2154 put512_X_XM(); 2155 separateFunc(); 2156 put512_X_X_XM(); 2157 separateFunc(); 2158 put512_X3(); 2159 separateFunc(); 2160 put512_X3_I(); 2161 separateFunc(); 2162 put512_FMA(); 2163 separateFunc(); 2164 put512_Y_XM(); 2165 separateFunc(); 2166 put512_AVX1(); 2167 separateFunc(); 2168 put512_cvt(); 2169 separateFunc(); 2170 putMisc1(); 2171 separateFunc(); 2172 putGather(); 2173 separateFunc(); 2174 putBlend(); 2175 separateFunc(); 2176 putVpcmp(); 2177 separateFunc(); 2178 putVtest(); 2179 separateFunc(); 2180 putCompExp(); 2181 separateFunc(); 2182 putPerm(); 2183 separateFunc(); 2184 putShuff(); 2185 separateFunc(); 2186 putMisc2(); 2187 separateFunc(); 2188 putMov(); 2189 separateFunc(); 2190 putRot(); 2191 separateFunc(); 2192 putScatter(); 2193 separateFunc(); 2194 putClass(); 2195 putDisp8N(); 2196 #endif 2197 } 2198 }; 2199 2200 int main(int argc, char *[]) 2201 { 2202 Test test(argc > 1); 2203 test.put(); 2204 }