/ externals / xbyak / test / make_512.cpp
make_512.cpp
   1  #include <stdio.h>
   2  #include "xbyak/xbyak.h"
   3  #include <stdlib.h>
   4  #include <string.h>
   5  #include "cybozu/inttype.hpp"
   6  #define NUM_OF_ARRAY(x) (sizeof(x) / sizeof(x[0]))
   7  
   8  using namespace Xbyak;
   9  
  10  const int bitEnd = 64;
  11  
  12  const uint64_t YMM_SAE = 1ULL << 0;
  13  const uint64_t _XMM = 1ULL << 1;
  14  const uint64_t _MEM = 1ULL << 2;
  15  const uint64_t _REG32 = 1ULL << 3;
  16  const uint64_t EAX = 1ULL << 4;
  17  const uint64_t IMM32 = 1ULL << 5;
  18  const uint64_t IMM8 = 1ULL << 6;
  19  const uint64_t _REG8 = 1ULL << 7;
  20  const uint64_t _REG16 = 1ULL << 8;
  21  const uint64_t XMM_K = 1ULL << 9;
  22  const uint64_t YMM_K = 1ULL << 10;
  23  const uint64_t ZMM_K = 1ULL << 11;
  24  const uint64_t AX = 1ULL << 12;
  25  const uint64_t AL = 1ULL << 13;
  26  const uint64_t IMM_1 = 1ULL << 14;
  27  const uint64_t MEM8 = 1ULL << 15;
  28  const uint64_t MEM16 = 1ULL << 16;
  29  const uint64_t MEM32 = 1ULL << 17;
  30  const uint64_t VM32Z = 1ULL << 19;
  31  const uint64_t K_K = 1ULL << 20;
  32  const uint64_t MEM_ONLY_DISP = 1ULL << 21;
  33  const uint64_t VM32X_K = 1ULL << 23;
  34  const uint64_t _YMM = 1ULL << 24;
  35  const uint64_t VM32X_32 = 1ULL << 39;
  36  const uint64_t VM32X_64 = 1ULL << 40;
  37  const uint64_t VM32Y_32 = 1ULL << 41;
  38  const uint64_t VM32Y_64 = 1ULL << 42;
  39  const uint64_t VM32Z_K = 1ULL << 32;
  40  #ifdef XBYAK64
  41  const uint64_t _MEMe = 1ULL << 25;
  42  const uint64_t REG32_2 = 1ULL << 26; // r8d, ...
  43  const uint64_t REG16_2 = 1ULL << 27; // r8w, ...
  44  const uint64_t REG8_2 = 1ULL << 28; // r8b, ...
  45  const uint64_t REG8_3 = 1ULL << 29; // spl, ...
  46  const uint64_t _REG64 = 1ULL << 30; // rax, ...
  47  const uint64_t _REG64_2 = 1ULL << 31; // r8, ...
  48  const uint64_t _XMM2 = 1ULL << 33;
  49  const uint64_t _YMM2 = 1ULL << 34;
  50  const uint64_t VM32X = VM32X_32 | VM32X_64;
  51  const uint64_t VM32Y = VM32Y_32 | VM32Y_64;
  52  #else
  53  const uint64_t _MEMe = 0;
  54  const uint64_t REG32_2 = 0;
  55  const uint64_t REG16_2 = 0;
  56  const uint64_t REG8_2 = 0;
  57  const uint64_t REG8_3 = 0;
  58  const uint64_t _REG64 = 0;
  59  const uint64_t _REG64_2 = 0;
  60  const uint64_t _XMM2 = 0;
  61  const uint64_t _YMM2 = 0;
  62  const uint64_t VM32X = VM32X_32;
  63  const uint64_t VM32Y = VM32Y_32;
  64  #endif
  65  const uint64_t REG64 = _REG64 | _REG64_2;
  66  const uint64_t REG32 = _REG32 | REG32_2 | EAX;
  67  const uint64_t REG16 = _REG16 | REG16_2 | AX;
  68  const uint64_t REG32e = REG32 | REG64;
  69  const uint64_t REG8 = _REG8 | REG8_2|AL;
  70  const uint64_t MEM = _MEM | _MEMe;
  71  const uint64_t MEM64 = 1ULL << 35;
  72  const uint64_t YMM_ER = 1ULL << 36;
  73  const uint64_t VM32Y_K = 1ULL << 37;
  74  const uint64_t IMM_2 = 1ULL << 38;
  75  const uint64_t IMM = IMM_1 | IMM_2;
  76  const uint64_t YMM = _YMM | _YMM2;
  77  const uint64_t K = 1ULL << 43;
  78  const uint64_t _ZMM = 1ULL << 44;
  79  const uint64_t _ZMM2 = 1ULL << 45;
  80  #ifdef XBYAK64
  81  const uint64_t ZMM = _ZMM | _ZMM2;
  82  const uint64_t _YMM3 = 1ULL << 46;
  83  #else
  84  const uint64_t ZMM = _ZMM;
  85  const uint64_t _YMM3 = 0;
  86  #endif
  87  const uint64_t K2 = 1ULL << 47;
  88  const uint64_t ZMM_SAE = 1ULL << 48;
  89  const uint64_t ZMM_ER = 1ULL << 49;
  90  #ifdef XBYAK64
  91  const uint64_t _XMM3 = 1ULL << 50;
  92  #else
  93  const uint64_t _XMM3 = 0;
  94  #endif
  95  const uint64_t XMM = _XMM | _XMM2 | _XMM3;
  96  const uint64_t XMM_SAE = 1ULL << 51;
  97  #ifdef XBYAK64
  98  const uint64_t XMM_KZ = 1ULL << 52;
  99  const uint64_t YMM_KZ = 1ULL << 53;
 100  const uint64_t ZMM_KZ = 1ULL << 54;
 101  #else
 102  const uint64_t XMM_KZ = 0;
 103  const uint64_t YMM_KZ = 0;
 104  const uint64_t ZMM_KZ = 0;
 105  #endif
 106  const uint64_t MEM_K = 1ULL << 55;
 107  const uint64_t M_1to2 = 1ULL << 56;
 108  const uint64_t M_1to4 = 1ULL << 57;
 109  const uint64_t M_1to8 = 1ULL << 58;
 110  const uint64_t M_1to16 = 1ULL << 59;
 111  const uint64_t XMM_ER = 1ULL << 60;
 112  const uint64_t M_xword = 1ULL << 61;
 113  const uint64_t M_yword = 1ULL << 62;
 114  const uint64_t MY_1to4 = 1ULL << 18;
 115  
 116  const uint64_t NOPARA = 1ULL << (bitEnd - 1);
 117  
 118  class Test {
 119  	Test(const Test&);
 120  	void operator=(const Test&);
 121  	const bool isXbyak_;
 122  	int funcNum_;
 123  	// check all op1, op2, op3
 124  	void put(const std::string& nm, uint64_t op1 = NOPARA, uint64_t op2 = NOPARA, uint64_t op3 = NOPARA, uint64_t op4 = NOPARA) const
 125  	{
 126  		for (int i = 0; i < bitEnd; i++) {
 127  			if ((op1 & (1ULL << i)) == 0) continue;
 128  			for (int j = 0; j < bitEnd; j++) {
 129  				if ((op2 & (1ULL << j)) == 0) continue;
 130  				for (int k = 0; k < bitEnd; k++) {
 131  					if ((op3 & (1ULL << k)) == 0) continue;
 132  					for (int s = 0; s < bitEnd; s++) {
 133  						if ((op4 & (1ULL << s)) == 0) continue;
 134  						printf("%s ", nm.c_str());
 135  						if (isXbyak_) printf("(");
 136  						if (!(op1 & NOPARA)) printf("%s", get(1ULL << i));
 137  						if (!(op2 & NOPARA)) printf(", %s", get(1ULL << j));
 138  						if (!(op3 & NOPARA)) printf(", %s", get(1ULL << k));
 139  						if (!(op4 & NOPARA)) printf(", %s", get(1ULL << s));
 140  						if (isXbyak_) printf("); dump();");
 141  						printf("\n");
 142  					}
 143  				}
 144  			}
 145  		}
 146  	}
 147  	void put(const char *nm, uint64_t op, const char *xbyak, const char *nasm) const
 148  	{
 149  		for (int i = 0; i < bitEnd; i++) {
 150  			if ((op & (1ULL << i)) == 0) continue;
 151  			printf("%s ", nm);
 152  			if (isXbyak_) printf("(");
 153  			if (!(op & NOPARA)) printf("%s", get(1ULL << i));
 154  			printf(", %s", isXbyak_ ? xbyak : nasm);
 155  			if (isXbyak_) printf("); dump();");
 156  			printf("\n");
 157  		}
 158  	}
 159  	void put(const char *nm, const char *xbyak, const char *nasm = 0, uint64_t op = NOPARA) const
 160  	{
 161  		if (nasm == 0) nasm = xbyak;
 162  		for (int i = 0; i < bitEnd; i++) {
 163  			if ((op & (1ULL << i)) == 0) continue;
 164  			printf("%s ", nm);
 165  			if (isXbyak_) printf("(");
 166  			printf("%s ", isXbyak_ ? xbyak : nasm);
 167  			if (!(op & NOPARA)) printf(", %s", get(1ULL << i));
 168  			if (isXbyak_) printf("); dump();");
 169  			printf("\n");
 170  		}
 171  	}
 172  	const char *get(uint64_t type) const
 173  	{
 174  		int idx = (rand() / 31) & 7;
 175  		switch (type) {
 176  		case _XMM:
 177  			{
 178  				static const char tbl[][6] = {
 179  					"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
 180  				};
 181  				return tbl[idx];
 182  			}
 183  		case _YMM:
 184  			{
 185  				static const char tbl[][6] = {
 186  					"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
 187  				};
 188  				return tbl[idx];
 189  			}
 190  		case _ZMM:
 191  			{
 192  				static const char tbl[][6] = {
 193  					"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7"
 194  				};
 195  				return tbl[idx];
 196  			}
 197  #ifdef XBYAK64
 198  		case _XMM2:
 199  			{
 200  				static const char tbl[][6] = {
 201  					"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
 202  				};
 203  				return tbl[idx];
 204  			}
 205  		case _XMM3:
 206  			{
 207  				static const char tbl[][6] = {
 208  					"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23"
 209  				};
 210  				return tbl[idx];
 211  			}
 212  		case _YMM2:
 213  			{
 214  				static const char tbl[][6] = {
 215  					"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
 216  				};
 217  				return tbl[idx];
 218  			}
 219  		case _YMM3:
 220  			{
 221  				static const char tbl[][6] = {
 222  					"ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
 223  				};
 224  				return tbl[idx];
 225  			}
 226  		case _ZMM2:
 227  			{
 228  				static const char tbl[][6] = {
 229  					"zmm8", "zmm9", "zmm10", "zmm11", "zmm28", "zmm29", "zmm30", "zmm31",
 230  				};
 231  				return tbl[idx];
 232  			}
 233  #endif
 234  		case _MEM:
 235  			return isXbyak_ ? "ptr[eax+ecx+64]" : "[eax+ecx+64]"; // QQQ
 236  //			return isXbyak_ ? "ptr[eax+ecx+6]" : "[eax+ecx+6]";
 237  		case _MEMe:
 238  			{
 239  				static int ccc = 1;
 240  #ifdef USE_YASM
 241  				ccc++;
 242  #endif
 243  				if (ccc & 1) {
 244  					return isXbyak_ ? "ptr[rdx+r15+0x12]" : "[rdx+r15+0x12]";
 245  				} else {
 246  					return isXbyak_ ? "ptr[rip - 0x13456+1-3]" : "[rip - 0x13456+1-3]";
 247  				}
 248  			}
 249  		case MEM8:
 250  			return "byte [eax+edx]";
 251  		case MEM16:
 252  			return "word [esi]";
 253  		case MEM32:
 254  			return "dword [eax+64]";
 255  		case MEM64:
 256  			return "qword [rax+64]";
 257  		case MEM_ONLY_DISP:
 258  			return isXbyak_ ? "ptr[(void*)0x123]" : "[0x123]";
 259  		case _REG16: // not ax
 260  			{
 261  				static const char Reg16Tbl[][4] = {
 262  					"ax", "cx", "dx", "bx", "sp", "bp", "si", "di"
 263  				};
 264  				return Reg16Tbl[(idx % 7) + 1];
 265  			}
 266  		case _REG8: // not al
 267  			{
 268  				static const char Reg8Tbl[][4] = {
 269  #ifdef XBYAK64 // QQQ
 270  					"al", "cl", "dl", "bl", "al", "cl", "dl", "bl"
 271  #else
 272  					"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"
 273  #endif
 274  				};
 275  				return Reg8Tbl[(idx % 7) + 1];
 276  			}
 277  		case _REG32: // not eax
 278  			{
 279  				static const char Reg32Tbl[][4] = {
 280  					"eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi"
 281  				};
 282  				return Reg32Tbl[(idx % 7) + 1];
 283  			}
 284  #ifdef XBYAK64
 285  		case _REG64: // not rax
 286  			{
 287  				static const char Reg64Tbl[][4] = {
 288  					"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi"
 289  				};
 290  				return Reg64Tbl[(idx % 7) + 1];
 291  			}
 292  		case _REG64_2:
 293  			{
 294  				static const char Reg64_2Tbl[][4] = {
 295  					"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
 296  				};
 297  				return Reg64_2Tbl[idx];
 298  			}
 299  		case REG32_2:
 300  			{
 301  				static const char Reg32eTbl[][5] = {
 302  					"r8d", "r9d", "r10d", "r11d", "r12d", "r13d", "r14d", "r15d"
 303  				};
 304  				return Reg32eTbl[idx];
 305  			}
 306  		case REG16_2:
 307  			{
 308  				static const char Reg16eTbl[][5] = {
 309  					"r8w", "r9w", "r10w", "r11w", "r12w", "r13w", "r14w", "r15w"
 310  				};
 311  				return Reg16eTbl[idx];
 312  			}
 313  		case REG8_2:
 314  			{
 315  				static const char Reg8_2Tbl[][5] = {
 316  					"r8b", "r9b", "r10b", "r11b", "r12b", "r13b", "r14b", "r15b"
 317  				};
 318  				return Reg8_2Tbl[idx];
 319  			}
 320  		case REG8_3:
 321  			{
 322  				static const char Reg8_3Tbl[][5] = {
 323  					"spl", "bpl", "sil", "dil", "spl", "bpl", "sil", "dil"
 324  				};
 325  				return Reg8_3Tbl[idx];
 326  			}
 327  #endif
 328  		case EAX:
 329  			return "eax";
 330  		case AX:
 331  			return "ax";
 332  		case AL:
 333  			return "al";
 334  		case K_K:
 335  			return isXbyak_ ? "k5 | k3" : "k5{k3}";
 336  		case IMM32:
 337  			return isXbyak_ ? "12345678" : "dword 12345678";
 338  		case IMM8:
 339  			return isXbyak_ ? "4" : "byte 4";
 340  		case IMM_1:
 341  			return "4";
 342  		case IMM_2:
 343  			return isXbyak_ ? "0xda" : "0xda";
 344  		case VM32X_32:
 345  			return isXbyak_ ? "ptr [ebp+64+xmm1*8]" : "[ebp+64+xmm1*8]";
 346  		case VM32X_64:
 347  			return isXbyak_ ? "ptr [rax+64+xmm13*2]" : "[rax+64+xmm13*2]";
 348  		case VM32Y_32:
 349  			return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
 350  		case VM32Y_64:
 351  			return isXbyak_ ? "ptr [64+ymm13*2+r13]" : "[64+ymm13*2+r13]";
 352  		case VM32X_K:
 353  			return isXbyak_ ? "ptr [64+xmm13*2+r13] | k6" : "[64+xmm13*2+r13]{k6}";
 354  		case VM32Y_K:
 355  			return isXbyak_ ? "ptr [64+ymm13*2+r13] | k6" : "[64+ymm13*2+r13]{k6}";
 356  		case VM32Z_K:
 357  			if (idx & 1) return isXbyak_ ? "ptr [64+zmm10*8+r9] | k6" : "[64+zmm10*8+r9]{k6}";
 358  			return isXbyak_ ? "ptr [64+zmm30*2+r13] | k6" : "[64+zmm30*2+r13]{k6}";
 359  		case VM32Z:
 360  			return isXbyak_ ? "ptr [64+zmm13*2+rcx]" : "[64+zmm13*2+rcx]";
 361  		case M_1to2: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to2}";
 362  		case M_1to4: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to4}";
 363  		case M_1to8: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to8}";
 364  		case M_1to16: return isXbyak_ ? "ptr_b [eax+32]" : "[eax+32]{1to16}";
 365  
 366  		case M_xword: return isXbyak_ ? "ptr [eax+32]" : "oword [eax+32]";
 367  		case M_yword: return isXbyak_ ? "yword [eax+32]" : "yword [eax+32]";
 368  		case MY_1to4: return isXbyak_ ? "yword_b [eax+32]" : "[eax+32]{1to4}";
 369  		case K:
 370  			{
 371  				static const char kTbl[][5] = {
 372  					"k1", "k2", "k3", "k4", "k5", "k6", "k7",
 373  				};
 374  				return kTbl[idx % 7];
 375  			}
 376  		case K2:
 377  			return isXbyak_ ? "k3 | k5" : "k3{k5}";
 378  #ifdef XBYAK64
 379  		case XMM_SAE:
 380  			return isXbyak_ ? "xmm25 | T_sae" : "xmm25, {sae}";
 381  		case YMM_SAE:
 382  			return isXbyak_ ? "ymm25 | T_sae" : "ymm25, {sae}";
 383  		case ZMM_SAE:
 384  			return isXbyak_ ? "zmm25 | T_sae" : "zmm25, {sae}";
 385  		case XMM_ER:
 386  			return isXbyak_ ? "xmm4 | T_rd_sae" : "xmm4, {rd-sae}";
 387  		case YMM_ER:
 388  			return isXbyak_ ? "ymm20 | T_rd_sae" : "ymm20, {rd-sae}";
 389  		case ZMM_ER:
 390  			return isXbyak_ ? "zmm20 | T_rd_sae" : "zmm20, {rd-sae}";
 391  		case XMM_KZ:
 392  			return isXbyak_ ? "xmm5 | k5" : "xmm5{k5}";
 393  		case YMM_KZ:
 394  			return isXbyak_ ? "ymm2 |k3|T_z" : "ymm2{k3}{z}";
 395  		case ZMM_KZ:
 396  			return isXbyak_ ? "zmm7|k1" : "zmm7{k1}";
 397  		case MEM_K:
 398  			return isXbyak_ ? "ptr [rax] | k1" : "[rax]{k1}";
 399  #else
 400  		case XMM_SAE:
 401  			return isXbyak_ ? "xmm5 | T_sae" : "xmm5, {sae}";
 402  		case YMM_SAE:
 403  			return isXbyak_ ? "ymm5 | T_sae" : "ymm5, {sae}";
 404  		case ZMM_SAE:
 405  			return isXbyak_ ? "zmm5 | T_sae" : "zmm5, {sae}";
 406  		case XMM_ER:
 407  			return isXbyak_ ? "xmm30 | T_rd_sae" : "xmm30, {rd-sae}";
 408  		case YMM_ER:
 409  			return isXbyak_ ? "ymm2 | T_rd_sae" : "ymm2, {rd-sae}";
 410  		case ZMM_ER:
 411  			return isXbyak_ ? "zmm2 | T_rd_sae" : "zmm2, {rd-sae}";
 412  		case MEM_K:
 413  			return isXbyak_ ? "ptr [eax] | k1" : "[eax]{k1}";
 414  #endif
 415  		case XMM_K:
 416  			return isXbyak_ ? "xmm5 | k7" : "xmm5{k7}";
 417  		case YMM_K:
 418  			return isXbyak_ ? "ymm5 | k4" : "ymm5{k4}";
 419  		case ZMM_K:
 420  			return isXbyak_ ? "zmm5 | k3" : "zmm5{k3}";
 421  		}
 422  		return 0;
 423  	}
 424  public:
 425  	Test(bool isXbyak)
 426  		: isXbyak_(isXbyak)
 427  		, funcNum_(1)
 428  	{
 429  		if (!isXbyak_) return;
 430  		printf("%s",
 431  			"    void gen0()\n"
 432  			"    {\n");
 433  	}
 434  	/*
 435  		gcc and vc give up to compile this source,
 436  		so I split functions.
 437  	*/
 438  	void separateFunc()
 439  	{
 440  		if (!isXbyak_) return;
 441  		printf(
 442  			"    }\n"
 443  			"    void gen%d()\n"
 444  			"    {\n", funcNum_++);
 445  	}
 446  	~Test()
 447  	{
 448  		if (!isXbyak_) return;
 449  		printf("%s",
 450  			"    }\n"
 451  			"    void gen()\n"
 452  			"    {\n");
 453  		for (int i = 0; i < funcNum_; i++) {
 454  			printf(
 455  			"        gen%d();\n", i);
 456  		}
 457  		printf(
 458  			"    }\n");
 459  	}
 460  	void put()
 461  	{
 462  		putAVX512();
 463  	}
 464  	void putOpmask()
 465  	{
 466  		{
 467  			const char *tbl[] = {
 468  				"kadd",
 469  				"kand",
 470  				"kandn",
 471  				"kor",
 472  				"kxnor",
 473  				"kxor",
 474  			};
 475  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 476  				std::string name = tbl[i];
 477  				put(name + "b", K, K, K);
 478  				put(name + "w", K, K, K);
 479  				put(name + "q", K, K, K);
 480  				put(name + "d", K, K, K);
 481  			}
 482  			put("kunpckbw", K, K, K);
 483  			put("kunpckwd", K, K, K);
 484  			put("kunpckdq", K, K, K);
 485  		}
 486  		{
 487  			const char *tbl[] = {
 488  				"knot",
 489  				"kortest",
 490  				"ktest",
 491  			};
 492  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 493  				std::string name = tbl[i];
 494  				put(name + "b", K, K);
 495  				put(name + "w", K, K);
 496  				put(name + "q", K, K);
 497  				put(name + "d", K, K);
 498  			}
 499  		}
 500  		{
 501  			const char *tbl[] = {
 502  				"kshiftl",
 503  				"kshiftr",
 504  			};
 505  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 506  				std::string name = tbl[i];
 507  				put(name + "b", K, K, IMM8);
 508  				put(name + "w", K, K, IMM8);
 509  				put(name + "q", K, K, IMM8);
 510  				put(name + "d", K, K, IMM8);
 511  			}
 512  		}
 513  		put("kmovw", K, K | MEM | REG32);
 514  		put("kmovq", K, K | MEM);
 515  		put("kmovb", K, K | MEM | REG32);
 516  		put("kmovd", K, K | MEM | REG32);
 517  
 518  		put("kmovw", MEM | REG32, K);
 519  		put("kmovq", MEM, K);
 520  		put("kmovb", MEM | REG32, K);
 521  		put("kmovd", MEM | REG32, K);
 522  #ifdef XBYAK64
 523  		put("kmovq", K, REG64);
 524  		put("kmovq", REG64, K);
 525  #endif
 526  	}
 527  	void put_vaddpd(const char *r1, const char *r2, const char *r3, int kIdx = 0, bool z = false, int sae = 0)
 528  	{
 529  		std::string modifier;
 530  		char pk[16] = "";
 531  		const char *pz = "";
 532  		const char *saeTblXbyak[] = { "", "|T_rn_sae", "|T_rd_sae", "|T_ru_sae", "|T_rz_sae" };
 533  		const char *saeTblNASM[] = { "", ",{rn-sae}", ",{rd-sae}", ",{ru-sae}", ",{rz-sae}" };
 534  		if (isXbyak_) {
 535  			if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "|k%d", kIdx);
 536  			if (z) pz = "|T_z";
 537  			printf("vaddpd(%s%s%s, %s, %s%s); dump();\n", r1, pk, pz, r2, r3, saeTblXbyak[sae]);
 538  		} else {
 539  			if (kIdx) CYBOZU_SNPRINTF(pk, sizeof(pk), "{k%d}", kIdx);
 540  			if (z && kIdx) pz = "{z}";
 541  			printf("vaddpd %s%s%s, %s, %s%s\n", r1, pk, pz, r2, r3, saeTblNASM[sae]);
 542  		}
 543  	}
 544  	void putCombi()
 545  	{
 546  		const char *xTbl[] = {
 547  			"xmm2",
 548  #ifdef XBYAK64
 549  			"xmm8", "xmm31"
 550  #else
 551  			"xmm5", "xmm6"
 552  #endif
 553  		};
 554  		const char *yTbl[] = {
 555  			"ymm0",
 556  #ifdef XBYAK64
 557  			"ymm15", "ymm31"
 558  #else
 559  			"ymm4", "ymm2"
 560  #endif
 561  		};
 562  		const char *zTbl[] = {
 563  			"zmm1",
 564  #ifdef XBYAK64
 565  			"zmm9", "zmm30"
 566  #else
 567  			"zmm3", "zmm7"
 568  #endif
 569  		};
 570  		const size_t N = NUM_OF_ARRAY(zTbl);
 571  		for (size_t i = 0; i < N; i++) {
 572  			for (size_t j = 0; j < N; j++) {
 573  				separateFunc();
 574  				for (size_t k = 0; k < N; k++) {
 575  #ifdef XBYAK64
 576  					for (int kIdx = 0; kIdx < 8; kIdx++) {
 577  						put_vaddpd(xTbl[i], xTbl[j], xTbl[k], kIdx);
 578  						put_vaddpd(yTbl[i], yTbl[j], yTbl[k], kIdx);
 579  						for (int z = 0; z < 2; z++) {
 580  							for (int sae = 0; sae < 5; sae++) {
 581  								put_vaddpd(zTbl[i], zTbl[j], zTbl[k], kIdx, z == 1, sae);
 582  							}
 583  						}
 584  					}
 585  #else
 586  					put_vaddpd(xTbl[i], xTbl[j], xTbl[k]);
 587  					put_vaddpd(yTbl[i], yTbl[j], yTbl[k]);
 588  					for (int sae = 0; sae < 5; sae++) {
 589  						put_vaddpd(zTbl[i], zTbl[j], zTbl[k], sae);
 590  					}
 591  #endif
 592  				}
 593  			}
 594  		}
 595  		put("vaddpd", XMM, XMM, _MEM);
 596  		put("vaddpd", YMM, YMM, _MEM);
 597  		put("vaddpd", ZMM, ZMM, _MEM);
 598  	}
 599  	void putCmpK()
 600  	{
 601  		{
 602  			const struct Tbl {
 603  				const char *name;
 604  				bool supportYMM;
 605  			} tbl[] = {
 606  				{ "vcmppd", true },
 607  				{ "vcmpps", true },
 608  				{ "vcmpsd", false },
 609  				{ "vcmpss", false },
 610  			};
 611  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 612  				const Tbl *p = &tbl[i];
 613  				put(p->name, K, XMM, _XMM | MEM, IMM8);
 614  				if (!p->supportYMM) continue;
 615  				put(p->name, K, _YMM, _YMM | MEM, IMM8);
 616  				put(p->name, K, _ZMM, _ZMM | MEM, IMM8);
 617  			}
 618  			put("vcmppd", K, XMM, M_1to2, IMM8);
 619  			put("vcmppd", K, YMM, M_1to4, IMM8);
 620  			put("vcmppd", K, ZMM, M_1to8, IMM8);
 621  
 622  			put("vcmpps", K, XMM, M_1to4, IMM8);
 623  			put("vcmpps", K, YMM, M_1to8, IMM8);
 624  			put("vcmpps", K, ZMM, M_1to16, IMM8);
 625  		}
 626  		put("vcmppd", K2, ZMM, ZMM_SAE, IMM);
 627  #ifdef XBYAK64
 628  		{
 629  			const struct Tbl {
 630  				const char *name;
 631  			} tbl[] = {
 632  				{ "vcomisd" },
 633  				{ "vcomiss" },
 634  				{ "vucomisd" },
 635  				{ "vucomiss" },
 636  			};
 637  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 638  				const Tbl *p = &tbl[i];
 639  				put(p->name, XMM, XMM_SAE | XMM | MEM);
 640  			}
 641  		}
 642  		put("vcomiss", XMM, _XMM3 | MEM);
 643  		put("vcomiss", XMM, XMM_SAE);
 644  #endif
 645  	}
 646  	void putBroadcastSub(int idx, int disp)
 647  	{
 648  #ifdef XBYAK64
 649  		const char *a = "rax";
 650  #else
 651  		const char *a = "eax";
 652  #endif
 653  		if (isXbyak_) {
 654  			printf("vaddpd(zmm%d, zmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
 655  			printf("vaddpd(ymm%d, ymm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
 656  			printf("vaddpd(xmm%d, xmm1, ptr_b[%s+%d]);dump();\n", idx, a, disp);
 657  		} else {
 658  			printf("vaddpd zmm%d, zmm1, [%s+%d]{1to8}\n", idx, a, disp);
 659  			printf("vaddpd ymm%d, ymm1, [%s+%d]{1to4}\n", idx, a, disp);
 660  			printf("vaddpd xmm%d, xmm1, [%s+%d]{1to2}\n", idx, a, disp);
 661  		}
 662  	}
 663  	void putBroadcast()
 664  	{
 665  		for (int i = 0; i < 9; i++) {
 666  			putBroadcastSub(0, i);
 667  #ifdef XBYAK64
 668  			putBroadcastSub(10, i);
 669  			putBroadcastSub(20, i);
 670  #endif
 671  		}
 672  		put("vpbroadcastb", XMM_KZ | ZMM_KZ, REG8 | _MEM);
 673  		put("vpbroadcastw", XMM_KZ | ZMM_KZ, REG16 | _MEM);
 674  		put("vpbroadcastd", XMM_KZ | ZMM_KZ, REG32 | _MEM);
 675  #ifdef XBYAK64
 676  		put("vpbroadcastq", XMM_KZ | ZMM_KZ, REG64 | _MEM);
 677  #endif
 678  		{
 679  			const char *tbl[] = {
 680  				"vpbroadcastb",
 681  				"vpbroadcastw",
 682  				"vpbroadcastd",
 683  				"vpbroadcastq",
 684  			};
 685  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 686  				put(tbl[i], XMM_KZ | ZMM_KZ, XMM | _MEM);
 687  			}
 688  		}
 689  		put("vbroadcasti32x2", XMM_KZ | YMM_KZ | ZMM_KZ, XMM | _MEM);
 690  		put("vbroadcasti32x4", YMM_KZ | ZMM_KZ, _MEM);
 691  		put("vbroadcasti64x2", YMM_KZ | ZMM_KZ, _MEM);
 692  		put("vbroadcasti32x8", ZMM_KZ, _MEM);
 693  		put("vbroadcasti64x4", ZMM_KZ, _MEM);
 694  	}
 695  	void putMisc1()
 696  	{
 697  		put("vmaskmovps", _XMM, _XMM, MEM);
 698  		put("vmaskmovps", YMM, YMM, MEM);
 699  
 700  		put("vmaskmovpd", YMM, YMM, MEM);
 701  		put("vmaskmovpd", _XMM, _XMM, MEM);
 702  
 703  		put("vmaskmovps", MEM, _XMM, _XMM);
 704  		put("vmaskmovpd", MEM, _XMM, _XMM);
 705  
 706  		put("vbroadcastf128", YMM, MEM);
 707  		put("vbroadcasti128", YMM, MEM);
 708  		put("vbroadcastsd", YMM|_YMM3, XMM|MEM);
 709  		put("vbroadcastsd", ZMM, XMM|MEM);
 710  		{
 711  			const char *tbl[] = {
 712  				"vbroadcastss",
 713  				"vpbroadcastb",
 714  				"vpbroadcastw",
 715  				"vpbroadcastd",
 716  				"vpbroadcastq",
 717  			};
 718  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 719  				put(tbl[i], XMM | YMM | ZMM, XMM|MEM);
 720  			}
 721  		}
 722  
 723  		put("vinsertf128", YMM, YMM, _XMM | _XMM2 | MEM, IMM8);
 724  		put("vinserti128", YMM, YMM, _XMM | _XMM2  | MEM, IMM8);
 725  		put("vperm2f128", YMM, YMM, YMM | MEM, IMM8);
 726  		put("vperm2i128", YMM, YMM, YMM | MEM, IMM8);
 727  
 728  		{
 729  			const char *tbl[] = {
 730  				"vpmaskmovd", "vpmaskmovq"
 731  			};
 732  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 733  				const char *name = tbl[i];
 734  				put(name, _XMM, _XMM, MEM);
 735  				put(name, YMM, YMM, MEM);
 736  				put(name, MEM, _XMM, _XMM);
 737  				put(name, MEM, YMM, YMM);
 738  			}
 739  		}
 740  		{
 741  			const char *tbl[] = {
 742  				"vpermd", "vpermps",
 743  			};
 744  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 745  				const char *name = tbl[i];
 746  				put(name, YMM, YMM, YMM | MEM);
 747  			}
 748  		}
 749  		{
 750  			const char *tbl[] = {
 751  				"vpermq", "vpermpd",
 752  			};
 753  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 754  				const char *name = tbl[i];
 755  				put(name, YMM, YMM | MEM, IMM8);
 756  			}
 757  		}
 758  		put("vpextrw", REG32e | MEM, XMM, IMM); // nasm is ok, yasm generate redundant code
 759  	}
 760  	void putAVX512_M_X()
 761  	{
 762  		const char *tbl[] = {
 763  			"vmovapd",
 764  			"vmovaps",
 765  			"vmovupd",
 766  			"vmovups",
 767  		};
 768  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 769  			const char *name = tbl[i];
 770  			put(name, MEM, ZMM);
 771  			put(name, ZMM, MEM);
 772  #ifdef XBYAK64
 773  			put(name, MEM, XMM);
 774  			put(name, XMM, MEM);
 775  #endif
 776  		}
 777  	}
	/*
		Emit the vmov* family, valignd/valignq and vmovh/l{pd,ps}.
		Everything here is generated only when XBYAK64 is defined.
	*/
	void put_vmov()
	{
#ifdef XBYAK64
		put("vmovd", XMM, MEM|REG32);
		put("vmovd", MEM|REG32, XMM);
		put("vmovq", XMM, MEM|REG64|XMM);
		put("vmovq", MEM|REG64|XMM, XMM);
		{
			static const char *const hlTbl[] = { "vmovhlps", "vmovlhps" };
			for (size_t n = 0; n < NUM_OF_ARRAY(hlTbl); n++) {
				put(hlTbl[n], XMM, _XMM3, _XMM3);
			}
		}
		put("vmovntdqa", XMM|_YMM3|ZMM, MEM);
		{
			// non-temporal stores
			static const char *const ntTbl[] = { "vmovntdq", "vmovntpd", "vmovntps" };
			for (size_t n = 0; n < NUM_OF_ARRAY(ntTbl); n++) {
				put(ntTbl[n], MEM, XMM | _YMM3 | ZMM);
			}
		}
		{
			// scalar moves: register merge form, load, masked store
			static const char *const scalarTbl[] = { "vmovsd", "vmovss" };
			for (size_t n = 0; n < NUM_OF_ARRAY(scalarTbl); n++) {
				const char *mnemonic = scalarTbl[n];
				put(mnemonic, XMM_KZ, XMM, _XMM3);
				put(mnemonic, XMM_KZ, MEM);
				put(mnemonic, MEM_K, XMM);
			}
		}

		put("vmovshdup", _ZMM, _ZMM);
		put("vmovsldup", _ZMM, _ZMM);

		{
			static const char *const alignTbl[] = {
				"valignd",
				"valignq",
			};
			for (size_t n = 0; n < NUM_OF_ARRAY(alignTbl); n++) {
				const char *mnemonic = alignTbl[n];
				put(mnemonic, XMM_KZ, XMM, _XMM | MEM, IMM);
				put(mnemonic, _YMM3, _YMM3, _YMM3 | _MEM, IMM);
				put(mnemonic, _ZMM, _ZMM, _ZMM | _MEM, IMM);
			}
		}
		{
			static const char *const halfTbl[] = {
				"vmovhpd",
				"vmovhps",
				"vmovlpd",
				"vmovlps",
			};
			for (size_t n = 0; n < NUM_OF_ARRAY(halfTbl); n++) {
				const char *mnemonic = halfTbl[n];
				put(mnemonic, XMM, _XMM3, MEM);
				put(mnemonic, MEM, _XMM3);
			}
		}
#endif
	}
 829  	void put512_X_XM()
 830  	{
 831  		const struct Tbl {
 832  			const char *name;
 833  			bool M_X;
 834  		} tbl[] = {
 835  			{ "vmovddup", false },
 836  			{ "vmovdqa32", true },
 837  			{ "vmovdqa64", true },
 838  			{ "vmovdqu8", true },
 839  			{ "vmovdqu16", true },
 840  			{ "vmovdqu32", true },
 841  			{ "vmovdqu64", true },
 842  			{ "vpabsb", false },
 843  			{ "vpabsw", false },
 844  			{ "vpabsd", false },
 845  			{ "vpabsq", false },
 846  		};
 847  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 848  			const Tbl& p = tbl[i];
 849  			put(p.name, XMM|XMM_KZ, _XMM|MEM);
 850  			put(p.name, _YMM|YMM_KZ, _YMM|MEM);
 851  			put(p.name, _ZMM|ZMM_KZ, _ZMM|MEM);
 852  			if (!p.M_X) continue;
 853  			put(p.name, MEM|MEM_K, XMM);
 854  			put(p.name, MEM|MEM_K, _YMM);
 855  			put(p.name, MEM|MEM_K, _ZMM);
 856  		}
 857  		put("vsqrtpd", XMM_KZ, M_1to2 | _MEM);
 858  		put("vsqrtpd", YMM_KZ, M_1to4 | _MEM);
 859  		put("vsqrtpd", ZMM_KZ, M_1to8 | _MEM);
 860  		put("vsqrtpd", ZMM_KZ, ZMM_ER);
 861  
 862  		put("vsqrtps", XMM_KZ, M_1to4 | _MEM);
 863  		put("vsqrtps", YMM_KZ, M_1to8 | _MEM);
 864  		put("vsqrtps", ZMM_KZ, M_1to16 | _MEM);
 865  		put("vsqrtps", ZMM_KZ, ZMM_ER);
 866  
 867  		put("vpabsd", ZMM_KZ, M_1to16 | _MEM);
 868  		put("vpabsq", ZMM_KZ, M_1to8 | _MEM);
 869  
 870  		put("vbroadcastf32x2", YMM_KZ | ZMM_KZ, XMM | _MEM);
 871  		put("vbroadcastf32x4", YMM_KZ | ZMM_KZ, _MEM);
 872  
 873  		put("vbroadcastf64x2", YMM_KZ | ZMM_KZ, _MEM);
 874  		put("vbroadcastf64x4", ZMM_KZ, _MEM);
 875  		put("vbroadcastf32x8", ZMM_KZ, _MEM);
 876  	}
 877  	void put512_X_X_XM()
 878  	{
 879  		const struct Tbl {
 880  			const char *name;
 881  			uint64_t mem;
 882  		} tbl[] = {
 883  			{ "vsqrtsd", MEM },
 884  			{ "vsqrtss", MEM },
 885  			{ "vunpckhpd", M_1to2 },
 886  			{ "vunpckhps", M_1to4 },
 887  			{ "vunpcklpd", M_1to2 },
 888  			{ "vunpcklps", M_1to4 },
 889  		};
 890  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 891  			const Tbl& p = tbl[i];
 892  			put(p.name, XMM_KZ, XMM, _XMM|p.mem);
 893  		}
 894  	}
 895  	void put512_X3()
 896  	{
 897  #ifdef XBYAK64
 898  		const struct Tbl {
 899  			const char *name;
 900  			uint64_t x1;
 901  			uint64_t x2;
 902  			uint64_t xm;
 903  		} tbl[] = {
 904  			{ "vpacksswb", XMM_KZ, XMM, _XMM | _MEM },
 905  			{ "vpacksswb", YMM_KZ, _YMM, _YMM | _MEM },
 906  			{ "vpacksswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 907  
 908  			{ "vpackssdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
 909  			{ "vpackssdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
 910  			{ "vpackssdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
 911  
 912  			{ "vpackusdw", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
 913  			{ "vpackusdw", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
 914  			{ "vpackusdw", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
 915  
 916  			{ "vpackuswb", XMM_KZ, XMM, _XMM | _MEM },
 917  			{ "vpackuswb", YMM_KZ, _YMM, _YMM | _MEM },
 918  			{ "vpackuswb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 919  
 920  			{ "vpaddb", XMM_KZ, XMM, _XMM | _MEM },
 921  			{ "vpaddw", XMM_KZ, _XMM, _XMM | _MEM },
 922  			{ "vpaddd", XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
 923  			{ "vpaddq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
 924  
 925  			{ "vpaddsb", XMM_KZ, XMM, _XMM | _MEM },
 926  			{ "vpaddsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 927  
 928  			{ "vpaddsw", XMM_KZ, XMM, _XMM | _MEM },
 929  			{ "vpaddsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 930  
 931  			{ "vpaddusb", XMM_KZ, XMM, _XMM | MEM },
 932  			{ "vpaddusb", ZMM_KZ, _ZMM, _ZMM | MEM },
 933  
 934  			{ "vpaddusw", XMM_KZ, XMM, _XMM | MEM },
 935  			{ "vpaddusw", ZMM_KZ, _ZMM, _ZMM | MEM },
 936  
 937  			{ "vpsubb", XMM_KZ, XMM, _XMM | _MEM },
 938  			{ "vpsubw", XMM_KZ, XMM, _XMM | _MEM },
 939  			{ "vpsubd", XMM_KZ, XMM, _XMM | M_1to4 | _MEM },
 940  			{ "vpsubq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
 941  
 942  			{ "vpsubsb", XMM_KZ, XMM, _XMM | _MEM },
 943  			{ "vpsubsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 944  
 945  			{ "vpsubsw", XMM_KZ, XMM, _XMM | _MEM },
 946  			{ "vpsubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 947  
 948  			{ "vpsubusb", XMM_KZ, XMM, _XMM | MEM },
 949  			{ "vpsubusb", ZMM_KZ, _ZMM, _ZMM | MEM },
 950  
 951  			{ "vpsubusw", XMM_KZ, XMM, _XMM | MEM },
 952  			{ "vpsubusw", ZMM_KZ, _ZMM, _ZMM | MEM },
 953  
 954  			{ "vpandd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
 955  			{ "vpandq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },
 956  
 957  			{ "vpandnd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
 958  			{ "vpandnq", ZMM_KZ, _ZMM, _ZMM | M_1to8 | _MEM },
 959  
 960  			{ "vpavgb", ZMM_KZ, _ZMM, _ZMM },
 961  			{ "vpavgw", ZMM_KZ, _ZMM, _ZMM },
 962  
 963  			{ "vpcmpeqb", K2, _ZMM, _ZMM | _MEM },
 964  			{ "vpcmpeqw", K2, _ZMM, _ZMM | _MEM },
 965  			{ "vpcmpeqd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
 966  			{ "vpcmpeqq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
 967  
 968  			{ "vpcmpgtb", K2, _ZMM, _ZMM | _MEM },
 969  			{ "vpcmpgtw", K2, _ZMM, _ZMM | _MEM },
 970  			{ "vpcmpgtd", K2, _ZMM, _ZMM | M_1to16 | _MEM },
 971  			{ "vpcmpgtq", K2, _ZMM, _ZMM | M_1to8 | _MEM },
 972  
 973  			{ "vpmaddubsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 974  			{ "vpmaddwd", ZMM_KZ, _ZMM, _ZMM | _MEM },
 975  
 976  			{ "vpmaxsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 977  			{ "vpmaxsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 978  			{ "vpmaxsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
 979  			{ "vpmaxsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
 980  
 981  			{ "vpmaxub", ZMM_KZ, _ZMM, _ZMM | _MEM },
 982  			{ "vpmaxuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 983  			{ "vpmaxud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
 984  			{ "vpmaxuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
 985  
 986  			{ "vpminsb", ZMM_KZ, _ZMM, _ZMM | _MEM },
 987  			{ "vpminsw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 988  			{ "vpminsd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
 989  			{ "vpminsq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
 990  
 991  			{ "vpminub", ZMM_KZ, _ZMM, _ZMM | _MEM },
 992  			{ "vpminuw", ZMM_KZ, _ZMM, _ZMM | _MEM },
 993  			{ "vpminud", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 },
 994  			{ "vpminuq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 },
 995  
 996  			{ "vpslldq", XMM, _XMM3 | _MEM, IMM8 },
 997  			{ "vpslldq", _YMM3, _YMM3 | _MEM, IMM8 },
 998  			{ "vpslldq", _ZMM, _ZMM | _MEM, IMM8 },
 999  
1000  			{ "vpsrldq", XMM, _XMM3 | _MEM, IMM8 },
1001  			{ "vpsrldq", _YMM3, _YMM3 | _MEM, IMM8 },
1002  			{ "vpsrldq", _ZMM, _ZMM | _MEM, IMM8 },
1003  
1004  			{ "vpsraw", XMM_KZ, XMM | _MEM, IMM8 },
1005  			{ "vpsraw", ZMM_KZ, _ZMM | _MEM, IMM8 },
1006  
1007  			{ "vpsrad", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
1008  			{ "vpsrad", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
1009  
1010  			{ "vpsraq", XMM, XMM, IMM8 },
1011  			{ "vpsraq", XMM_KZ, XMM | M_1to2 | _MEM, IMM8 },
1012  			{ "vpsraq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1013  
1014  			{ "vpsllw", XMM, _XMM3 | _MEM, IMM8 },
1015  			{ "vpslld", XMM, _XMM3 | _MEM | M_1to4, IMM8 },
1016  			{ "vpsllq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
1017  
1018  			{ "vpsrlw", XMM_KZ, XMM | _MEM, IMM8 },
1019  			{ "vpsrlw", ZMM_KZ, _ZMM | _MEM, IMM8 },
1020  
1021  			{ "vpsrld", XMM_KZ, XMM | M_1to4 | _MEM, IMM8 },
1022  			{ "vpsrld", ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
1023  
1024  			{ "vpsrlq", XMM, _XMM3 | _MEM | M_1to2, IMM8 },
1025  			{ "vpsrlq", _ZMM, _ZMM | _MEM | M_1to8, IMM8 },
1026  
1027  			{ "vpsravw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1028  			{ "vpsravw", _ZMM, _ZMM, _MEM },
1029  
1030  			{ "vpsravd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1031  			{ "vpsravd", _ZMM, _ZMM, M_1to16 | _MEM },
1032  
1033  			{ "vpsravq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1034  			{ "vpsravq", _ZMM, _ZMM, M_1to8 | _MEM },
1035  
1036  			{ "vpsllvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1037  			{ "vpsllvw", _ZMM, _ZMM, _MEM },
1038  
1039  			{ "vpsllvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1040  			{ "vpsllvd", _ZMM, _ZMM, M_1to16 | _MEM },
1041  
1042  			{ "vpsllvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1043  			{ "vpsllvq", _ZMM, _ZMM, M_1to8 | _MEM },
1044  
1045  			{ "vpsrlvw", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1046  			{ "vpsrlvw", _ZMM, _ZMM, _MEM },
1047  
1048  			{ "vpsrlvd", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1049  			{ "vpsrlvd", _ZMM, _ZMM, M_1to16 | _MEM },
1050  
1051  			{ "vpsrlvq", XMM_KZ | XMM, _XMM, _XMM | _MEM },
1052  			{ "vpsrlvq", _ZMM, _ZMM, M_1to8 | _MEM },
1053  
1054  			{ "vpshufb", XMM | XMM_KZ, _XMM, _XMM | _MEM },
1055  			{ "vpshufb", ZMM_KZ, _ZMM, _MEM },
1056  
1057  			{ "vpshufhw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
1058  			{ "vpshufhw", ZMM_KZ, _MEM, IMM8 },
1059  
1060  			{ "vpshuflw", XMM | XMM_KZ, _XMM | _MEM, IMM8 },
1061  			{ "vpshuflw", ZMM_KZ, _MEM, IMM8 },
1062  
1063  			{ "vpshufd", XMM | XMM_KZ, _XMM | M_1to4 | _MEM, IMM8 },
1064  			{ "vpshufd", _ZMM | ZMM_KZ, _ZMM | M_1to16 | _MEM, IMM8 },
1065  
1066  			{ "vpord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
1067  			{ "vpord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
1068  
1069  			{ "vporq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1070  			{ "vporq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
1071  
1072  			{ "vpxord", XMM | XMM_KZ, _XMM, _XMM | M_1to4 | _MEM },
1073  			{ "vpxord", _ZMM | ZMM_KZ, _ZMM, M_1to16 | _MEM },
1074  
1075  			{ "vpxorq", XMM | XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1076  			{ "vpxorq", _ZMM | ZMM_KZ, _ZMM, M_1to8 | _MEM },
1077  
1078  			{ "vpsadbw", XMM, _XMM, _XMM | _MEM },
1079  			{ "vpsadbw", _ZMM, _ZMM, _MEM },
1080  
1081  			{ "vpmuldq", XMM, _XMM, _XMM | M_1to2 | _MEM },
1082  			{ "vpmuldq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1083  
1084  			{ "vpmulhrsw", XMM, _XMM, _XMM | _MEM },
1085  			{ "vpmulhrsw", ZMM_KZ, _ZMM, _MEM },
1086  
1087  			{ "vpmulhuw", XMM, _XMM, _XMM | _MEM },
1088  			{ "vpmulhuw", ZMM_KZ, _ZMM, _MEM },
1089  
1090  			{ "vpmulhw", XMM, _XMM, _XMM | _MEM },
1091  			{ "vpmulhw", ZMM_KZ, _ZMM, _MEM },
1092  
1093  			{ "vpmullw", XMM, _XMM, _XMM | _MEM },
1094  			{ "vpmullw", ZMM_KZ, _ZMM, _MEM },
1095  
1096  			{ "vpmulld", XMM, _XMM, M_1to4 | _MEM },
1097  			{ "vpmulld", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1098  
1099  			{ "vpmullq", XMM, _XMM, M_1to2 | _MEM },
1100  			{ "vpmullq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1101  
1102  			{ "vpmuludq", XMM, _XMM, M_1to2 | _MEM },
1103  			{ "vpmuludq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1104  
1105  			{ "vpunpckhbw", XMM, _XMM, _XMM | _MEM },
1106  			{ "vpunpckhbw", _ZMM, _ZMM, _MEM },
1107  
1108  			{ "vpunpckhwd", XMM, _XMM, _XMM | _MEM },
1109  			{ "vpunpckhwd", _ZMM, _ZMM, _MEM },
1110  
1111  			{ "vpunpckhdq", XMM, _XMM, M_1to4 | _MEM },
1112  			{ "vpunpckhdq", _ZMM, _ZMM, M_1to16 | _MEM },
1113  
1114  			{ "vpunpckhqdq", XMM, _XMM, M_1to2 | _MEM },
1115  			{ "vpunpckhqdq", _ZMM, _ZMM, M_1to8 | _MEM },
1116  
1117  			{ "vpunpcklbw", XMM, _XMM, _XMM | _MEM },
1118  			{ "vpunpcklbw", _ZMM, _ZMM, _MEM },
1119  
1120  			{ "vpunpcklwd", XMM, _XMM, _XMM | _MEM },
1121  			{ "vpunpcklwd", _ZMM, _ZMM, _MEM },
1122  
1123  			{ "vpunpckldq", XMM, _XMM, M_1to4 | _MEM },
1124  			{ "vpunpckldq", _ZMM, _ZMM, M_1to16 | _MEM },
1125  
1126  			{ "vpunpcklqdq", XMM, _XMM, M_1to2 | _MEM },
1127  			{ "vpunpcklqdq", _ZMM, _ZMM, M_1to8 | _MEM },
1128  
1129  			{ "vextractf32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1130  			{ "vextractf64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1131  			{ "vextractf32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1132  			{ "vextractf64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1133  
1134  			{ "vextracti32x4", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1135  			{ "vextracti64x2", _XMM | XMM_KZ | _MEM, _YMM | _ZMM, IMM8 },
1136  			{ "vextracti32x8", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1137  			{ "vextracti64x4", _YMM | YMM_KZ | _MEM, _ZMM, IMM8 },
1138  
1139  			{ "vextractps", REG32 | _MEM, XMM, IMM8 },
1140  
1141  			{ "vpermb", XMM_KZ, _XMM, _XMM | _MEM },
1142  			{ "vpermb", ZMM_KZ, _ZMM, _ZMM | _MEM },
1143  
1144  			{ "vpermw", XMM_KZ, _XMM, _XMM | _MEM },
1145  			{ "vpermw", ZMM_KZ, _ZMM, _ZMM | _MEM },
1146  
1147  			{ "vpermd", YMM_KZ, _YMM, _YMM | M_1to8 | _MEM },
1148  			{ "vpermd", ZMM_KZ, _ZMM, _ZMM | M_1to16 | _MEM },
1149  
1150  			{ "vpermilpd", XMM_KZ, _XMM, _XMM | M_1to2 | _MEM },
1151  			{ "vpermilpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1152  			{ "vpermilpd", XMM_KZ, M_1to2 | _MEM, IMM8 },
1153  			{ "vpermilpd", ZMM_KZ, M_1to8 | _MEM, IMM8 },
1154  
1155  			{ "vpermilps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4 },
1156  			{ "vpermilps", ZMM_KZ, _ZMM, _MEM | M_1to16 },
1157  			{ "vpermilps", XMM_KZ, _MEM | M_1to4 | _MEM, IMM8 },
1158  			{ "vpermilps", ZMM_KZ, _MEM | M_1to16 | _MEM, IMM8 },
1159  
1160  			{ "vpermpd", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
1161  			{ "vpermpd", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1162  			{ "vpermpd", YMM_KZ, _YMM, M_1to4 | _MEM },
1163  			{ "vpermpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1164  
1165  			{ "vpermps", YMM_KZ, _YMM, M_1to8 | _MEM },
1166  			{ "vpermps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1167  
1168  			{ "vpermq", YMM_KZ, _YMM | M_1to4 | _MEM, IMM8 },
1169  			{ "vpermq", ZMM_KZ, _ZMM | M_1to8 | _MEM, IMM8 },
1170  			{ "vpermq", YMM_KZ, _YMM, M_1to4 | _MEM },
1171  			{ "vpermq", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1172  		};
1173  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1174  			const Tbl& p = tbl[i];
1175  			put(p.name, p.x1, p.x2, p.xm);
1176  		}
1177  #endif
1178  	}
1179  	void put512_X3_I()
1180  	{
1181  		const struct Tbl {
1182  			const char *name;
1183  			uint64_t x1;
1184  			uint64_t x2;
1185  			uint64_t xm;
1186  		} tbl[] = {
1187  #ifdef XBYAK64
1188  			{ "vinsertps", XMM, _XMM, _XMM3 | _MEM },
1189  
1190  			{ "vshufpd", XMM_KZ, _XMM, M_1to2 | _MEM },
1191  			{ "vshufpd", ZMM_KZ, _ZMM, M_1to8 | _MEM },
1192  
1193  			{ "vshufps", XMM_KZ, _XMM, M_1to4 | _MEM },
1194  			{ "vshufps", ZMM_KZ, _ZMM, M_1to16 | _MEM },
1195  
1196  			{ "vinsertf32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1197  			{ "vinsertf32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1198  
1199  			{ "vinsertf64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1200  			{ "vinsertf64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1201  
1202  			{ "vinsertf32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1203  			{ "vinsertf64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1204  
1205  			{ "vinserti32x4", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1206  			{ "vinserti32x4", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1207  
1208  			{ "vinserti64x2", _YMM | YMM_KZ, _YMM, _XMM | _MEM },
1209  			{ "vinserti64x2", _ZMM | ZMM_KZ, _ZMM, _XMM | _MEM },
1210  
1211  			{ "vinserti32x8", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1212  			{ "vinserti64x4", _ZMM | ZMM_KZ, _ZMM, _YMM | _MEM },
1213  #endif
1214  			{ "vpalignr", ZMM_KZ, _ZMM, _ZMM | _MEM },
1215  		};
1216  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1217  			const Tbl& p = tbl[i];
1218  			put(p.name, p.x1, p.x2, p.xm, IMM8);
1219  		}
1220  #ifdef XBYAK64
1221  		put("vpextrb", _REG64 | _MEM, XMM, IMM8);
1222  		put("vpextrw", _REG64 | _MEM, XMM, IMM8);
1223  		put("vpextrd", _REG32 | _MEM, XMM, IMM8);
1224  		put("vpextrq", _REG64 | _MEM, XMM, IMM8);
1225  		put("vpinsrb", XMM, _XMM3, _REG32 | _MEM, IMM8);
1226  		put("vpinsrw", XMM, _XMM3, _REG32 | _MEM, IMM8);
1227  		put("vpinsrd", XMM, _XMM3, _REG32 | _MEM, IMM8);
1228  		put("vpinsrq", XMM, _XMM3, _REG64 | _MEM, IMM8);
1229  #endif
1230  	}
1231  	void put512_FMA()
1232  	{
1233  		const struct Tbl {
1234  			const char *name;
1235  			bool supportYMM;
1236  		} tbl[] = {
1237  			{ "vfmadd", true },
1238  			{ "vfmadd", false },
1239  			{ "vfmaddsub", true },
1240  			{ "vfmsubadd", true },
1241  			{ "vfmsub", true },
1242  			{ "vfmsub", false },
1243  			{ "vfnmadd", true },
1244  			{ "vfnmadd", false },
1245  			{ "vfnmsub", true },
1246  			{ "vfnmsub", false },
1247  		};
1248  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1249  			const Tbl& p = tbl[i];
1250  			const struct Ord {
1251  				const char *name;
1252  			} ord[] = {
1253  				{ "132" },
1254  				{ "213" },
1255  				{ "231" },
1256  			};
1257  			for (size_t j = 0; j < NUM_OF_ARRAY(ord); j++) {
1258  				const char sufTbl[][2][8] = {
1259  					{ "pd", "ps" },
1260  					{ "sd", "ss" },
1261  				};
1262  				for (size_t k = 0; k < 2; k++) {
1263  					const std::string suf = sufTbl[p.supportYMM ? 0 : 1][k];
1264  					uint64_t mem = 0;
1265  					if (suf == "pd") {
1266  						mem = M_1to2;
1267  					} else if (suf == "ps") {
1268  						mem = M_1to4;
1269  					} else {
1270  						mem = XMM_ER;
1271  					}
1272  					std::string name = std::string(p.name) + ord[j].name + suf;
1273  					const char *q = name.c_str();
1274  					put(q, XMM_KZ, _XMM, mem | _MEM);
1275  					if (!p.supportYMM) continue;
1276  					if (suf == "pd") {
1277  						mem = M_1to8;
1278  					} else if (suf == "ps") {
1279  						mem = M_1to16;
1280  					} else {
1281  						mem = XMM_ER;
1282  					}
1283  					put(q, _ZMM, _ZMM, mem | _MEM);
1284  				}
1285  			}
1286  		}
1287  	}
1288  	void put512_Y_XM()
1289  	{
1290  		const struct Tbl {
1291  			const char *name;
1292  			bool all_xmm; // 2nd param
1293  		} tbl[] = {
1294  			{ "vpmovsxbw", false },
1295  			{ "vpmovsxbd", true },
1296  			{ "vpmovsxbq", true },
1297  			{ "vpmovsxwd", false },
1298  			{ "vpmovsxwq", true },
1299  			{ "vpmovsxdq", false },
1300  
1301  			{ "vpmovzxbw", false },
1302  			{ "vpmovzxbd", true },
1303  			{ "vpmovzxbq", true },
1304  			{ "vpmovzxwd", false },
1305  			{ "vpmovzxwq", true },
1306  			{ "vpmovzxdq", false },
1307  		};
1308  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1309  			const Tbl& p = tbl[i];
1310  			const char *name = p.name;
1311  			put(name, XMM_KZ | YMM, _XMM | _MEM);
1312  			if (p.all_xmm) {
1313  				put(name, ZMM, _XMM | _MEM);
1314  			} else {
1315  				put(name, ZMM, YMM | _MEM);
1316  			}
1317  		}
1318  	}
1319  	void put512_AVX1()
1320  	{
1321  #ifdef XBYAK64
1322  		const struct Tbl {
1323  			std::string name;
1324  			bool only_pd_ps;
1325  		} tbl[] = {
1326  			{ "vadd", false },
1327  			{ "vsub", false },
1328  			{ "vmul", false },
1329  			{ "vdiv", false },
1330  			{ "vmax", false },
1331  			{ "vmin", false },
1332  			{ "vand", true },
1333  			{ "vandn", true },
1334  			{ "vor", true },
1335  			{ "vxor", true },
1336  		};
1337  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1338  			const struct Suf {
1339  				const char *suf;
1340  				bool supportYMM;
1341  			} sufTbl[] = {
1342  				{ "pd", true },
1343  				{ "ps", true },
1344  				{ "sd", false },
1345  				{ "ss", false },
1346  			};
1347  			for (size_t j = 0; j < NUM_OF_ARRAY(sufTbl); j++) {
1348  				if (tbl[i].only_pd_ps && j == 2) break;
1349  				std::string suf = sufTbl[j].suf;
1350  				std::string name = tbl[i].name + suf;
1351  				const char *p = name.c_str();
1352  				uint64_t mem = 0;
1353  				if (suf == "pd") {
1354  					mem = M_1to2;
1355  				} else if (suf == "ps") {
1356  					mem = M_1to4;
1357  				}
1358  				put(p, XMM | XMM_KZ, _XMM, mem | _MEM);
1359  				if (!sufTbl[j].supportYMM) continue;
1360  				mem = 0;
1361  				if (suf == "pd") {
1362  					mem = M_1to8;
1363  				} else if (suf == "ps") {
1364  					mem = M_1to16;
1365  				}
1366  				put(p, _ZMM, _ZMM, mem | _MEM);
1367  			}
1368  		}
1369  		put("vaddss", XMM, _XMM, XMM_ER);
1370  		put("vaddsd", XMM, _XMM, XMM_ER);
1371  #endif
1372  	}
1373  	void putAVX1()
1374  	{
1375  		const struct Tbl {
1376  			const char *name;
1377  			bool only_pd_ps;
1378  		} tbl[] = {
1379  			{ "add", false },
1380  			{ "sub", false },
1381  			{ "mul", false },
1382  			{ "div", false },
1383  			{ "max", false },
1384  			{ "min", false },
1385  			{ "and", true },
1386  			{ "andn", true },
1387  			{ "or", true },
1388  			{ "xor", true },
1389  		};
1390  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1391  			const struct Suf {
1392  				const char *suf;
1393  				bool supportYMM;
1394  			} suf[] = {
1395  				{ "pd", true },
1396  				{ "ps", true },
1397  				{ "sd", false },
1398  				{ "ss", false },
1399  			};
1400  			for (size_t j = 0; j < NUM_OF_ARRAY(suf); j++) {
1401  				if (tbl[i].only_pd_ps && j == 2) break;
1402  				std::string name = std::string("v") + tbl[i].name + suf[j].suf;
1403  				const char *p = name.c_str();
1404  				put(p, XMM, XMM | MEM);
1405  				put(p, XMM, XMM, XMM | MEM);
1406  				if (!suf[j].supportYMM) continue;
1407  				put(p, YMM, YMM | MEM);
1408  				put(p, YMM, YMM, YMM | MEM);
1409  				put(p, ZMM, ZMM, ZMM | MEM);
1410  			}
1411  		}
1412  	}
1413  	void put512_cvt()
1414  	{
1415  #ifdef XBYAK64
1416  		put("vcvtdq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1417  		put("vcvtdq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1418  		put("vcvtdq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);
1419  
1420  		put("vcvtdq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
1421  		put("vcvtdq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
1422  		put("vcvtdq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1423  
1424  		put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
1425  		put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
1426  		put("vcvtpd2dq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1427  
1428  		put("vcvtpd2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1429  		put("vcvtpd2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1430  		put("vcvtpd2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1431  
1432  		put("vcvtpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1433  		put("vcvtpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
1434  		put("vcvtpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1435  
1436  		put("vcvtpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
1437  		put("vcvtpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
1438  		put("vcvtpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1439  
1440  		put("vcvtpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1441  		put("vcvtpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
1442  		put("vcvtpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1443  
1444  		put("vcvtph2ps", XMM_KZ, _XMM | _MEM);
1445  		put("vcvtph2ps", YMM_KZ, _XMM | _MEM);
1446  		put("vcvtph2ps", ZMM_KZ, _YMM | _MEM | YMM_SAE);
1447  
1448  		put("vcvtps2ph", XMM_KZ | _MEM, _XMM, IMM8);
1449  		put("vcvtps2ph", XMM_KZ | _MEM, _YMM, IMM8);
1450  		put("vcvtps2ph", YMM_KZ | _MEM, _ZMM, IMM8);
1451  		put("vcvtps2ph", YMM_KZ, ZMM_SAE, IMM8);
1452  
1453  		put("vcvtps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
1454  		put("vcvtps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
1455  		put("vcvtps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1456  
1457  		put("vcvtps2udq", XMM_KZ, _XMM | M_1to4);
1458  		put("vcvtps2udq", YMM_KZ, _YMM | M_1to8);
1459  		put("vcvtps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1460  
1461  		put("vcvtps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1462  		put("vcvtps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
1463  		put("vcvtps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);
1464  
1465  		put("vcvtps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1466  		put("vcvtps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
1467  		put("vcvtps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_ER);
1468  
1469  		put("vcvtps2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1470  		put("vcvtps2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1471  		put("vcvtps2pd", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1472  
1473  		put("vcvtqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1474  		put("vcvtqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
1475  		put("vcvtqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1476  
1477  		put("vcvtqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1478  		put("vcvtqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1479  		put("vcvtqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1480  
1481  		put("vcvtsd2si", REG32 | REG64, XMM | _MEM | XMM_ER);
1482  
1483  		put("vcvtsd2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
1484  
1485  		put("vcvtsd2ss", XMM_KZ, XMM, _XMM3 | _MEM | XMM_ER);
1486  
1487  		put("vcvtsi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1488  		put("vcvtsi2sd", XMM, XMM_ER, REG64);
1489  
1490  		put("vcvtsi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1491  		put("vcvtsi2ss", XMM, XMM_ER, REG32 | REG64);
1492  
1493  		put("vcvtss2sd", XMM_KZ, XMM, _XMM3 | _MEM | XMM_SAE);
1494  
1495  		put("vcvtss2si", REG32 | REG64, XMM | _MEM | XMM_ER);
1496  
1497  		put("vcvtss2usi", REG32 | REG64, XMM | _MEM | XMM_ER);
1498  
1499  		put("vcvtpd2dq", XMM_KZ, _XMM | M_xword | M_1to2);
1500  		put("vcvtpd2dq", XMM_KZ, _YMM | M_yword | MY_1to4);
1501  		put("vcvtpd2dq", YMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1502  
1503  		put("vcvttpd2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1504  		put("vcvttpd2qq", YMM_KZ, _YMM | _MEM | M_1to4);
1505  		put("vcvttpd2qq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1506  
1507  		put("vcvttpd2udq", XMM_KZ, _XMM | M_xword | M_1to2);
1508  		put("vcvttpd2udq", XMM_KZ, _YMM | M_yword | MY_1to4);
1509  		put("vcvttpd2udq", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_SAE);
1510  
1511  		put("vcvttpd2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1512  		put("vcvttpd2uqq", YMM_KZ, _YMM | _MEM | M_1to4);
1513  		put("vcvttpd2uqq", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
1514  
1515  		put("vcvttps2dq", XMM_KZ, _XMM | _MEM | M_1to4);
1516  		put("vcvttps2dq", YMM_KZ, _YMM | _MEM | M_1to8);
1517  		put("vcvttps2dq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1518  
1519  		put("vcvttps2udq", XMM_KZ, _XMM | M_1to4);
1520  		put("vcvttps2udq", YMM_KZ, _YMM | M_1to8);
1521  		put("vcvttps2udq", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
1522  
1523  		put("vcvttps2qq", XMM_KZ, _XMM | _MEM | M_1to2);
1524  		put("vcvttps2qq", YMM_KZ, _XMM | _MEM | M_1to4);
1525  		put("vcvttps2qq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1526  
1527  		put("vcvttps2uqq", XMM_KZ, _XMM | _MEM | M_1to2);
1528  		put("vcvttps2uqq", YMM_KZ, _XMM | _MEM | M_1to4);
1529  		put("vcvttps2uqq", ZMM_KZ, _YMM | _MEM | M_1to8 | YMM_SAE);
1530  
1531  		put("vcvttsd2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
1532  
1533  		put("vcvttsd2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
1534  
1535  		put("vcvttss2si", REG32 | REG64, XMM | _MEM | XMM_SAE);
1536  
1537  		put("vcvttss2usi", REG32 | REG64, XMM | _MEM | XMM_SAE);
1538  
1539  		put("vcvtudq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1540  		put("vcvtudq2pd", YMM_KZ, _XMM | _MEM | M_1to4);
1541  		put("vcvtudq2pd", ZMM_KZ, _YMM | _MEM | M_1to8);
1542  
1543  		put("vcvtudq2ps", XMM_KZ, _XMM | _MEM | M_1to4);
1544  		put("vcvtudq2ps", YMM_KZ, _YMM | _MEM | M_1to8);
1545  		put("vcvtudq2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_ER);
1546  
1547  		put("vcvtuqq2pd", XMM_KZ, _XMM | _MEM | M_1to2);
1548  		put("vcvtuqq2pd", YMM_KZ, _YMM | _MEM | M_1to4);
1549  		put("vcvtuqq2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_ER);
1550  
1551  		put("vcvtuqq2ps", XMM_KZ, _XMM | M_xword | M_1to2);
1552  		put("vcvtuqq2ps", XMM_KZ, _YMM | M_yword | MY_1to4);
1553  		put("vcvtuqq2ps", YMM_KZ, ZMM | _MEM | M_1to8 | ZMM_ER);
1554  
1555  		put("vcvtusi2sd", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1556  		put("vcvtusi2sd", XMM, XMM_ER, REG64);
1557  
1558  		put("vcvtusi2ss", XMM, _XMM3, REG32 | REG64 | MEM32 | MEM64);
1559  		put("vcvtusi2ss", XMM, XMM_ER, REG32 | REG64);
1560  #endif
1561  	}
1562  	enum {
1563  		xx_yy_zz,
1564  		xx_yx_zy,
1565  		xx_xy_yz
1566  	};
1567  	void putGather()
1568  	{
1569  #ifdef XBYAK64
1570  		const struct Tbl {
1571  			const char *name;
1572  			int mode;
1573  		} tbl[] = {
1574  			{ "vpgatherdd", xx_yy_zz },
1575  			{ "vpgatherdq", xx_yx_zy },
1576  			{ "vpgatherqd", xx_xy_yz },
1577  			{ "vpgatherqq", xx_yy_zz },
1578  			{ "vgatherdps", xx_yy_zz },
1579  			{ "vgatherdpd", xx_yx_zy },
1580  			{ "vgatherqps", xx_xy_yz },
1581  			{ "vgatherqpd", xx_yy_zz },
1582  		};
1583  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1584  			const Tbl& p = tbl[i];
1585  			switch (p.mode) {
1586  			case xx_yy_zz:
1587  				put(p.name, XMM_K, VM32X);
1588  				put(p.name, YMM_K, VM32Y);
1589  				put(p.name, ZMM_K, VM32Z);
1590  				break;
1591  			case xx_yx_zy:
1592  				put(p.name, XMM_K, VM32X);
1593  				put(p.name, YMM_K, VM32X);
1594  				put(p.name, ZMM_K, VM32Y);
1595  				break;
1596  			case xx_xy_yz:
1597  				put(p.name, XMM_K, VM32X);
1598  				put(p.name, XMM_K, VM32Y);
1599  				put(p.name, YMM_K, VM32Z);
1600  				break;
1601  			}
1602  		}
1603  #endif
1604  	}
1605  	void putScatter()
1606  	{
1607  #ifdef XBYAK64
1608  		const struct Tbl {
1609  			const char *name;
1610  			int mode;
1611  		} tbl[] = {
1612  			{ "vpscatterdd", xx_yy_zz },
1613  			{ "vpscatterdq", xx_xy_yz },
1614  			{ "vpscatterqd", xx_yx_zy },
1615  			{ "vpscatterqq", xx_yy_zz },
1616  
1617  			{ "vscatterdps", xx_yy_zz },
1618  			{ "vscatterdpd", xx_xy_yz },
1619  			{ "vscatterqps", xx_yx_zy },
1620  			{ "vscatterqpd", xx_yy_zz },
1621  		};
1622  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1623  			const Tbl& p = tbl[i];
1624  			switch (p.mode) {
1625  			case xx_yy_zz:
1626  				put(p.name, VM32X_K, _XMM);
1627  				put(p.name, VM32Y_K, _YMM);
1628  				put(p.name, VM32Z_K, _ZMM);
1629  				break;
1630  			case xx_yx_zy:
1631  				put(p.name, VM32X_K, _XMM);
1632  				put(p.name, VM32Y_K, _XMM);
1633  				put(p.name, VM32Z_K, _YMM);
1634  				break;
1635  			case xx_xy_yz:
1636  				put(p.name, VM32X_K, _XMM);
1637  				put(p.name, VM32X_K, _YMM);
1638  				put(p.name, VM32Y_K, _ZMM);
1639  				break;
1640  			}
1641  		}
1642  #endif
1643  	}
1644  	void putBlend()
1645  	{
1646  		put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1647  		put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1648  		put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1649  
1650  		put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1651  		put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1652  		put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1653  
1654  		put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
1655  		put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
1656  		put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
1657  
1658  		put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM);
1659  		put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM);
1660  		put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM);
1661  
1662  		put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1663  		put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1664  		put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1665  
1666  		put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1667  		put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1668  		put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1669  	}
1670  	void putVpcmp()
1671  	{
1672  		const uint64_t b0Tbl[] = { 0, 0, 0 };
1673  		const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1674  		const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1675  		const struct Tbl {
1676  			const char *name;
1677  			uint64_t b;
1678  		} tbl[] = {
1679  			{ "vpcmpb", 0 },
1680  			{ "vpcmpub", 0 },
1681  			{ "vpcmpw", 0 },
1682  			{ "vpcmpuw", 0 },
1683  			{ "vpcmpd", M_1to4 },
1684  			{ "vpcmpud", M_1to4 },
1685  			{ "vpcmpq", M_1to2 },
1686  			{ "vpcmpuq", M_1to2 },
1687  		};
1688  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1689  			const Tbl& p = tbl[i];
1690  			const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1691  			put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8);
1692  			put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8);
1693  			put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8);
1694  		}
1695  	}
1696  	void putVtest()
1697  	{
1698  		const uint64_t b0Tbl[] = { 0, 0, 0 };
1699  		const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1700  		const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1701  		const struct Tbl {
1702  			const char *name;
1703  			uint64_t b;
1704  		} tbl[] = {
1705  			{ "vptestmb", 0 },
1706  			{ "vptestmw", 0 },
1707  			{ "vptestmd", M_1to4 },
1708  			{ "vptestmq", M_1to2 },
1709  
1710  			{ "vptestnmb", 0 },
1711  			{ "vptestnmw", 0 },
1712  			{ "vptestnmd", M_1to4 },
1713  			{ "vptestnmq", M_1to2 },
1714  		};
1715  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1716  			const Tbl& p = tbl[i];
1717  			const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1718  			put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0]);
1719  			put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1]);
1720  			put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2]);
1721  		}
1722  	}
1723  	void putCompExp()
1724  	{
1725  		{
1726  			const char *tbl[] = {
1727  				"vcompresspd",
1728  				"vcompressps",
1729  				"vpcompressd",
1730  				"vpcompressq",
1731  			};
1732  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1733  				const char *name = tbl[i];
1734  				put(name, XMM_KZ | _MEM, _XMM);
1735  				put(name, YMM_KZ | _MEM, _YMM);
1736  				put(name, ZMM_KZ | _MEM, _ZMM);
1737  			}
1738  		}
1739  		{
1740  			const char *tbl[] = {
1741  				"vexpandpd",
1742  				"vexpandps",
1743  				"vpexpandd",
1744  				"vpexpandq",
1745  			};
1746  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1747  				const char *name = tbl[i];
1748  				put(name, XMM_KZ, _XMM | _MEM);
1749  				put(name, YMM_KZ, _YMM | _MEM);
1750  				put(name, ZMM_KZ, _ZMM | _MEM);
1751  			}
1752  		}
1753  	}
1754  	void putPerm()
1755  	{
1756  		const uint64_t b0Tbl[] = { 0, 0, 0 };
1757  		const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 };
1758  		const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 };
1759  		const struct Tbl {
1760  			const char *name;
1761  			uint64_t b;
1762  		} tbl[] = {
1763  			{ "vpermt2b", 0 },
1764  			{ "vpermt2w", 0 },
1765  			{ "vpermt2d", M_1to4 },
1766  			{ "vpermt2q", M_1to2 },
1767  			{ "vpermt2ps", M_1to4 },
1768  			{ "vpermt2pd", M_1to2 },
1769  
1770  			{ "vpermi2b", 0 },
1771  			{ "vpermi2w", 0 },
1772  			{ "vpermi2d", M_1to4 },
1773  			{ "vpermi2q", M_1to2 },
1774  			{ "vpermi2ps", M_1to4 },
1775  		};
1776  		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
1777  			const Tbl& p = tbl[i];
1778  			const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl;
1779  			put(p.name, XMM_KZ, _XMM, _XMM | _MEM | bTbl[0]);
1780  			put(p.name, YMM_KZ, _YMM, _YMM | _MEM | bTbl[1]);
1781  			put(p.name, ZMM_KZ, _ZMM, _ZMM | _MEM | bTbl[2]);
1782  		}
1783  	}
1784  	void putShuff()
1785  	{
1786  		put("vshuff32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1787  		put("vshuff32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1788  
1789  		put("vshuff64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1790  		put("vshuff64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1791  
1792  		put("vshufi32x4", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1793  		put("vshufi32x4", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1794  
1795  		put("vshufi64x2", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1796  		put("vshufi64x2", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1797  	}
1798  	void putMov()
1799  	{
1800  		put("vpmovm2b", _XMM | _YMM | _ZMM, K);
1801  		put("vpmovm2w", _XMM | _YMM | _ZMM, K);
1802  		put("vpmovm2d", _XMM | _YMM | _ZMM, K);
1803  		put("vpmovm2q", _XMM | _YMM | _ZMM, K);
1804  
1805  		put("vpmovb2m", K, _XMM | _YMM | _ZMM);
1806  		put("vpmovw2m", K, _XMM | _YMM | _ZMM);
1807  		put("vpmovd2m", K, _XMM | _YMM | _ZMM);
1808  		put("vpmovq2m", K, _XMM | _YMM | _ZMM);
1809  
1810  		put("vpmovqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1811  		put("vpmovsqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1812  		put("vpmovusqb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1813  
1814  		put("vpmovqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1815  		put("vpmovsqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1816  		put("vpmovusqw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1817  
1818  		put("vpmovqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1819  		put("vpmovqd", YMM_KZ | _MEM | MEM_K, _ZMM);
1820  
1821  		put("vpmovsqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1822  		put("vpmovsqd", YMM_KZ | _MEM | MEM_K, _ZMM);
1823  
1824  		put("vpmovusqd", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1825  		put("vpmovusqd", YMM_KZ | _MEM | MEM_K, _ZMM);
1826  
1827  		put("vpmovdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1828  		put("vpmovsdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1829  		put("vpmovusdb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM | _ZMM);
1830  
1831  		put("vpmovdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1832  		put("vpmovdw", YMM_KZ | _MEM | MEM_K, _ZMM);
1833  
1834  		put("vpmovsdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1835  		put("vpmovsdw", YMM_KZ | _MEM | MEM_K, _ZMM);
1836  
1837  		put("vpmovusdw", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1838  		put("vpmovusdw", YMM_KZ | _MEM | MEM_K, _ZMM);
1839  
1840  		put("vpmovwb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1841  		put("vpmovwb", YMM_KZ | _MEM | MEM_K, _ZMM);
1842  
1843  		put("vpmovswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1844  		put("vpmovswb", YMM_KZ | _MEM | MEM_K, _ZMM);
1845  
1846  		put("vpmovuswb", XMM_KZ | _MEM | MEM_K, _XMM | _YMM);
1847  		put("vpmovuswb", YMM_KZ | _MEM | MEM_K, _ZMM);
1848  	}
1849  	void putRot()
1850  	{
1851  		put("vprolvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1852  		put("vprolvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1853  		put("vprolvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1854  
1855  		put("vprolvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1856  		put("vprolvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1857  		put("vprolvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1858  
1859  		put("vprorvd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1860  		put("vprorvd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1861  		put("vprorvd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16);
1862  
1863  		put("vprorvq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1864  		put("vprorvq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1865  		put("vprorvq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1866  
1867  		put("vprold", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1868  		put("vprold", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1869  		put("vprold", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1870  
1871  		put("vprolq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1872  		put("vprolq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1873  		put("vprolq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1874  
1875  		put("vprord", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1876  		put("vprord", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1877  		put("vprord", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1878  
1879  		put("vprorq", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1880  		put("vprorq", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1881  		put("vprorq", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1882  	}
1883  	void putMisc2()
1884  	{
1885  #ifdef XBYAK64
1886  		put("vpternlogd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
1887  		put("vpternlogd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1888  		put("vpternlogd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1889  
1890  		put("vpternlogq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
1891  		put("vpternlogq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1892  		put("vpternlogq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1893  
1894  		put("vgetexppd", XMM_KZ, _XMM | MEM | M_1to2);
1895  		put("vgetexppd", YMM_KZ, _YMM | MEM | M_1to4);
1896  		put("vgetexppd", ZMM_KZ, _ZMM | MEM | M_1to8 | ZMM_SAE);
1897  
1898  		put("vgetexpps", XMM_KZ, _XMM | MEM | M_1to4);
1899  		put("vgetexpps", YMM_KZ, _YMM | MEM | M_1to8);
1900  		put("vgetexpps", ZMM_KZ, _ZMM | MEM | M_1to16 | ZMM_SAE);
1901  
1902  		put("vgetexpsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1903  		put("vgetexpss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
1904  
1905  		put("vgetmantpd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1906  		put("vgetmantpd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1907  		put("vgetmantpd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1908  
1909  		put("vgetmantps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1910  		put("vgetmantps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1911  		put("vgetmantps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1912  
1913  		put("vgetmantsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1914  		put("vgetmantss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1915  
1916  		put("vfixupimmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
1917  		put("vfixupimmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
1918  		put("vfixupimmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8, IMM8);
1919  
1920  		put("vfixupimmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
1921  		put("vfixupimmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
1922  		put("vfixupimmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16, IMM8);
1923  
1924  		put("vfixupimmsd", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1925  		put("vfixupimmss", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1926  
1927  		put("vrcp14pd", XMM_KZ, _XMM | _MEM | M_1to2);
1928  		put("vrcp14pd", YMM_KZ, _YMM | _MEM | M_1to4);
1929  		put("vrcp14pd", ZMM_KZ, _ZMM | _MEM | M_1to8);
1930  
1931  		put("vrcp14ps", XMM_KZ, _XMM | _MEM | M_1to4);
1932  		put("vrcp14ps", YMM_KZ, _YMM | _MEM | M_1to8);
1933  		put("vrcp14ps", ZMM_KZ, _ZMM | _MEM | M_1to16);
1934  
1935  		put("vrcp14sd", XMM_KZ, _XMM, _XMM | _MEM);
1936  
1937  		put("vrcp14ss", XMM_KZ, _XMM, _XMM | _MEM);
1938  
1939  		put("vrsqrt14pd", XMM_KZ, _XMM | _MEM | M_1to2);
1940  		put("vrsqrt14pd", YMM_KZ, _YMM | _MEM | M_1to4);
1941  		put("vrsqrt14pd", ZMM_KZ, _ZMM | _MEM | M_1to8);
1942  
1943  		put("vrsqrt14ps", XMM_KZ, _XMM | _MEM | M_1to4);
1944  		put("vrsqrt14ps", YMM_KZ, _YMM | _MEM | M_1to8);
1945  		put("vrsqrt14ps", ZMM_KZ, _ZMM | _MEM | M_1to16);
1946  
1947  		put("vrsqrt14sd", XMM_KZ, _XMM, _XMM | _MEM);
1948  
1949  		put("vrsqrt14ss", XMM_KZ, _XMM, _XMM | _MEM);
1950  
1951  		put("vrndscalepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
1952  		put("vrndscalepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
1953  		put("vrndscalepd", ZMM_KZ, _ZMM | _MEM | M_1to8, IMM8);
1954  		put("vrndscalepd", ZMM_KZ, _ZMM | ZMM_SAE, IMM8);
1955  
1956  		put("vrndscaleps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
1957  		put("vrndscaleps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
1958  		put("vrndscaleps", ZMM_KZ, _ZMM | _MEM | M_1to16, IMM8);
1959  		put("vrndscaleps", ZMM_KZ, _ZMM | ZMM_SAE, IMM8);
1960  
1961  		put("vrndscalesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1962  
1963  		put("vrndscaless", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
1964  
1965  		put("vscalefpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1966  		put("vscalefpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1967  		put("vscalefpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_ER);
1968  
1969  		put("vscalefps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4);
1970  		put("vscalefps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8);
1971  		put("vscalefps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_ER);
1972  
1973  		put("vscalefsd", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);
1974  		put("vscalefss", XMM_KZ, _XMM, _XMM | _MEM | XMM_ER);
1975  
1976  		put("vdbpsadbw", XMM_KZ, _XMM, _XMM | _MEM, IMM8);
1977  		put("vdbpsadbw", YMM_KZ, _YMM, _YMM | _MEM, IMM8);
1978  		put("vdbpsadbw", ZMM_KZ, _ZMM, _ZMM | _MEM, IMM8);
1979  
1980  		put("vpmultishiftqb", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
1981  		put("vpmultishiftqb", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
1982  		put("vpmultishiftqb", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
1983  
1984  		put("vpconflictd", XMM_KZ, _XMM | _MEM | M_1to4);
1985  		put("vpconflictd", YMM_KZ, _YMM | _MEM | M_1to8);
1986  		put("vpconflictd", ZMM_KZ, _ZMM | _MEM | M_1to16);
1987  
1988  		put("vpconflictq", XMM_KZ, _XMM | _MEM | M_1to2);
1989  		put("vpconflictq", YMM_KZ, _YMM | _MEM | M_1to4);
1990  		put("vpconflictq", ZMM_KZ, _ZMM | _MEM | M_1to8);
1991  
1992  		put("vplzcntd", XMM_KZ, _XMM | _MEM | M_1to4);
1993  		put("vplzcntd", YMM_KZ, _YMM | _MEM | M_1to8);
1994  		put("vplzcntd", ZMM_KZ, _ZMM | _MEM | M_1to16);
1995  
1996  		put("vplzcntq", XMM_KZ, _XMM | _MEM | M_1to2);
1997  		put("vplzcntq", YMM_KZ, _YMM | _MEM | M_1to4);
1998  		put("vplzcntq", ZMM_KZ, _ZMM | _MEM | M_1to8);
1999  
2000  		put("vpbroadcastmb2q", _XMM | _YMM | _ZMM, K);
2001  		put("vpbroadcastmw2d", _XMM | _YMM | _ZMM, K);
2002  
2003  		put("vexp2pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
2004  		put("vexp2ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
2005  
2006  		put("vrcp28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
2007  		put("vrcp28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
2008  		put("vrcp28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2009  		put("vrcp28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2010  
2011  		put("vrsqrt28pd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE);
2012  		put("vrsqrt28ps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE);
2013  		put("vrsqrt28sd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2014  		put("vrsqrt28ss", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE);
2015  
2016  		put("vgatherpf0dps", VM32Z_K);
2017  		put("vgatherpf0qps", VM32Z_K);
2018  		put("vgatherpf0dpd", VM32Y_K);
2019  		put("vgatherpf0qpd", VM32Z_K);
2020  
2021  		put("vgatherpf1dps", VM32Z_K);
2022  		put("vgatherpf1qps", VM32Z_K);
2023  		put("vgatherpf1dpd", VM32Y_K);
2024  		put("vgatherpf1qpd", VM32Z_K);
2025  
2026  		put("vscatterpf0dps", VM32Z_K);
2027  		put("vscatterpf0qps", VM32Z_K);
2028  		put("vscatterpf0dpd", VM32Y_K);
2029  		put("vscatterpf0qpd", VM32Z_K);
2030  
2031  		put("vscatterpf1dps", VM32Z_K);
2032  		put("vscatterpf1qps", VM32Z_K);
2033  		put("vscatterpf1dpd", VM32Y_K);
2034  		put("vscatterpf1qpd", VM32Z_K);
2035  
2036  		put("vrangepd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2, IMM8);
2037  		put("vrangepd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4, IMM8);
2038  		put("vrangepd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8);
2039  
2040  		put("vrangeps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4, IMM8);
2041  		put("vrangeps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8, IMM8);
2042  		put("vrangeps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8);
2043  
2044  		put("vrangesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2045  		put("vrangess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2046  
2047  		put("vreducepd", XMM_KZ, _XMM | _MEM | M_1to2, IMM8);
2048  		put("vreducepd", YMM_KZ, _YMM | _MEM | M_1to4, IMM8);
2049  		put("vreducepd", ZMM_KZ, _ZMM | _MEM | M_1to8 | ZMM_SAE, IMM8);
2050  
2051  		put("vreduceps", XMM_KZ, _XMM | _MEM | M_1to4, IMM8);
2052  		put("vreduceps", YMM_KZ, _YMM | _MEM | M_1to8, IMM8);
2053  		put("vreduceps", ZMM_KZ, _ZMM | _MEM | M_1to16 | ZMM_SAE, IMM8);
2054  
2055  		put("vreducesd", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2056  		put("vreducess", XMM_KZ, _XMM, _XMM | _MEM | XMM_SAE, IMM8);
2057  
2058  		put("vpmadd52luq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
2059  		put("vpmadd52luq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
2060  		put("vpmadd52luq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
2061  
2062  		put("vpmadd52huq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2);
2063  		put("vpmadd52huq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4);
2064  		put("vpmadd52huq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8);
2065  #endif
2066  	}
2067  	void classSubMem(const char *nm, char x, bool broadcast, int size)
2068  	{
2069  		printf("%s ", nm);
2070  		if (isXbyak_) {
2071  			printf("(k5|k3, %cword%s [rax+64], 5);dump();\n", x, broadcast ? "_b" : "");
2072  		} else {
2073  			if (broadcast) {
2074  				int d = x == 'x' ? 128 / size : x == 'y' ? 256 / size : 512 / size;
2075  				printf("k5{k3}, [rax+64]{1to%d}, 5\n", d);
2076  			} else {
2077  				if (x == 'x') x = 'o'; // nasm
2078  				printf("k5{k3}, %cword [rax+64], 5\n", x);
2079  			}
2080  		}
2081  	}
2082  	void putClassSub(const char *name, int size)
2083  	{
2084  		put(name, K_K, _XMM | _YMM | _ZMM, IMM8);
2085  		for (int i = 0; i < 2; i++) {
2086  			classSubMem(name, 'x', i == 0, size);
2087  			classSubMem(name, 'y', i == 0, size);
2088  			classSubMem(name, 'z', i == 0, size);
2089  		}
2090  	}
2091  	void putClass()
2092  	{
2093  #ifdef XBYAK64
2094  		putClassSub("vfpclasspd", 64);
2095  		putClassSub("vfpclassps", 32);
2096  		put("vfpclasssd", K_K, _XMM | _MEM, IMM8);
2097  		put("vfpclassss", K_K, _XMM | _MEM, IMM8);
2098  #endif
2099  	}
2100  	void putMin()
2101  	{
2102  #ifdef XBYAK64
2103  		put("vextractf32x4", XMM_KZ, _YMM, IMM8);
2104  #endif
2105  	}
2106  	void putDisp8N()
2107  	{
2108  		{
2109  			const int tbl[] = {
2110  				-129, -128, -127, 0, 1, 64, 65, 127, 128
2111  			};
2112  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
2113  				char xs[128], ns[128];
2114  				int v = tbl[i];
2115  				CYBOZU_SNPRINTF(xs, sizeof(xs), "xmm0, ptr[eax%+d]", v);
2116  				CYBOZU_SNPRINTF(ns, sizeof(ns), "xmm0,    [eax%+d]", v);
2117  				put("vpbroadcastb", xs, ns);
2118  			}
2119  		}
2120  		{
2121  			const int tbl[] = {
2122  				-1024, -512 -256, -128, -64, -32, -16, -8, -4, -2, -1,
2123  				0, 1, 2, 4, 8, 16, 32, 64, 128, 256, 512
2124  			};
2125  			for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
2126  				char xs[128], ns[128];
2127  				int v = tbl[i];
2128  				CYBOZU_SNPRINTF(xs, sizeof(xs), "zmm0, zmm1, ptr_b[eax%+d]", v);
2129  				CYBOZU_SNPRINTF(ns, sizeof(ns), "zmm0, zmm1, [eax%+d]{1to16}", v);
2130  				put("vaddps", xs, ns);
2131  			}
2132  		}
2133  #ifdef XBYAK64
2134  		put("vfmadd231ps", "zmm8, zmm31, ptr_b[r14+rbp-0x1e4]", "zmm8, zmm31, [r14+rbp-0x1e4]{1to16}");
2135  #endif
2136  	}
2137  	void putAVX512()
2138  	{
2139  #ifdef MIN_TEST
2140  		putMin();
2141  #else
2142  		putOpmask();
2143  		separateFunc();
2144  		putCombi();
2145  		separateFunc();
2146  		putCmpK();
2147  		separateFunc();
2148  		putBroadcast();
2149  		separateFunc();
2150  		putAVX512_M_X();
2151  		separateFunc();
2152  		put_vmov();
2153  		separateFunc();
2154  		put512_X_XM();
2155  		separateFunc();
2156  		put512_X_X_XM();
2157  		separateFunc();
2158  		put512_X3();
2159  		separateFunc();
2160  		put512_X3_I();
2161  		separateFunc();
2162  		put512_FMA();
2163  		separateFunc();
2164  		put512_Y_XM();
2165  		separateFunc();
2166  		put512_AVX1();
2167  		separateFunc();
2168  		put512_cvt();
2169  		separateFunc();
2170  		putMisc1();
2171  		separateFunc();
2172  		putGather();
2173  		separateFunc();
2174  		putBlend();
2175  		separateFunc();
2176  		putVpcmp();
2177  		separateFunc();
2178  		putVtest();
2179  		separateFunc();
2180  		putCompExp();
2181  		separateFunc();
2182  		putPerm();
2183  		separateFunc();
2184  		putShuff();
2185  		separateFunc();
2186  		putMisc2();
2187  		separateFunc();
2188  		putMov();
2189  		separateFunc();
2190  		putRot();
2191  		separateFunc();
2192  		putScatter();
2193  		separateFunc();
2194  		putClass();
2195  		putDisp8N();
2196  #endif
2197  	}
2198  };
2199  
2200  int main(int argc, char *[])
2201  {
2202  	Test test(argc > 1);
2203  	test.put();
2204  }