AVX-512: Added AVX-512PF instructions
author Jin Kyu Song <jin.kyu.song@intel.com>
Fri, 13 Sep 2013 21:12:57 +0000 (14:12 -0700)
committer Cyrill Gorcunov <gorcunov@gmail.com>
Fri, 13 Sep 2013 21:27:10 +0000 (01:27 +0400)
Added Prefetch (AVX-512PF) instructions.
These instructions are supported
if CPUID.(EAX=07H, ECX=0):EBX.AVX512PF[bit 26] = 1.
CPUID feature flag for PREFETCHWT1 is TBD
but PREFETCHWT1 is included in this commit.

Signed-off-by: Jin Kyu Song <jin.kyu.song@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
insns.dat
insns.h
test/avx512pf.asm [new file with mode: 0644]

index 64f8b68..2439a9d 100644 (file)
--- a/insns.dat
+++ b/insns.dat
@@ -4074,6 +4074,24 @@ VRSQRT28PD   zmmreg|mask|z,zmmrm512|b64|sae    [rm:fv:          evex.512.66.0f38
 VRSQRT28PS   zmmreg|mask|z,zmmrm512|b32|sae    [rm:fv:          evex.512.66.0f38.w0 cc /r ]  AVX512ER,FUTURE
 VRSQRT28SD   xmmreg|mask|z,xmmreg,xmmrm64|sae  [rvm:t1s:    evex.nds.lig.66.0f38.w1 cd /r ]  AVX512ER,FUTURE
 VRSQRT28SS   xmmreg|mask|z,xmmreg,xmmrm32|sae  [rvm:t1s:    evex.nds.lig.66.0f38.w0 cd /r ]  AVX512ER,FUTURE
+; AVX-512PF (Prefetch) instructions
+VGATHERPF0DPD   ymem64|mask  [m:t1s:    vsiby evex.512.66.0f38.w1 c6 /1 ]  AVX512PF,FUTURE
+VGATHERPF0DPS   zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c6 /1 ]  AVX512PF,FUTURE
+VGATHERPF0QPD   zmem64|mask  [m:t1s:    vsibz evex.512.66.0f38.w1 c7 /1 ]  AVX512PF,FUTURE
+VGATHERPF0QPS   zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c7 /1 ]  AVX512PF,FUTURE
+VGATHERPF1DPD   ymem64|mask  [m:t1s:    vsiby evex.512.66.0f38.w1 c6 /2 ]  AVX512PF,FUTURE
+VGATHERPF1DPS   zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c6 /2 ]  AVX512PF,FUTURE
+VGATHERPF1QPD   zmem64|mask  [m:t1s:    vsibz evex.512.66.0f38.w1 c7 /2 ]  AVX512PF,FUTURE
+VGATHERPF1QPS   zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c7 /2 ]  AVX512PF,FUTURE
+VSCATTERPF0DPD  ymem64|mask  [m:t1s:    vsiby evex.512.66.0f38.w1 c6 /5 ]  AVX512PF,FUTURE
+VSCATTERPF0DPS  zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c6 /5 ]  AVX512PF,FUTURE
+VSCATTERPF0QPD  zmem64|mask  [m:t1s:    vsibz evex.512.66.0f38.w1 c7 /5 ]  AVX512PF,FUTURE
+VSCATTERPF0QPS  zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c7 /5 ]  AVX512PF,FUTURE
+VSCATTERPF1DPD  ymem64|mask  [m:t1s:    vsiby evex.512.66.0f38.w1 c6 /6 ]  AVX512PF,FUTURE
+VSCATTERPF1DPS  zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c6 /6 ]  AVX512PF,FUTURE
+VSCATTERPF1QPD  zmem64|mask  [m:t1s:    vsibz evex.512.66.0f38.w1 c7 /6 ]  AVX512PF,FUTURE
+VSCATTERPF1QPS  zmem32|mask  [m:t1s:    vsibz evex.512.66.0f38.w0 c7 /6 ]  AVX512PF,FUTURE
+PREFETCHWT1     mem8         [m:                               0f 0d /2 ]  FUTURE
 
 
 ;# Systematic names for the hinting nop instructions
diff --git a/insns.h b/insns.h
index 3b12ccf..b12d4eb 100644 (file)
--- a/insns.h
+++ b/insns.h
@@ -130,6 +130,7 @@ extern const uint8_t nasm_bytecodes[];
 #define IF_INVPCID      0x1500000000UL    /* HACK NEED TO REORGANIZE THESE BITS */
 #define IF_AVX512CD     (0x1600000000UL|IF_AVX512) /* AVX-512 Conflict Detection insns */
 #define IF_AVX512ER     (0x1700000000UL|IF_AVX512) /* AVX-512 Exponential and Reciprocal */
+#define IF_AVX512PF     (0x1800000000UL|IF_AVX512) /* AVX-512 Prefetch instructions */
 #define IF_INSMASK      0xFF00000000UL    /* the mask for instruction set types */
 #define IF_PMASK        0xFF000000UL    /* the mask for processor types */
 #define IF_PLEVEL       0x0F000000UL    /* the mask for processor instr. level */
diff --git a/test/avx512pf.asm b/test/avx512pf.asm
new file mode 100644 (file)
index 0000000..5227123
--- /dev/null
@@ -0,0 +1,87 @@
+; AVX-512PF testcases from gas
+;------------------------
+;
+; This file is taken from:
+;     https://gnu.googlesource.com/binutils/+/master/gas/testsuite/gas/i386/x86-64-avx512pf-intel.d
+; So the original author is "H.J. Lu" <hongjiu dot lu at intel dot com>
+;
+; Jin Kyu Song converted it for the nasm testing suite using gas2nasm.py
+
+%macro testcase 2      ; %1 = brace-enclosed byte list, %2 = brace-enclosed instruction text
+ %ifdef BIN            ; with BIN defined: emit the bytes of %1 verbatim via db
+  db %1
+ %endif
+ %ifdef SRC            ; with SRC defined: emit %2 so the assembler encodes the instruction
+  %2
+ %endif
+%endmacro
+
+
+bits 64
+
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0dpd  [r14+ymm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0dpd  [r14+ymm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x4c, 0x39, 0x20                         }, { vgatherpf0dpd  [r9+ymm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc6, 0x8c, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf0dpd  [rcx+ymm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0dps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0dps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x4c, 0x39, 0x40                         }, { vgatherpf0dps  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc6, 0x8c, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf0dps  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0qpd  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0qpd  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x4c, 0x39, 0x20                         }, { vgatherpf0qpd  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc7, 0x8c, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf0qpd  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0qps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x8c, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf0qps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x4c, 0x39, 0x40                         }, { vgatherpf0qps  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc7, 0x8c, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf0qps  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1dpd  [r14+ymm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1dpd  [r14+ymm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x54, 0x39, 0x20                         }, { vgatherpf1dpd  [r9+ymm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc6, 0x94, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf1dpd  [rcx+ymm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1dps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1dps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x54, 0x39, 0x40                         }, { vgatherpf1dps  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc6, 0x94, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf1dps  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1qpd  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1qpd  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x54, 0x39, 0x20                         }, { vgatherpf1qpd  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc7, 0x94, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf1qpd  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1qps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x94, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vgatherpf1qps  [r14+zmm31*8+0x7b]\{k1\}                      }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x54, 0x39, 0x40                         }, { vgatherpf1qps  [r9+zmm31*1+0x100]\{k1\}                      }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc7, 0x94, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vgatherpf1qps  [rcx+zmm31*4+0x400]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0dpd  [r14+ymm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0dpd  [r14+ymm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x6c, 0x39, 0x20                         }, { vscatterpf0dpd  [r9+ymm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc6, 0xac, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf0dpd  [rcx+ymm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0dps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0dps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x6c, 0x39, 0x40                         }, { vscatterpf0dps  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc6, 0xac, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf0dps  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0qpd  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0qpd  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x6c, 0x39, 0x20                         }, { vscatterpf0qpd  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc7, 0xac, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf0qpd  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0qps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0xac, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf0qps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x6c, 0x39, 0x40                         }, { vscatterpf0qps  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc7, 0xac, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf0qps  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1dpd  [r14+ymm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1dpd  [r14+ymm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc6, 0x74, 0x39, 0x20                         }, { vscatterpf1dpd  [r9+ymm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc6, 0xb4, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf1dpd  [rcx+ymm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1dps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1dps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc6, 0x74, 0x39, 0x40                         }, { vscatterpf1dps  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc6, 0xb4, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf1dps  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1qpd  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1qpd  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0xfd, 0x41, 0xc7, 0x74, 0x39, 0x20                         }, { vscatterpf1qpd  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0xfd, 0x41, 0xc7, 0xb4, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf1qpd  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1qps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0xb4, 0xfe, 0x7b, 0x00, 0x00, 0x00       }, { vscatterpf1qps  [r14+zmm31*8+0x7b]\{k1\}                     }
+testcase       { 0x62, 0x92, 0x7d, 0x41, 0xc7, 0x74, 0x39, 0x40                         }, { vscatterpf1qps  [r9+zmm31*1+0x100]\{k1\}                     }
+testcase       { 0x62, 0xb2, 0x7d, 0x41, 0xc7, 0xb4, 0xb9, 0x00, 0x04, 0x00, 0x00       }, { vscatterpf1qps  [rcx+zmm31*4+0x400]\{k1\}                    }
+testcase       { 0x0f, 0x0d, 0x11                                                       }, { prefetchwt1 BYTE [rcx]                                       }
+testcase       { 0x42, 0x0f, 0x0d, 0x94, 0xf0, 0x23, 0x01, 0x00, 0x00                   }, { prefetchwt1 BYTE [rax+r14*8+0x123]                           }