src/native_client/src/trusted/validator_ragel/def_format.py

   1 # Copyright (c) 2013 The Native Client Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 ################################################################################
   6 # File format:
   7 #     three columns separated by commas.  Each line describes one instruction.
   8 #     Notation for argument types and sizes and for opcodes is based on
   9 #     AMD64 Architecture Programmer's Manual.
  10 ################################################################################
  11 # First column: instruction description.
  12 #   Includes name of the instruction and arguments.
  13 #
  14 #   Arguments consist of four parts:
  15 #      1. Read/write attribute (optional).
  16 #      2. Argument type.
  17 #      3. Argument size.
  18 #      4. Implicit argument mark (optional).
  19 #
  20 #      Read/write attribute:
  21 #       ': Instruction does not use this argument (lea or nop).
  22 #       =: Instruction reads from this argument.
  23 #       !: Instruction writes to this argument.
  24 #       &: Instruction reads this argument and writes the result to it.
  25 #        By default one- and two-operand instructions are assumed to read all
  26 #        operands and store result to the last one, while instructions with
  27 #        three or more operands are assumed to read all operands except last one
  28 #        which is used to store the result of the execution.
  29 #      Possible argument types:
  30 #       a: Accumulator: %al/%ax/%eax/%rax/%xmm0 (depending on size).
  31 #       c: Counter register: %cl/%cx/%ecx/%rcx (depending on size).
  32 #       d: Data register: %dl/%dx/%edx/%rdx (depending on size).
  33 #       f: x87 register in opcode (3 least significant bits).
  34 #       i: Second immediate value encoded in the instruction.
  35 #       o: I/O port in %dx (used in "in"/"out" instructions).
  36 #       r: Register in opcode (3 least significant bits plus rex.B).
  37 #       t: Top of the x87 stack (%st).
  38 #       x: A memory operand addressed by the %ds:(%[er]bx). See "xlat".
  39 #       B: General purpose register specified by the VEX/XOP.vvvv field.
  40 #       C: Control register specified by the ModRM.reg field.
  41 #       D: Debug register specified by the ModRM.reg field.
  42 #       E: General purpose register or memory operand specified by the r/m
  43 #          field of the ModRM byte.  For memory operands, the ModRM byte may
  44 #          be followed by a SIB byte to specify one of the indexed
  45 #          register-indirect addressing forms.
  46 #       G: General purpose register specified by the reg field of ModRM.
  47 #       H: YMM or XMM register specified by the VEX/XOP.vvvv field.
  48 #       I: Immediate value encoded in the instruction.
  49 #       J: The instruction encoding includes a relative offset that is added to
  50 #          the rIP.
  51 #       L: YMM or XMM register specified using the most-significant 4 bits of
  52 #          the last byte of the instruction.  In legacy or compatibility mode
  53 #          the most significant bit is ignored.
  54 #       M: A memory operand specified by the {mod, r/m} field of the ModRM byte.
  55 #          ModRM.mod != 11b.
  56 #       N: 64-bit MMX register specified by the ModRM.r/m field. The ModRM.mod
  57 #          field must be 11b.
  58 #       O: The offset of an operand is encoded in the instruction. There is no
  59 #          ModRM byte in the instruction encoding. Indexed register-indirect
  60 #          addressing using the SIB byte is not supported.
  61 #       P: 64-bit MMX register specified by the ModRM.reg field.
  62 #       Q: 64-bit MMX-register or memory operand specified by the {mod, r/m}
  63 #          field of the ModRM byte.  For memory operands, the ModRM byte may
  64 #          be followed by a SIB byte to specify one of the indexed
  65 #          register-indirect addressing forms.
  66 #       R: General purpose register specified by the ModRM.r/m field.
  67 #          The ModRM.mod field must be 11b.
  68 #       S: Segment register specified by the ModRM.reg field.
  69 #       U: YMM/XMM register specified by the ModRM.r/m field.
  70 #          The ModRM.mod field must be 11b.
  71 #       V: YMM/XMM register specified by the ModRM.reg field.
  72 #       W: YMM/XMM register or memory operand specified by the {mod, r/m} field
  73 #          of the ModRM byte.   For memory operands, the ModRM byte may be
  74 #          followed by a SIB byte to specify one of the indexed
  75 #          register-indirect addressing forms.
  76 #       X: A memory operand addressed by the %ds:%[er]si registers. Used in
  77 #          string instructions.
  78 #       Y: A memory operand addressed by the %es:%[er]di registers. Used in
  79 #          string instructions.
  80 #      Possible sizes:
  81 #       (no size provided):
  82 #             A byte, word, doubleword, or quadword (in 64-bit mode),
  83 #             depending on the effective operand size.
  84 #       2:    Two bits (see VPERMIL2Px instruction).
  85 #       7:    x87 register %st(N).
  86 #       b:    A byte, irrespective of the effective operand size.
  87 #       d:    A doubleword (32-bit), irrespective of the effective operand size.
  88 #       do:   A double octword (256 bits), irrespective of the effective operand
  89 #             size.
  90 #       dq:   A double quadword (128 bits), irrespective of the effective
  91 #             operand size.
  92 #       fq:   A quadra quadword (256 bits), irrespective of the effective
  93 #             operand size.
  94 #       o:    An octword (128 bits), irrespective of the effective operand size.
  95 #       p:    A 32-bit or 48-bit far pointer, depending on the effective operand
  96 #             size.
  97 #       pb:   A Vector with byte-wide (8-bit) elements (packed byte).
  98 #       pd:   A double-precision (64-bit) floating-point vector operand (packed
  99 #             double-precision).
 100 #       pdw:  Vector composed of 32-bit doublewords.
 101 #       pdwx: Vector composed of 32-bit doublewords. L bit selects 256bit YMM
 102 #             registers.
 103 #       pdx:  A double-precision (64-bit) floating-point vector operand (packed
 104 #             double-precision).  L bit selects 256bit YMM registers.
 105 #       ph:   A half-precision (16-bit) floating-point vector operand (packed
 106 #             half-precision).
 107 #       phx:  A half-precision (16-bit) floating-point vector operand (packed
 108 #             half-precision).  L bit selects 256bit YMM registers.
 109 #       pi:   Vector composed of 16-bit integers (packed integer).
 110 #       pj:   Vector composed of 32-bit integers (packed double integer).
 111 #       pjx:  Vector composed of 32-bit integers (packed double integer).
 112 #             L bit selects 256bit YMM registers.
 113 #       pk:   Vector composed of 8-bit integers (packed half-word integer).
 114 #       pkx:  Vector composed of 8-bit integers (packed half-word integer).
 115 #             L bit selects 256bit YMM registers.
 116 #       pq:   Vector composed of 64-bit integers (packed quadword integer).
 117 #       pqw:  Vector composed of 64-bit quadwords (packed quadword).
 118 #       pqwx: Vector composed of 64-bit quadwords (packed quadword).  L bit
 119 #             selects 256bit YMM registers.
 120 #       pqx:  Vector composed of 64-bit integers (packed quadword integer).
 121 #             L bit selects 256bit YMM registers.
 122 #       ps:   A single-precision floating-point vector operand (packed
 123 #             single-precision).
 124 #       psx:  A single-precision floating-point vector operand (packed
 125 #             single-precision).  L bit selects 256bit YMM registers.
 126 #       pw:   Vector composed of 16-bit words (packed word).
 127 #       q:    A quadword (64-bit), irrespective of the effective operand size.
 128 #       r:    Register size (32bit in 32bit mode, 64bit in 64bit mode).
 129 #       s:    Segment register (if register operand).
 130 #       s:    A 6-byte or 10-byte pseudo-descriptor (if memory operand).
 131 #       sb:   A scalar 10-byte packed BCD value (scalar BCD).
 132 #       sd:   A scalar double-precision floating-point operand (scalar double).
 133 #       se:   A 14-byte or 28-byte x87 environment.
 134 #       si:   A scalar doubleword (32-bit) integer operand (scalar integer).
 135 #       sq:   A scalar quadword (64-bit) integer operand (scalar integer).
 136 #       sr:   A 94-byte or 108-byte x87 state.
 137 #       ss:   A scalar single-precision floating-point operand (scalar single).
 138 #       st:   A scalar 80bit-precision floating-point operand (scalar tenbytes).
 139 #       sw:   A scalar word (16-bit) integer operand (scalar integer).
 140 #       sx:   A 512-byte extended x87/MMX/XMM state.
 141 #       v:    A word, doubleword, or quadword (in 64-bit mode), depending on
 142 #             the effective operand size.
 143 #       w:    A word, irrespective of the effective operand size.
 144 #       x:    Instruction supports both vector sizes (128 bits or 256 bits).
 145 #             Size is encoded using the VEX/XOP.L field. (L=0: 128 bits;
 146 #             L=1: 256 bits). Usually this symbol is appended to ps or pd, but
 147 #             sometimes it is used alone. For gen_dfa psx, pdx and x
 148 #             are the same.
 149 #       y:    A doubleword or quadword depending on effective operand size.
 150 #       z:    A word if the effective operand size is 16 bits, or a doubleword
 151 #             if the effective operand size is 32 or 64 bits.
 152 #      Implicit argument mark:
  153 #       *: This argument is implicit. It's not shown in the disassembly listing.
 154 ################################################################################
 155 # Second column: instruction opcodes.
  156 #   Includes all opcode bytes.  If the first opcode byte is 0x66/data16,
  157 #   0xf2/repnz, or 0xf3/rep/repz then it can be moved before other prefixes
 158 #   (and will be moved before REX prefix if it's allowed).  Note: data16, repnz,
 159 #   and rep/repz opcodes will set appropriate flags while 0x66, 0xf2, and 0xf3
 160 #   will not.
 161 #   If part of the opcode is stored in ModRM byte then opcode should include the
 162 #   usual "/0", "/1", ..., "/7" "bytes".
 163 #   For VEX/XOP instructions it is expected that first three opcode bytes are
 164 #   specified in the following form:
 165 #     0xc4 (or 0x8f)
 166 #     RXB.<map_select>
 167 #     <W>.<vvvv>.<L>.<pp>
 168 #   (so they describe long form of VEX prefix; short form is deduced
 169 #   automatically when appropriate)
 170 ################################################################################
 171 # Third column: additional instruction notes.
  172 #   Different kinds of notes for the instruction: non-typical prefixes (for
 173 #   example "lock" prefix or "rep" prefix), CPUID checks, etc.
 174 #
 175 #     Possible prefixes:
 176 #       branch_hint: branch hint prefixes are allowed (0x2E, 0x3E)
 177 #       condrep: prefixes "repnz" and "repz" are allowed for the instruction
 178 #       lock: prefix "lock" is allowed for the instruction
 179 #       rep: prefix "rep" is allowed for the instruction (it's alias of "repz")
 180 #       no_memory_access: command does not access memory in detectable way: lea,
 181 #         nop, prefetch* instructions...
 182 #       norex: "rex" prefix can not be used with this instruction (various "nop"
 183 #         instructions use this flag)
 184 #       norexw: "rex.W" can not be used with this instruction (usually used when
  185 #         instruction with "rex.W" has a different name: e.g. "movd"/"movq")
 186 #
 187 #     Instruction enabling/disabling:
 188 #       ia32: ia32-only instruction
 189 #       amd64: amd64-only instruction
 190 #       nacl-forbidden: instruction is not supported in NaCl sandbox
 191 #       nacl-ia32-forbidden: instruction is not supported in ia32 NaCl sandbox
 192 #       nacl-amd64-forbidden: instruction is not supported in amd64 NaCl sandbox
 193 #       disabled_untested: instruction is disabled because it is not tested yet.
 194 #
 195 #     Special marks:
 196 #       nacl-amd64-zero-extends: instruction can be used to zero-extend register
 197 #         in amd64 mode
 198 #       nacl-amd64-modifiable: instruction can be modified in amd64 mode
 199 #       att-show-name-suffix-{b,l,ll,t,s,q,x,y,w}: instruction is shown with the
 200 #         given suffix by objdump in AT&T mode
 201 #
 202 #     CPU features are defined in validator_internal.h.
 203 ################################################################################
 204
 205
 206 # Technically, columns are separated with mere ',' followed by spaces for
 207 # readability, but there are quoted instruction names that include commas
 208 # not followed by spaces (see nops.def).
 209 # For simplicity I choose to rely on this coincidence and use split-based parser
 210 # instead of proper recursive descent one.
 211 # If by accident somebody put ', ' in quoted instruction name, it will fail
 212 # loudly, because closing quote then will fall into second or third column and
 213 # will cause parse error.
 214 # TODO(shcherbina): use for column separator something that is never encountered
 215 # in columns, like semicolon?
 216 COLUMN_SEPARATOR = ', '
 217
 218
 219 SUPPORTED_ATTRIBUTES = [
 220     # Parsing attributes.
 221     'branch_hint',
 222     'condrep',
 223     'lock',
 224     'no_memory_access',
 225     'norex',
 226     'norexw',
 227     'rep',
 228
 229     # CPUID attributes.
 230     'CPUFeature_3DNOW',
 231     'CPUFeature_3DPRFTCH',
 232     'CPUFeature_AES',
 233     'CPUFeature_AESAVX',
 234     'CPUFeature_ALTMOVCR8',
 235     'CPUFeature_AVX',
 236     'CPUFeature_BMI1',
 237     'CPUFeature_CLFLUSH',
 238     'CPUFeature_CLMUL',
 239     'CPUFeature_CLMULAVX',
 240     'CPUFeature_CMOV',
 241     'CPUFeature_CMOVx87',
 242     'CPUFeature_CX16',
 243     'CPUFeature_CX8',
 244     'CPUFeature_E3DNOW',
 245     'CPUFeature_EMMX',
 246     'CPUFeature_EMMXSSE',
 247     'CPUFeature_F16C',
 248     'CPUFeature_FMA',
 249     'CPUFeature_FMA4',
 250     'CPUFeature_FXSR',
 251     'CPUFeature_LAHF',
 252     'CPUFeature_LWP',
 253     'CPUFeature_LZCNT',
 254     'CPUFeature_MMX',
 255     'CPUFeature_MON',
 256     'CPUFeature_MOVBE',
 257     'CPUFeature_MSR',
 258     'CPUFeature_POPCNT',
 259     'CPUFeature_SEP',
 260     'CPUFeature_SFENCE',
 261     'CPUFeature_SKINIT',
 262     'CPUFeature_SSE',
 263     'CPUFeature_SSE2',
 264     'CPUFeature_SSE3',
 265     'CPUFeature_SSE41',
 266     'CPUFeature_SSE42',
 267     'CPUFeature_SSE4A',
 268     'CPUFeature_SSSE3',
 269     'CPUFeature_SVM',
 270     'CPUFeature_SYSCALL',
 271     'CPUFeature_TBM',
 272     'CPUFeature_TSC',
 273     'CPUFeature_TSCP',
 274     'CPUFeature_TZCNT',
 275     'CPUFeature_x87',
 276     'CPUFeature_XOP',
 277
 278     # Attributes for enabling/disabling based on architecture and validity.
 279     'ia32',
 280     'amd64',
 281     'nacl-ia32-forbidden',
 282     'nacl-amd64-forbidden',
 283     'nacl-forbidden',
 284     'nacl-amd64-zero-extends',
 285     'nacl-amd64-modifiable',
 286     'disabled_untested',
 287
 288     # AT&T Decoder attributes.
 289     'att-show-name-suffix-b',
 290     'att-show-name-suffix-l',
 291     'att-show-name-suffix-ll',
 292     'att-show-name-suffix-t',
 293     'att-show-name-suffix-s',
 294     'att-show-name-suffix-q',
 295     'att-show-name-suffix-x',
 296     'att-show-name-suffix-y',
 297     'att-show-name-suffix-w',
 298 ]
 299
 300
 301 class OperandReadWriteMode(object):
 302   UNUSED = '\''
 303   READ = '='
 304   WRITE = '!'
 305   READ_WRITE = '&'
 306
 307
 308 class OperandType(object):
 309   AX = 'a'
 310   CX = 'c'
 311   DX = 'd'
 312
 313   IMMEDIATE = 'I'
 314   SECOND_IMMEDIATE = 'i'
 315
 316   CONTROL_REGISTER = 'C'  # in ModRM.reg
 317   DEBUG_REGISTER = 'D'  # in ModRM.reg
 318
 319   REGISTER_IN_OPCODE = 'r'
 320   X87_REGISTER_IN_OPCODE = 'f'
 321
 322   X87_ST = 't'  # st0 that objdump displays as 'st'
 323
 324   ABSOLUTE_DISP = 'O'
 325
 326   RELATIVE_TARGET = 'J'
 327
 328   REGISTER_IN_RM = 'R'
 329   REGISTER_IN_REG = 'G'
 330   REGISTER_OR_MEMORY = 'E'  # in ModRM.mod and .r/m
 331   MEMORY = 'M'  # in ModRM.mod and .r/m
 332   SEGMENT_REGISTER_IN_REG = 'S'
 333
 334   MMX_REGISTER_IN_RM = 'N'
 335   MMX_REGISTER_IN_REG = 'P'
 336   MMX_REGISTER_OR_MEMORY = 'Q'  # in ModRM.mod and .r/m
 337
 338   XMM_REGISTER_IN_RM = 'U'
 339   XMM_REGISTER_IN_REG = 'V'
 340   XMM_REGISTER_OR_MEMORY = 'W'  # in ModRM.mod and .r/m
 341
 342   XMM_REGISTER_IN_LAST_BYTE = 'L'  # most-significant 4 bits
 343
 344   DS_SI = 'X'
 345   ES_DI = 'Y'
 346   DS_BX = 'x'
 347
 348   REGISTER_IN_VVVV = 'B'
 349   XMM_REGISTER_IN_VVVV = 'H'
 350
 351   PORT_IN_DX = 'o'
 352
 353
 354 ALL_OPERAND_TYPES = set(
 355     v for k, v in OperandType.__dict__.items() if not k.startswith('__'))