aco/gfx11: optimize dual source export
[platform/upstream/mesa.git] / src / amd / compiler / aco_validate.cpp
1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24
25 #include "aco_ir.h"
26
27 #include "util/memstream.h"
28 #include "util/ralloc.h"
29
30 #include <array>
31 #include <map>
32 #include <set>
33 #include <vector>
34
35 namespace aco {
36
37 static void
38 aco_log(Program* program, enum aco_compiler_debug_level level, const char* prefix, const char* file,
39         unsigned line, const char* fmt, va_list args)
40 {
41    char* msg;
42
43    if (program->debug.shorten_messages) {
44       msg = ralloc_vasprintf(NULL, fmt, args);
45    } else {
46       msg = ralloc_strdup(NULL, prefix);
47       ralloc_asprintf_append(&msg, "    In file %s:%u\n", file, line);
48       ralloc_asprintf_append(&msg, "    ");
49       ralloc_vasprintf_append(&msg, fmt, args);
50    }
51
52    if (program->debug.func)
53       program->debug.func(program->debug.private_data, level, msg);
54
55    fprintf(program->debug.output, "%s\n", msg);
56
57    ralloc_free(msg);
58 }
59
60 void
61 _aco_perfwarn(Program* program, const char* file, unsigned line, const char* fmt, ...)
62 {
63    va_list args;
64
65    va_start(args, fmt);
66    aco_log(program, ACO_COMPILER_DEBUG_LEVEL_PERFWARN, "ACO PERFWARN:\n", file, line, fmt, args);
67    va_end(args);
68 }
69
70 void
71 _aco_err(Program* program, const char* file, unsigned line, const char* fmt, ...)
72 {
73    va_list args;
74
75    va_start(args, fmt);
76    aco_log(program, ACO_COMPILER_DEBUG_LEVEL_ERROR, "ACO ERROR:\n", file, line, fmt, args);
77    va_end(args);
78 }
79
80 bool
81 validate_ir(Program* program)
82 {
83    bool is_valid = true;
84    auto check = [&program, &is_valid](bool success, const char* msg,
85                                       aco::Instruction* instr) -> void
86    {
87       if (!success) {
88          char* out;
89          size_t outsize;
90          struct u_memstream mem;
91          u_memstream_open(&mem, &out, &outsize);
92          FILE* const memf = u_memstream_get(&mem);
93
94          fprintf(memf, "%s: ", msg);
95          aco_print_instr(program->gfx_level, instr, memf);
96          u_memstream_close(&mem);
97
98          aco_err(program, "%s", out);
99          free(out);
100
101          is_valid = false;
102       }
103    };
104
105    for (Block& block : program->blocks) {
106       for (aco_ptr<Instruction>& instr : block.instructions) {
107
108          /* check base format */
109          Format base_format = instr->format;
110          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::SDWA);
111          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP16);
112          base_format = (Format)((uint32_t)base_format & ~(uint32_t)Format::DPP8);
113          if ((uint32_t)base_format & (uint32_t)Format::VOP1)
114             base_format = Format::VOP1;
115          else if ((uint32_t)base_format & (uint32_t)Format::VOP2)
116             base_format = Format::VOP2;
117          else if ((uint32_t)base_format & (uint32_t)Format::VOPC)
118             base_format = Format::VOPC;
119          else if ((uint32_t)base_format & (uint32_t)Format::VINTRP) {
120             if (instr->opcode == aco_opcode::v_interp_p1ll_f16 ||
121                 instr->opcode == aco_opcode::v_interp_p1lv_f16 ||
122                 instr->opcode == aco_opcode::v_interp_p2_legacy_f16 ||
123                 instr->opcode == aco_opcode::v_interp_p2_f16) {
124                /* v_interp_*_fp16 are considered VINTRP by the compiler but
125                 * they are emitted as VOP3.
126                 */
127                base_format = Format::VOP3;
128             } else {
129                base_format = Format::VINTRP;
130             }
131          }
132          check(base_format == instr_info.format[(int)instr->opcode],
133                "Wrong base format for instruction", instr.get());
134
135          /* check VOP3 modifiers */
136          if (instr->isVOP3() && withoutDPP(instr->format) != Format::VOP3) {
137             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
138                      base_format == Format::VOPC || base_format == Format::VINTRP,
139                   "Format cannot have VOP3/VOP3B applied", instr.get());
140          }
141
142          if (instr->isDPP()) {
143             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
144                      base_format == Format::VOPC || base_format == Format::VOP3 ||
145                      base_format == Format::VOP3P,
146                   "Format cannot have DPP applied", instr.get());
147             check((!instr->isVOP3() && !instr->isVOP3P()) || program->gfx_level >= GFX11,
148                   "VOP3+DPP is GFX11+ only", instr.get());
149
150             bool fi =
151                instr->isDPP8() ? instr->dpp8().fetch_inactive : instr->dpp16().fetch_inactive;
152             check(!fi || program->gfx_level >= GFX10, "DPP Fetch-Inactive is GFX10+ only",
153                   instr.get());
154          }
155
156          /* check SDWA */
157          if (instr->isSDWA()) {
158             check(base_format == Format::VOP2 || base_format == Format::VOP1 ||
159                      base_format == Format::VOPC,
160                   "Format cannot have SDWA applied", instr.get());
161
162             check(program->gfx_level >= GFX8, "SDWA is GFX8 to GFX10.3 only", instr.get());
163             check(program->gfx_level < GFX11, "SDWA is GFX8 to GFX10.3 only", instr.get());
164
165             SDWA_instruction& sdwa = instr->sdwa();
166             check(sdwa.omod == 0 || program->gfx_level >= GFX9, "SDWA omod only supported on GFX9+",
167                   instr.get());
168             if (base_format == Format::VOPC) {
169                check(sdwa.clamp == false || program->gfx_level == GFX8,
170                      "SDWA VOPC clamp only supported on GFX8", instr.get());
171                check((instr->definitions[0].isFixed() && instr->definitions[0].physReg() == vcc) ||
172                         program->gfx_level >= GFX9,
173                      "SDWA+VOPC definition must be fixed to vcc on GFX8", instr.get());
174             } else {
175                const Definition& def = instr->definitions[0];
176                check(def.bytes() <= 4, "SDWA definitions must not be larger than 4 bytes",
177                      instr.get());
178                check(def.bytes() >= sdwa.dst_sel.size() + sdwa.dst_sel.offset(),
179                      "SDWA definition selection size must be at most definition size", instr.get());
180                check(
181                   sdwa.dst_sel.size() == 1 || sdwa.dst_sel.size() == 2 || sdwa.dst_sel.size() == 4,
182                   "SDWA definition selection size must be 1, 2 or 4 bytes", instr.get());
183                check(sdwa.dst_sel.offset() % sdwa.dst_sel.size() == 0, "Invalid selection offset",
184                      instr.get());
185                check(def.bytes() == 4 || def.bytes() == sdwa.dst_sel.size(),
186                      "SDWA dst_sel size must be definition size for subdword definitions",
187                      instr.get());
188                check(def.bytes() == 4 || sdwa.dst_sel.offset() == 0,
189                      "SDWA dst_sel offset must be 0 for subdword definitions", instr.get());
190             }
191
192             for (unsigned i = 0; i < std::min<unsigned>(2, instr->operands.size()); i++) {
193                const Operand& op = instr->operands[i];
194                check(op.bytes() <= 4, "SDWA operands must not be larger than 4 bytes", instr.get());
195                check(op.bytes() >= sdwa.sel[i].size() + sdwa.sel[i].offset(),
196                      "SDWA operand selection size must be at most operand size", instr.get());
197                check(sdwa.sel[i].size() == 1 || sdwa.sel[i].size() == 2 || sdwa.sel[i].size() == 4,
198                      "SDWA operand selection size must be 1, 2 or 4 bytes", instr.get());
199                check(sdwa.sel[i].offset() % sdwa.sel[i].size() == 0, "Invalid selection offset",
200                      instr.get());
201             }
202             if (instr->operands.size() >= 3) {
203                check(instr->operands[2].isFixed() && instr->operands[2].physReg() == vcc,
204                      "3rd operand must be fixed to vcc with SDWA", instr.get());
205             }
206             if (instr->definitions.size() >= 2) {
207                check(instr->definitions[1].isFixed() && instr->definitions[1].physReg() == vcc,
208                      "2nd definition must be fixed to vcc with SDWA", instr.get());
209             }
210
211             const bool sdwa_opcodes =
212                instr->opcode != aco_opcode::v_fmac_f32 && instr->opcode != aco_opcode::v_fmac_f16 &&
213                instr->opcode != aco_opcode::v_fmamk_f32 &&
214                instr->opcode != aco_opcode::v_fmaak_f32 &&
215                instr->opcode != aco_opcode::v_fmamk_f16 &&
216                instr->opcode != aco_opcode::v_fmaak_f16 &&
217                instr->opcode != aco_opcode::v_madmk_f32 &&
218                instr->opcode != aco_opcode::v_madak_f32 &&
219                instr->opcode != aco_opcode::v_madmk_f16 &&
220                instr->opcode != aco_opcode::v_madak_f16 &&
221                instr->opcode != aco_opcode::v_readfirstlane_b32 &&
222                instr->opcode != aco_opcode::v_clrexcp && instr->opcode != aco_opcode::v_swap_b32;
223
224             const bool feature_mac =
225                program->gfx_level == GFX8 &&
226                (instr->opcode == aco_opcode::v_mac_f32 && instr->opcode == aco_opcode::v_mac_f16);
227
228             check(sdwa_opcodes || feature_mac, "SDWA can't be used with this opcode", instr.get());
229          }
230
231          /* check opsel */
232          if (instr->isVOP3() || instr->isVOP1() || instr->isVOP2() || instr->isVOPC()) {
233             VALU_instruction& valu = instr->valu();
234             check(valu.opsel == 0 || program->gfx_level >= GFX9, "Opsel is only supported on GFX9+",
235                   instr.get());
236             check(valu.opsel == 0 || instr->format == Format::VOP3 || program->gfx_level >= GFX11,
237                   "Opsel is only supported for VOP3 before GFX11", instr.get());
238
239             for (unsigned i = 0; i < 3; i++) {
240                if (i >= instr->operands.size() ||
241                    (!instr->isVOP3() && !instr->operands[i].isOfType(RegType::vgpr)) ||
242                    (instr->operands[i].hasRegClass() &&
243                     instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed()))
244                   check(!valu.opsel[i], "Unexpected opsel for operand", instr.get());
245             }
246             if (instr->definitions[0].regClass().is_subdword() && !instr->definitions[0].isFixed())
247                check(!valu.opsel[3], "Unexpected opsel for sub-dword definition", instr.get());
248          } else if (instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
249                     instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
250                     instr->opcode == aco_opcode::v_fma_mix_f32) {
251             check(instr->definitions[0].regClass() ==
252                      (instr->opcode == aco_opcode::v_fma_mix_f32 ? v1 : v2b),
253                   "v_fma_mix_f32/v_fma_mix_f16 must have v1/v2b definition", instr.get());
254          } else if (instr->isVOP3P()) {
255             VALU_instruction& vop3p = instr->valu();
256             for (unsigned i = 0; i < instr->operands.size(); i++) {
257                if (instr->operands[i].hasRegClass() &&
258                    instr->operands[i].regClass().is_subdword() && !instr->operands[i].isFixed())
259                   check(!vop3p.opsel_lo[i] && !vop3p.opsel_hi[i],
260                         "Unexpected opsel for subdword operand", instr.get());
261             }
262             check(instr->definitions[0].regClass() == v1, "VOP3P must have v1 definition",
263                   instr.get());
264          }
265
266          /* check for undefs */
267          for (unsigned i = 0; i < instr->operands.size(); i++) {
268             if (instr->operands[i].isUndefined()) {
269                bool flat = instr->isFlatLike();
270                bool can_be_undef = is_phi(instr) || instr->isEXP() || instr->isReduction() ||
271                                    instr->opcode == aco_opcode::p_create_vector ||
272                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
273                                    instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
274                                    instr->opcode == aco_opcode::p_end_with_regs ||
275                                    (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
276                                    (instr->opcode == aco_opcode::p_bpermute_permlane && i == 0) ||
277                                    (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
278                                    ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
279                                    (instr->isScratch() && i == 0) || (instr->isDS() && i == 0) ||
280                                    (instr->opcode == aco_opcode::p_init_scratch && i == 0);
281                check(can_be_undef, "Undefs can only be used in certain operands", instr.get());
282             } else {
283                check(instr->operands[i].isFixed() || instr->operands[i].isTemp() ||
284                         instr->operands[i].isConstant(),
285                      "Uninitialized Operand", instr.get());
286             }
287          }
288
289          /* check subdword definitions */
290          for (unsigned i = 0; i < instr->definitions.size(); i++) {
291             if (instr->definitions[i].regClass().is_subdword())
292                check(instr->definitions[i].bytes() <= 4 || instr->isPseudo() || instr->isVMEM(),
293                      "Only Pseudo and VMEM instructions can write subdword registers > 4 bytes",
294                      instr.get());
295          }
296
297          if ((instr->isSALU() && instr->opcode != aco_opcode::p_constaddr_addlo &&
298               instr->opcode != aco_opcode::p_resumeaddr_addlo) ||
299              instr->isVALU()) {
300             /* check literals */
301             Operand literal(s1);
302             for (unsigned i = 0; i < instr->operands.size(); i++) {
303                Operand op = instr->operands[i];
304                if (!op.isLiteral())
305                   continue;
306
307                check(!instr->isDPP() && !instr->isSDWA() &&
308                         (!instr->isVOP3() || program->gfx_level >= GFX10) &&
309                         (!instr->isVOP3P() || program->gfx_level >= GFX10),
310                      "Literal applied on wrong instruction format", instr.get());
311
312                check(literal.isUndefined() || (literal.size() == op.size() &&
313                                                literal.constantValue() == op.constantValue()),
314                      "Only 1 Literal allowed", instr.get());
315                literal = op;
316                check(instr->isSALU() || instr->isVOP3() || instr->isVOP3P() || i == 0 || i == 2,
317                      "Wrong source position for Literal argument", instr.get());
318             }
319
320             /* check num sgprs for VALU */
321             if (instr->isVALU()) {
322                bool is_shift64 = instr->opcode == aco_opcode::v_lshlrev_b64 ||
323                                  instr->opcode == aco_opcode::v_lshrrev_b64 ||
324                                  instr->opcode == aco_opcode::v_ashrrev_i64;
325                unsigned const_bus_limit = 1;
326                if (program->gfx_level >= GFX10 && !is_shift64)
327                   const_bus_limit = 2;
328
329                uint32_t scalar_mask =
330                   instr->isVOP3() || instr->isVOP3P() || instr->isVINTERP_INREG() ? 0x7 : 0x5;
331                if (instr->isSDWA())
332                   scalar_mask = program->gfx_level >= GFX9 ? 0x7 : 0x4;
333                else if (instr->isDPP())
334                   scalar_mask = 0x4;
335
336                if (instr->isVOPC() || instr->opcode == aco_opcode::v_readfirstlane_b32 ||
337                    instr->opcode == aco_opcode::v_readlane_b32 ||
338                    instr->opcode == aco_opcode::v_readlane_b32_e64) {
339                   check(instr->definitions[0].regClass().type() == RegType::sgpr,
340                         "Wrong Definition type for VALU instruction", instr.get());
341                } else {
342                   check(instr->definitions[0].regClass().type() == RegType::vgpr,
343                         "Wrong Definition type for VALU instruction", instr.get());
344                }
345
346                unsigned num_sgprs = 0;
347                unsigned sgpr[] = {0, 0};
348                for (unsigned i = 0; i < instr->operands.size(); i++) {
349                   Operand op = instr->operands[i];
350                   if (instr->opcode == aco_opcode::v_readfirstlane_b32 ||
351                       instr->opcode == aco_opcode::v_readlane_b32 ||
352                       instr->opcode == aco_opcode::v_readlane_b32_e64) {
353                      check(i != 1 || op.isOfType(RegType::sgpr) || op.isConstant(),
354                            "Must be a SGPR or a constant", instr.get());
355                      check(i == 1 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
356                            "Wrong Operand type for VALU instruction", instr.get());
357                      continue;
358                   }
359                   if (instr->opcode == aco_opcode::v_permlane16_b32 ||
360                       instr->opcode == aco_opcode::v_permlanex16_b32) {
361                      check(i != 0 || op.isOfType(RegType::vgpr),
362                            "Operand 0 of v_permlane must be VGPR", instr.get());
363                      check(i == 0 || op.isOfType(RegType::sgpr) || op.isConstant(),
364                            "Lane select operands of v_permlane must be SGPR or constant",
365                            instr.get());
366                   }
367
368                   if (instr->opcode == aco_opcode::v_writelane_b32 ||
369                       instr->opcode == aco_opcode::v_writelane_b32_e64) {
370                      check(i != 2 || (op.isOfType(RegType::vgpr) && op.bytes() <= 4),
371                            "Wrong Operand type for VALU instruction", instr.get());
372                      check(i == 2 || op.isOfType(RegType::sgpr) || op.isConstant(),
373                            "Must be a SGPR or a constant", instr.get());
374                      continue;
375                   }
376                   if (op.isOfType(RegType::sgpr)) {
377                      check(scalar_mask & (1 << i), "Wrong source position for SGPR argument",
378                            instr.get());
379
380                      if (op.tempId() != sgpr[0] && op.tempId() != sgpr[1]) {
381                         if (num_sgprs < 2)
382                            sgpr[num_sgprs++] = op.tempId();
383                      }
384                   }
385
386                   if (op.isConstant() && !op.isLiteral())
387                      check(scalar_mask & (1 << i), "Wrong source position for constant argument",
388                            instr.get());
389                }
390                check(num_sgprs + (literal.isUndefined() ? 0 : 1) <= const_bus_limit,
391                      "Too many SGPRs/literals", instr.get());
392
393                /* Validate modifiers. */
394                check(!instr->valu().opsel || instr->isVOP3() || instr->isVOP1() ||
395                         instr->isVOP2() || instr->isVOPC() || instr->isVINTERP_INREG(),
396                      "OPSEL set for unsupported instruction format", instr.get());
397                check(!instr->valu().opsel_lo || instr->isVOP3P(),
398                      "OPSEL_LO set for unsupported instruction format", instr.get());
399                check(!instr->valu().opsel_hi || instr->isVOP3P(),
400                      "OPSEL_HI set for unsupported instruction format", instr.get());
401                check(!instr->valu().omod || instr->isVOP3() || instr->isSDWA(),
402                      "OMOD set for unsupported instruction format", instr.get());
403                check(!instr->valu().clamp || instr->isVOP3() || instr->isVOP3P() ||
404                         instr->isSDWA() || instr->isVINTERP_INREG(),
405                      "CLAMP set for unsupported instruction format", instr.get());
406
407                for (bool abs : instr->valu().abs) {
408                   check(!abs || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
409                            instr->isDPP16(),
410                         "ABS/NEG_HI set for unsupported instruction format", instr.get());
411                }
412                for (bool neg : instr->valu().neg) {
413                   check(!neg || instr->isVOP3() || instr->isVOP3P() || instr->isSDWA() ||
414                            instr->isDPP16() || instr->isVINTERP_INREG(),
415                         "NEG/NEG_LO set for unsupported instruction format", instr.get());
416                }
417             }
418
419             if (instr->isSOP1() || instr->isSOP2()) {
420                if (!instr->definitions.empty())
421                   check(instr->definitions[0].regClass().type() == RegType::sgpr,
422                         "Wrong Definition type for SALU instruction", instr.get());
423                for (const Operand& op : instr->operands) {
424                   check(op.isConstant() || op.isOfType(RegType::sgpr),
425                         "Wrong Operand type for SALU instruction", instr.get());
426                }
427             }
428          }
429
430          switch (instr->format) {
431          case Format::PSEUDO: {
432             if (instr->opcode == aco_opcode::p_create_vector) {
433                unsigned size = 0;
434                for (const Operand& op : instr->operands) {
435                   check(op.bytes() < 4 || size % 4 == 0, "Operand is not aligned", instr.get());
436                   size += op.bytes();
437                }
438                check(size == instr->definitions[0].bytes(),
439                      "Definition size does not match operand sizes", instr.get());
440                if (instr->definitions[0].regClass().type() == RegType::sgpr) {
441                   for (const Operand& op : instr->operands) {
442                      check(op.isConstant() || op.regClass().type() == RegType::sgpr,
443                            "Wrong Operand type for scalar vector", instr.get());
444                   }
445                }
446             } else if (instr->opcode == aco_opcode::p_extract_vector) {
447                check(!instr->operands[0].isConstant() && instr->operands[1].isConstant(),
448                      "Wrong Operand types", instr.get());
449                check((instr->operands[1].constantValue() + 1) * instr->definitions[0].bytes() <=
450                         instr->operands[0].bytes(),
451                      "Index out of range", instr.get());
452                check(instr->definitions[0].regClass().type() == RegType::vgpr ||
453                         instr->operands[0].regClass().type() == RegType::sgpr,
454                      "Cannot extract SGPR value from VGPR vector", instr.get());
455                check(program->gfx_level >= GFX9 ||
456                         !instr->definitions[0].regClass().is_subdword() ||
457                         instr->operands[0].regClass().type() == RegType::vgpr,
458                      "Cannot extract subdword from SGPR before GFX9+", instr.get());
459             } else if (instr->opcode == aco_opcode::p_split_vector) {
460                check(!instr->operands[0].isConstant(), "Operand must not be constant", instr.get());
461                unsigned size = 0;
462                for (const Definition& def : instr->definitions) {
463                   size += def.bytes();
464                }
465                check(size == instr->operands[0].bytes(),
466                      "Operand size does not match definition sizes", instr.get());
467                if (instr->operands[0].isOfType(RegType::vgpr)) {
468                   for (const Definition& def : instr->definitions)
469                      check(def.regClass().type() == RegType::vgpr,
470                            "Wrong Definition type for VGPR split_vector", instr.get());
471                } else {
472                   for (const Definition& def : instr->definitions)
473                      check(program->gfx_level >= GFX9 || !def.regClass().is_subdword(),
474                            "Cannot split SGPR into subdword VGPRs before GFX9+", instr.get());
475                }
476             } else if (instr->opcode == aco_opcode::p_parallelcopy) {
477                check(instr->definitions.size() == instr->operands.size(),
478                      "Number of Operands does not match number of Definitions", instr.get());
479                for (unsigned i = 0; i < instr->operands.size(); i++) {
480                   check(instr->definitions[i].bytes() == instr->operands[i].bytes(),
481                         "Operand and Definition size must match", instr.get());
482                   if (instr->operands[i].hasRegClass()) {
483                      check((instr->definitions[i].regClass().type() ==
484                             instr->operands[i].regClass().type()) ||
485                               (instr->definitions[i].regClass().type() == RegType::vgpr &&
486                                instr->operands[i].regClass().type() == RegType::sgpr),
487                            "Operand and Definition types do not match", instr.get());
488                      check(instr->definitions[i].regClass().is_linear_vgpr() ==
489                               instr->operands[i].regClass().is_linear_vgpr(),
490                            "Operand and Definition types do not match", instr.get());
491                   } else {
492                      check(!instr->definitions[i].regClass().is_linear_vgpr(),
493                            "Can only copy linear VGPRs into linear VGPRs, not constant/undef",
494                            instr.get());
495                   }
496                }
497             } else if (instr->opcode == aco_opcode::p_phi) {
498                check(instr->operands.size() == block.logical_preds.size(),
499                      "Number of Operands does not match number of predecessors", instr.get());
500                check(instr->definitions[0].regClass().type() == RegType::vgpr,
501                      "Logical Phi Definition must be vgpr", instr.get());
502                for (const Operand& op : instr->operands)
503                   check(instr->definitions[0].size() == op.size(),
504                         "Operand sizes must match Definition size", instr.get());
505             } else if (instr->opcode == aco_opcode::p_linear_phi) {
506                for (const Operand& op : instr->operands) {
507                   check(!op.isTemp() || op.getTemp().is_linear(), "Wrong Operand type",
508                         instr.get());
509                   check(instr->definitions[0].size() == op.size(),
510                         "Operand sizes must match Definition size", instr.get());
511                }
512                check(instr->operands.size() == block.linear_preds.size(),
513                      "Number of Operands does not match number of predecessors", instr.get());
514             } else if (instr->opcode == aco_opcode::p_extract ||
515                        instr->opcode == aco_opcode::p_insert) {
516                check(!instr->operands[0].isConstant(), "Data operand must not be constant",
517                      instr.get());
518                check(instr->operands[1].isConstant(), "Index must be constant", instr.get());
519                if (instr->opcode == aco_opcode::p_extract)
520                   check(instr->operands[3].isConstant(), "Sign-extend flag must be constant",
521                         instr.get());
522
523                check(instr->definitions[0].regClass().type() != RegType::sgpr ||
524                         instr->operands[0].regClass().type() == RegType::sgpr,
525                      "Can't extract/insert VGPR to SGPR", instr.get());
526
527                if (instr->opcode == aco_opcode::p_insert)
528                   check(instr->operands[0].bytes() == instr->definitions[0].bytes(),
529                         "Sizes of p_insert data operand and definition must match", instr.get());
530
531                if (instr->definitions[0].regClass().type() == RegType::sgpr)
532                   check(instr->definitions.size() >= 2 && instr->definitions[1].isFixed() &&
533                            instr->definitions[1].physReg() == scc,
534                         "SGPR extract/insert needs an SCC definition", instr.get());
535
536                unsigned data_bits = instr->operands[0].bytes() * 8u;
537                unsigned op_bits = instr->operands[2].constantValue();
538
539                if (instr->opcode == aco_opcode::p_insert) {
540                   check(op_bits == 8 || op_bits == 16, "Size must be 8 or 16", instr.get());
541                   check(op_bits < data_bits, "Size must be smaller than source", instr.get());
542                } else if (instr->opcode == aco_opcode::p_extract) {
543                   check(op_bits == 8 || op_bits == 16 || op_bits == 32,
544                         "Size must be 8 or 16 or 32", instr.get());
545                   check(data_bits >= op_bits, "Can't extract more bits than what the data has.",
546                         instr.get());
547                }
548
549                unsigned comp = data_bits / MAX2(op_bits, 1);
550                check(instr->operands[1].constantValue() < comp, "Index must be in-bounds",
551                      instr.get());
552             } else if (instr->opcode == aco_opcode::p_jump_to_epilog) {
553                check(instr->definitions.size() == 0, "p_jump_to_epilog must have 0 definitions",
554                      instr.get());
555                check(instr->operands.size() > 0 && instr->operands[0].isOfType(RegType::sgpr) &&
556                         instr->operands[0].size() == 2,
557                      "First operand of p_jump_to_epilog must be a SGPR", instr.get());
558                for (unsigned i = 1; i < instr->operands.size(); i++) {
559                   check(instr->operands[i].isOfType(RegType::vgpr) ||
560                            instr->operands[i].isOfType(RegType::sgpr) ||
561                            instr->operands[i].isUndefined(),
562                         "Other operands of p_jump_to_epilog must be VGPRs, SGPRs or undef",
563                         instr.get());
564                }
565             } else if (instr->opcode == aco_opcode::p_dual_src_export_gfx11) {
566                check(instr->definitions.size() == 6,
567                      "p_dual_src_export_gfx11 must have 6 definitions", instr.get());
568                check(instr->definitions[2].regClass() == program->lane_mask,
569                      "Third definition of p_dual_src_export_gfx11 must be a lane mask",
570                      instr.get());
571                check(instr->definitions[3].regClass() == program->lane_mask,
572                      "Fourth definition of p_dual_src_export_gfx11 must be a lane mask",
573                      instr.get());
574                check(instr->definitions[4].physReg() == vcc,
575                      "Fifth definition of p_dual_src_export_gfx11 must be vcc", instr.get());
576                check(instr->definitions[5].physReg() == scc,
577                      "Sixth definition of p_dual_src_export_gfx11 must be scc", instr.get());
578                check(instr->operands.size() == 8, "p_dual_src_export_gfx11 must have 8 operands",
579                      instr.get());
580                for (unsigned i = 0; i < instr->operands.size(); i++) {
581                   check(
582                      instr->operands[i].isOfType(RegType::vgpr) || instr->operands[i].isUndefined(),
583                      "Operands of p_dual_src_export_gfx11 must be VGPRs or undef", instr.get());
584                }
585             } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
586                check(instr->definitions.size() == 1, "Must have one definition", instr.get());
587                check(instr->operands.size() <= 1, "Must have one or zero operands", instr.get());
588                if (!instr->definitions.empty())
589                   check(instr->definitions[0].regClass().is_linear_vgpr(),
590                         "Definition must be linear VGPR", instr.get());
591                if (!instr->definitions.empty() && !instr->operands.empty())
592                   check(instr->definitions[0].bytes() == instr->operands[0].bytes(),
593                         "Operand size must match definition", instr.get());
594             }
595             break;
596          }
597          case Format::PSEUDO_REDUCTION: {
598             for (const Operand& op : instr->operands)
599                check(op.regClass().type() == RegType::vgpr,
600                      "All operands of PSEUDO_REDUCTION instructions must be in VGPRs.",
601                      instr.get());
602
603             if (instr->opcode == aco_opcode::p_reduce &&
604                 instr->reduction().cluster_size == program->wave_size)
605                check(instr->definitions[0].regClass().type() == RegType::sgpr ||
606                         program->wave_size == 32,
607                      "The result of unclustered reductions must go into an SGPR.", instr.get());
608             else
609                check(instr->definitions[0].regClass().type() == RegType::vgpr,
610                      "The result of scans and clustered reductions must go into a VGPR.",
611                      instr.get());
612
613             break;
614          }
615          case Format::SMEM: {
616             if (instr->operands.size() >= 1)
617                check(instr->operands[0].isOfType(RegType::sgpr), "SMEM operands must be sgpr",
618                      instr.get());
619             if (instr->operands.size() >= 2)
620                check(instr->operands[1].isConstant() || instr->operands[1].isOfType(RegType::sgpr),
621                      "SMEM offset must be constant or sgpr", instr.get());
622             if (!instr->definitions.empty())
623                check(instr->definitions[0].regClass().type() == RegType::sgpr,
624                      "SMEM result must be sgpr", instr.get());
625             break;
626          }
627          case Format::MTBUF:
628          case Format::MUBUF: {
629             check(instr->operands.size() > 1, "VMEM instructions must have at least one operand",
630                   instr.get());
631             check(instr->operands[1].isOfType(RegType::vgpr),
632                   "VADDR must be in vgpr for VMEM instructions", instr.get());
633             check(instr->operands[0].isOfType(RegType::sgpr), "VMEM resource constant must be sgpr",
634                   instr.get());
635             check(instr->operands.size() < 4 || instr->operands[3].isOfType(RegType::vgpr),
636                   "VMEM write data must be vgpr", instr.get());
637
638             const bool d16 =
639                instr->opcode ==
640                   aco_opcode::buffer_load_dword || // FIXME: used to spill subdword variables
641                instr->opcode == aco_opcode::buffer_load_ubyte ||
642                instr->opcode == aco_opcode::buffer_load_sbyte ||
643                instr->opcode == aco_opcode::buffer_load_ushort ||
644                instr->opcode == aco_opcode::buffer_load_sshort ||
645                instr->opcode == aco_opcode::buffer_load_ubyte_d16 ||
646                instr->opcode == aco_opcode::buffer_load_ubyte_d16_hi ||
647                instr->opcode == aco_opcode::buffer_load_sbyte_d16 ||
648                instr->opcode == aco_opcode::buffer_load_sbyte_d16_hi ||
649                instr->opcode == aco_opcode::buffer_load_short_d16 ||
650                instr->opcode == aco_opcode::buffer_load_short_d16_hi ||
651                instr->opcode == aco_opcode::buffer_load_format_d16_x ||
652                instr->opcode == aco_opcode::buffer_load_format_d16_hi_x ||
653                instr->opcode == aco_opcode::buffer_load_format_d16_xy ||
654                instr->opcode == aco_opcode::buffer_load_format_d16_xyz ||
655                instr->opcode == aco_opcode::buffer_load_format_d16_xyzw ||
656                instr->opcode == aco_opcode::tbuffer_load_format_d16_x ||
657                instr->opcode == aco_opcode::tbuffer_load_format_d16_xy ||
658                instr->opcode == aco_opcode::tbuffer_load_format_d16_xyz ||
659                instr->opcode == aco_opcode::tbuffer_load_format_d16_xyzw;
660             if (instr->definitions.size()) {
661                check(instr->definitions[0].regClass().type() == RegType::vgpr,
662                      "VMEM definitions[0] (VDATA) must be VGPR", instr.get());
663                check(d16 || !instr->definitions[0].regClass().is_subdword(),
664                      "Only D16 opcodes can load subdword values.", instr.get());
665                check(instr->definitions[0].bytes() <= 8 || !d16,
666                      "D16 opcodes can only load up to 8 bytes.", instr.get());
667             }
668             break;
669          }
670          case Format::MIMG: {
671             check(instr->operands.size() >= 4, "MIMG instructions must have at least 4 operands",
672                   instr.get());
673             check(instr->operands[0].hasRegClass() &&
674                      (instr->operands[0].regClass() == s4 || instr->operands[0].regClass() == s8),
675                   "MIMG operands[0] (resource constant) must be in 4 or 8 SGPRs", instr.get());
676             if (instr->operands[1].hasRegClass())
677                check(instr->operands[1].regClass() == s4,
678                      "MIMG operands[1] (sampler constant) must be 4 SGPRs", instr.get());
679             if (!instr->operands[2].isUndefined()) {
680                bool is_cmpswap = instr->opcode == aco_opcode::image_atomic_cmpswap ||
681                                  instr->opcode == aco_opcode::image_atomic_fcmpswap;
682                check(instr->definitions.empty() ||
683                         (instr->definitions[0].regClass() == instr->operands[2].regClass() ||
684                          is_cmpswap),
685                      "MIMG operands[2] (VDATA) must be the same as definitions[0] for atomics and "
686                      "TFE/LWE loads",
687                      instr.get());
688             }
689
690             if (instr->mimg().strict_wqm) {
691                check(instr->operands[3].hasRegClass() &&
692                         instr->operands[3].regClass().is_linear_vgpr(),
693                      "MIMG operands[3] must be temp linear VGPR.", instr.get());
694
695                unsigned total_size = 0;
696                for (unsigned i = 4; i < instr->operands.size(); i++) {
697                   check(instr->operands[i].hasRegClass() && instr->operands[i].regClass() == v1,
698                         "MIMG operands[4+] (VADDR) must be v1", instr.get());
699                   total_size += instr->operands[i].bytes();
700                }
701                check(total_size <= instr->operands[3].bytes(),
702                      "MIMG operands[4+] must fit within operands[3].", instr.get());
703             } else {
704                check(instr->operands.size() == 4 || program->gfx_level >= GFX10,
705                      "NSA is only supported on GFX10+", instr.get());
706                for (unsigned i = 3; i < instr->operands.size(); i++) {
707                   check(instr->operands[i].hasRegClass() &&
708                            instr->operands[i].regClass().type() == RegType::vgpr,
709                         "MIMG operands[3+] (VADDR) must be VGPR", instr.get());
710                   if (instr->operands.size() > 4) {
711                      if (program->gfx_level < GFX11) {
712                         check(instr->operands[i].regClass() == v1,
713                               "GFX10 MIMG VADDR must be v1 if NSA is used", instr.get());
714                      } else {
715                         if (instr->opcode != aco_opcode::image_bvh_intersect_ray &&
716                             instr->opcode != aco_opcode::image_bvh64_intersect_ray && i < 7) {
717                            check(instr->operands[i].regClass() == v1,
718                                  "first 4 GFX11 MIMG VADDR must be v1 if NSA is used", instr.get());
719                         }
720                      }
721                   }
722                }
723             }
724
725             if (instr->definitions.size()) {
726                check(instr->definitions[0].regClass().type() == RegType::vgpr,
727                      "MIMG definitions[0] (VDATA) must be VGPR", instr.get());
728                check(instr->mimg().d16 || !instr->definitions[0].regClass().is_subdword(),
729                      "Only D16 MIMG instructions can load subdword values.", instr.get());
730                check(instr->definitions[0].bytes() <= 8 || !instr->mimg().d16,
731                      "D16 MIMG instructions can only load up to 8 bytes.", instr.get());
732             }
733             break;
734          }
735          case Format::DS: {
736             for (const Operand& op : instr->operands) {
737                check(op.isOfType(RegType::vgpr) || op.physReg() == m0 || op.isUndefined(),
738                      "Only VGPRs are valid DS instruction operands", instr.get());
739             }
740             if (!instr->definitions.empty())
741                check(instr->definitions[0].regClass().type() == RegType::vgpr,
742                      "DS instruction must return VGPR", instr.get());
743             break;
744          }
745          case Format::EXP: {
746             for (unsigned i = 0; i < 4; i++)
747                check(instr->operands[i].isOfType(RegType::vgpr),
748                      "Only VGPRs are valid Export arguments", instr.get());
749             break;
750          }
751          case Format::FLAT:
752             check(instr->operands[1].isUndefined(), "Flat instructions don't support SADDR",
753                   instr.get());
754             FALLTHROUGH;
755          case Format::GLOBAL:
756             check(instr->operands[0].isOfType(RegType::vgpr), "FLAT/GLOBAL address must be vgpr",
757                   instr.get());
758             FALLTHROUGH;
759          case Format::SCRATCH: {
760             check(instr->operands[0].isOfType(RegType::vgpr),
761                   "FLAT/GLOBAL/SCRATCH address must be undefined or vgpr", instr.get());
762             check(instr->operands[1].isOfType(RegType::sgpr),
763                   "FLAT/GLOBAL/SCRATCH sgpr address must be undefined or sgpr", instr.get());
764             if (instr->format == Format::SCRATCH && program->gfx_level < GFX10_3)
765                check(!instr->operands[0].isUndefined() || !instr->operands[1].isUndefined(),
766                      "SCRATCH must have either SADDR or ADDR operand", instr.get());
767             if (!instr->definitions.empty())
768                check(instr->definitions[0].regClass().type() == RegType::vgpr,
769                      "FLAT/GLOBAL/SCRATCH result must be vgpr", instr.get());
770             else
771                check(instr->operands[2].isOfType(RegType::vgpr),
772                      "FLAT/GLOBAL/SCRATCH data must be vgpr", instr.get());
773             break;
774          }
775          case Format::LDSDIR: {
776             check(instr->definitions.size() == 1 && instr->definitions[0].regClass() == v1,
777                   "LDSDIR must have an v1 definition", instr.get());
778             check(instr->operands.size() == 1, "LDSDIR must have an operand", instr.get());
779             if (!instr->operands.empty()) {
780                check(instr->operands[0].regClass() == s1, "LDSDIR must have an s1 operand",
781                      instr.get());
782                check(instr->operands[0].isFixed() && instr->operands[0].physReg() == m0,
783                      "LDSDIR must have an operand fixed to m0", instr.get());
784             }
785             break;
786          }
787          default: break;
788          }
789       }
790    }
791
792    return is_valid;
793 }
794
795 bool
796 validate_cfg(Program* program)
797 {
798    if (!(debug_flags & DEBUG_VALIDATE_IR))
799       return true;
800
801    bool is_valid = true;
802    auto check_block = [&program, &is_valid](bool success, const char* msg,
803                                             aco::Block* block) -> void
804    {
805       if (!success) {
806          aco_err(program, "%s: BB%u", msg, block->index);
807          is_valid = false;
808       }
809    };
810
811    /* validate CFG */
812    for (unsigned i = 0; i < program->blocks.size(); i++) {
813       Block& block = program->blocks[i];
814       check_block(block.index == i, "block.index must match actual index", &block);
815
816       /* predecessors/successors should be sorted */
817       for (unsigned j = 0; j + 1 < block.linear_preds.size(); j++)
818          check_block(block.linear_preds[j] < block.linear_preds[j + 1],
819                      "linear predecessors must be sorted", &block);
820       for (unsigned j = 0; j + 1 < block.logical_preds.size(); j++)
821          check_block(block.logical_preds[j] < block.logical_preds[j + 1],
822                      "logical predecessors must be sorted", &block);
823       for (unsigned j = 0; j + 1 < block.linear_succs.size(); j++)
824          check_block(block.linear_succs[j] < block.linear_succs[j + 1],
825                      "linear successors must be sorted", &block);
826       for (unsigned j = 0; j + 1 < block.logical_succs.size(); j++)
827          check_block(block.logical_succs[j] < block.logical_succs[j + 1],
828                      "logical successors must be sorted", &block);
829
830       /* critical edges are not allowed */
831       if (block.linear_preds.size() > 1) {
832          for (unsigned pred : block.linear_preds)
833             check_block(program->blocks[pred].linear_succs.size() == 1,
834                         "linear critical edges are not allowed", &program->blocks[pred]);
835          for (unsigned pred : block.logical_preds)
836             check_block(program->blocks[pred].logical_succs.size() == 1,
837                         "logical critical edges are not allowed", &program->blocks[pred]);
838       }
839    }
840
841    return is_valid;
842 }
843
844 /* RA validation */
845 namespace {
846
847 struct Location {
848    Location() : block(NULL), instr(NULL) {}
849
850    Block* block;
851    Instruction* instr; // NULL if it's the block's live-in
852 };
853
854 struct Assignment {
855    Location defloc;
856    Location firstloc;
857    PhysReg reg;
858    bool valid;
859 };
860
861 bool
862 ra_fail(Program* program, Location loc, Location loc2, const char* fmt, ...)
863 {
864    va_list args;
865    va_start(args, fmt);
866    char msg[1024];
867    vsprintf(msg, fmt, args);
868    va_end(args);
869
870    char* out;
871    size_t outsize;
872    struct u_memstream mem;
873    u_memstream_open(&mem, &out, &outsize);
874    FILE* const memf = u_memstream_get(&mem);
875
876    fprintf(memf, "RA error found at instruction in BB%d:\n", loc.block->index);
877    if (loc.instr) {
878       aco_print_instr(program->gfx_level, loc.instr, memf);
879       fprintf(memf, "\n%s", msg);
880    } else {
881       fprintf(memf, "%s", msg);
882    }
883    if (loc2.block) {
884       fprintf(memf, " in BB%d:\n", loc2.block->index);
885       aco_print_instr(program->gfx_level, loc2.instr, memf);
886    }
887    fprintf(memf, "\n\n");
888    u_memstream_close(&mem);
889
890    aco_err(program, "%s", out);
891    free(out);
892
893    return true;
894 }
895
896 bool
897 validate_subdword_operand(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr,
898                           unsigned index)
899 {
900    Operand op = instr->operands[index];
901    unsigned byte = op.physReg().byte();
902
903    if (instr->opcode == aco_opcode::p_as_uniform)
904       return byte == 0;
905    if (instr->isPseudo() && gfx_level >= GFX8)
906       return true;
907    if (instr->isSDWA())
908       return byte + instr->sdwa().sel[index].offset() + instr->sdwa().sel[index].size() <= 4 &&
909              byte % instr->sdwa().sel[index].size() == 0;
910    if (instr->isVOP3P()) {
911       bool fma_mix = instr->opcode == aco_opcode::v_fma_mixlo_f16 ||
912                      instr->opcode == aco_opcode::v_fma_mixhi_f16 ||
913                      instr->opcode == aco_opcode::v_fma_mix_f32;
914       return instr->valu().opsel_lo[index] == (byte >> 1) &&
915              instr->valu().opsel_hi[index] == (fma_mix || (byte >> 1));
916    }
917    if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, index))
918       return true;
919
920    switch (instr->opcode) {
921    case aco_opcode::v_cvt_f32_ubyte1:
922       if (byte == 1)
923          return true;
924       break;
925    case aco_opcode::v_cvt_f32_ubyte2:
926       if (byte == 2)
927          return true;
928       break;
929    case aco_opcode::v_cvt_f32_ubyte3:
930       if (byte == 3)
931          return true;
932       break;
933    case aco_opcode::ds_write_b8_d16_hi:
934    case aco_opcode::ds_write_b16_d16_hi:
935       if (byte == 2 && index == 1)
936          return true;
937       break;
938    case aco_opcode::buffer_store_byte_d16_hi:
939    case aco_opcode::buffer_store_short_d16_hi:
940    case aco_opcode::buffer_store_format_d16_hi_x:
941       if (byte == 2 && index == 3)
942          return true;
943       break;
944    case aco_opcode::flat_store_byte_d16_hi:
945    case aco_opcode::flat_store_short_d16_hi:
946    case aco_opcode::scratch_store_byte_d16_hi:
947    case aco_opcode::scratch_store_short_d16_hi:
948    case aco_opcode::global_store_byte_d16_hi:
949    case aco_opcode::global_store_short_d16_hi:
950       if (byte == 2 && index == 2)
951          return true;
952       break;
953    default: break;
954    }
955
956    return byte == 0;
957 }
958
959 bool
960 validate_subdword_definition(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr)
961 {
962    Definition def = instr->definitions[0];
963    unsigned byte = def.physReg().byte();
964
965    if (instr->isPseudo() && gfx_level >= GFX8)
966       return true;
967    if (instr->isSDWA())
968       return byte + instr->sdwa().dst_sel.offset() + instr->sdwa().dst_sel.size() <= 4 &&
969              byte % instr->sdwa().dst_sel.size() == 0;
970    if (byte == 2 && can_use_opsel(gfx_level, instr->opcode, -1))
971       return true;
972
973    switch (instr->opcode) {
974    case aco_opcode::v_fma_mixhi_f16:
975    case aco_opcode::buffer_load_ubyte_d16_hi:
976    case aco_opcode::buffer_load_sbyte_d16_hi:
977    case aco_opcode::buffer_load_short_d16_hi:
978    case aco_opcode::buffer_load_format_d16_hi_x:
979    case aco_opcode::flat_load_ubyte_d16_hi:
980    case aco_opcode::flat_load_short_d16_hi:
981    case aco_opcode::scratch_load_ubyte_d16_hi:
982    case aco_opcode::scratch_load_short_d16_hi:
983    case aco_opcode::global_load_ubyte_d16_hi:
984    case aco_opcode::global_load_short_d16_hi:
985    case aco_opcode::ds_read_u8_d16_hi:
986    case aco_opcode::ds_read_u16_d16_hi: return byte == 2;
987    default: break;
988    }
989
990    return byte == 0;
991 }
992
993 unsigned
994 get_subdword_bytes_written(Program* program, const aco_ptr<Instruction>& instr, unsigned index)
995 {
996    amd_gfx_level gfx_level = program->gfx_level;
997    Definition def = instr->definitions[index];
998
999    if (instr->isPseudo())
1000       return gfx_level >= GFX8 ? def.bytes() : def.size() * 4u;
1001    if (instr->isVALU()) {
1002       assert(def.bytes() <= 2);
1003       if (instr->isSDWA())
1004          return instr->sdwa().dst_sel.size();
1005
1006       if (instr_is_16bit(gfx_level, instr->opcode))
1007          return 2;
1008
1009       return 4;
1010    }
1011
1012    if (instr->isMIMG()) {
1013       assert(instr->mimg().d16);
1014       return program->dev.sram_ecc_enabled ? def.size() * 4u : def.bytes();
1015    }
1016
1017    switch (instr->opcode) {
1018    case aco_opcode::buffer_load_ubyte_d16:
1019    case aco_opcode::buffer_load_sbyte_d16:
1020    case aco_opcode::buffer_load_short_d16:
1021    case aco_opcode::buffer_load_format_d16_x:
1022    case aco_opcode::tbuffer_load_format_d16_x:
1023    case aco_opcode::flat_load_ubyte_d16:
1024    case aco_opcode::flat_load_short_d16:
1025    case aco_opcode::scratch_load_ubyte_d16:
1026    case aco_opcode::scratch_load_short_d16:
1027    case aco_opcode::global_load_ubyte_d16:
1028    case aco_opcode::global_load_short_d16:
1029    case aco_opcode::ds_read_u8_d16:
1030    case aco_opcode::ds_read_u16_d16:
1031    case aco_opcode::buffer_load_ubyte_d16_hi:
1032    case aco_opcode::buffer_load_sbyte_d16_hi:
1033    case aco_opcode::buffer_load_short_d16_hi:
1034    case aco_opcode::buffer_load_format_d16_hi_x:
1035    case aco_opcode::flat_load_ubyte_d16_hi:
1036    case aco_opcode::flat_load_short_d16_hi:
1037    case aco_opcode::scratch_load_ubyte_d16_hi:
1038    case aco_opcode::scratch_load_short_d16_hi:
1039    case aco_opcode::global_load_ubyte_d16_hi:
1040    case aco_opcode::global_load_short_d16_hi:
1041    case aco_opcode::ds_read_u8_d16_hi:
1042    case aco_opcode::ds_read_u16_d16_hi: return program->dev.sram_ecc_enabled ? 4 : 2;
1043    case aco_opcode::buffer_load_format_d16_xyz:
1044    case aco_opcode::tbuffer_load_format_d16_xyz: return program->dev.sram_ecc_enabled ? 8 : 6;
1045    default: return def.size() * 4;
1046    }
1047 }
1048
1049 bool
1050 validate_instr_defs(Program* program, std::array<unsigned, 2048>& regs,
1051                     const std::vector<Assignment>& assignments, const Location& loc,
1052                     aco_ptr<Instruction>& instr)
1053 {
1054    bool err = false;
1055
1056    for (unsigned i = 0; i < instr->definitions.size(); i++) {
1057       Definition& def = instr->definitions[i];
1058       if (!def.isTemp())
1059          continue;
1060       Temp tmp = def.getTemp();
1061       PhysReg reg = assignments[tmp.id()].reg;
1062       for (unsigned j = 0; j < tmp.bytes(); j++) {
1063          if (regs[reg.reg_b + j])
1064             err |=
1065                ra_fail(program, loc, assignments[regs[reg.reg_b + j]].defloc,
1066                        "Assignment of element %d of %%%d already taken by %%%d from instruction", i,
1067                        tmp.id(), regs[reg.reg_b + j]);
1068          regs[reg.reg_b + j] = tmp.id();
1069       }
1070       if (def.regClass().is_subdword() && def.bytes() < 4) {
1071          unsigned written = get_subdword_bytes_written(program, instr, i);
1072          /* If written=4, the instruction still might write the upper half. In that case, it's
1073           * the lower half that isn't preserved */
1074          for (unsigned j = reg.byte() & ~(written - 1); j < written; j++) {
1075             unsigned written_reg = reg.reg() * 4u + j;
1076             if (regs[written_reg] && regs[written_reg] != def.tempId())
1077                err |= ra_fail(program, loc, assignments[regs[written_reg]].defloc,
1078                               "Assignment of element %d of %%%d overwrites the full register "
1079                               "taken by %%%d from instruction",
1080                               i, tmp.id(), regs[written_reg]);
1081          }
1082       }
1083    }
1084
1085    for (const Definition& def : instr->definitions) {
1086       if (!def.isTemp())
1087          continue;
1088       if (def.isKill()) {
1089          for (unsigned j = 0; j < def.getTemp().bytes(); j++)
1090             regs[def.physReg().reg_b + j] = 0;
1091       }
1092    }
1093
1094    return err;
1095 }
1096
1097 } /* end namespace */
1098
1099 bool
1100 validate_ra(Program* program)
1101 {
1102    if (!(debug_flags & DEBUG_VALIDATE_RA))
1103       return false;
1104
1105    bool err = false;
1106    aco::live live_vars = aco::live_var_analysis(program);
1107    std::vector<std::vector<Temp>> phi_sgpr_ops(program->blocks.size());
1108    uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->num_waves);
1109
1110    std::vector<Assignment> assignments(program->peekAllocationId());
1111    for (Block& block : program->blocks) {
1112       Location loc;
1113       loc.block = &block;
1114       for (aco_ptr<Instruction>& instr : block.instructions) {
1115          if (instr->opcode == aco_opcode::p_phi) {
1116             for (unsigned i = 0; i < instr->operands.size(); i++) {
1117                if (instr->operands[i].isTemp() &&
1118                    instr->operands[i].getTemp().type() == RegType::sgpr &&
1119                    instr->operands[i].isFirstKill())
1120                   phi_sgpr_ops[block.logical_preds[i]].emplace_back(instr->operands[i].getTemp());
1121             }
1122          }
1123
1124          loc.instr = instr.get();
1125          for (unsigned i = 0; i < instr->operands.size(); i++) {
1126             Operand& op = instr->operands[i];
1127             if (!op.isTemp())
1128                continue;
1129             if (!op.isFixed())
1130                err |= ra_fail(program, loc, Location(), "Operand %d is not assigned a register", i);
1131             if (assignments[op.tempId()].valid && assignments[op.tempId()].reg != op.physReg())
1132                err |=
1133                   ra_fail(program, loc, assignments[op.tempId()].firstloc,
1134                           "Operand %d has an inconsistent register assignment with instruction", i);
1135             if ((op.getTemp().type() == RegType::vgpr &&
1136                  op.physReg().reg_b + op.bytes() > (256 + program->config->num_vgprs) * 4) ||
1137                 (op.getTemp().type() == RegType::sgpr &&
1138                  op.physReg() + op.size() > program->config->num_sgprs &&
1139                  op.physReg() < sgpr_limit))
1140                err |= ra_fail(program, loc, assignments[op.tempId()].firstloc,
1141                               "Operand %d has an out-of-bounds register assignment", i);
1142             if (op.physReg() == vcc && !program->needs_vcc)
1143                err |= ra_fail(program, loc, Location(),
1144                               "Operand %d fixed to vcc but needs_vcc=false", i);
1145             if (op.regClass().is_subdword() &&
1146                 !validate_subdword_operand(program->gfx_level, instr, i))
1147                err |= ra_fail(program, loc, Location(), "Operand %d not aligned correctly", i);
1148             if (!assignments[op.tempId()].firstloc.block)
1149                assignments[op.tempId()].firstloc = loc;
1150             if (!assignments[op.tempId()].defloc.block) {
1151                assignments[op.tempId()].reg = op.physReg();
1152                assignments[op.tempId()].valid = true;
1153             }
1154          }
1155
1156          for (unsigned i = 0; i < instr->definitions.size(); i++) {
1157             Definition& def = instr->definitions[i];
1158             if (!def.isTemp())
1159                continue;
1160             if (!def.isFixed())
1161                err |=
1162                   ra_fail(program, loc, Location(), "Definition %d is not assigned a register", i);
1163             if (assignments[def.tempId()].defloc.block)
1164                err |= ra_fail(program, loc, assignments[def.tempId()].defloc,
1165                               "Temporary %%%d also defined by instruction", def.tempId());
1166             if ((def.getTemp().type() == RegType::vgpr &&
1167                  def.physReg().reg_b + def.bytes() > (256 + program->config->num_vgprs) * 4) ||
1168                 (def.getTemp().type() == RegType::sgpr &&
1169                  def.physReg() + def.size() > program->config->num_sgprs &&
1170                  def.physReg() < sgpr_limit))
1171                err |= ra_fail(program, loc, assignments[def.tempId()].firstloc,
1172                               "Definition %d has an out-of-bounds register assignment", i);
1173             if (def.physReg() == vcc && !program->needs_vcc)
1174                err |= ra_fail(program, loc, Location(),
1175                               "Definition %d fixed to vcc but needs_vcc=false", i);
1176             if (def.regClass().is_subdword() &&
1177                 !validate_subdword_definition(program->gfx_level, instr))
1178                err |= ra_fail(program, loc, Location(), "Definition %d not aligned correctly", i);
1179             if (!assignments[def.tempId()].firstloc.block)
1180                assignments[def.tempId()].firstloc = loc;
1181             assignments[def.tempId()].defloc = loc;
1182             assignments[def.tempId()].reg = def.physReg();
1183             assignments[def.tempId()].valid = true;
1184          }
1185       }
1186    }
1187
1188    for (Block& block : program->blocks) {
1189       Location loc;
1190       loc.block = &block;
1191
1192       std::array<unsigned, 2048> regs; /* register file in bytes */
1193       regs.fill(0);
1194
1195       IDSet live = live_vars.live_out[block.index];
1196       /* remove killed p_phi sgpr operands */
1197       for (Temp tmp : phi_sgpr_ops[block.index])
1198          live.erase(tmp.id());
1199
1200       /* check live out */
1201       for (unsigned id : live) {
1202          Temp tmp(id, program->temp_rc[id]);
1203          PhysReg reg = assignments[id].reg;
1204          for (unsigned i = 0; i < tmp.bytes(); i++) {
1205             if (regs[reg.reg_b + i]) {
1206                err |= ra_fail(program, loc, Location(),
1207                               "Assignment of element %d of %%%d already taken by %%%d in live-out",
1208                               i, id, regs[reg.reg_b + i]);
1209             }
1210             regs[reg.reg_b + i] = id;
1211          }
1212       }
1213       regs.fill(0);
1214
1215       for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); ++it) {
1216          aco_ptr<Instruction>& instr = *it;
1217
1218          /* check killed p_phi sgpr operands */
1219          if (instr->opcode == aco_opcode::p_logical_end) {
1220             for (Temp tmp : phi_sgpr_ops[block.index]) {
1221                PhysReg reg = assignments[tmp.id()].reg;
1222                for (unsigned i = 0; i < tmp.bytes(); i++) {
1223                   if (regs[reg.reg_b + i])
1224                      err |= ra_fail(
1225                         program, loc, Location(),
1226                         "Assignment of element %d of %%%d already taken by %%%d in live-out", i,
1227                         tmp.id(), regs[reg.reg_b + i]);
1228                }
1229                live.insert(tmp.id());
1230             }
1231          }
1232
1233          for (const Definition& def : instr->definitions) {
1234             if (!def.isTemp())
1235                continue;
1236             live.erase(def.tempId());
1237          }
1238
1239          /* don't count phi operands as live-in, since they are actually
1240           * killed when they are copied at the predecessor */
1241          if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
1242             for (const Operand& op : instr->operands) {
1243                if (!op.isTemp())
1244                   continue;
1245                live.insert(op.tempId());
1246             }
1247          }
1248       }
1249
1250       for (unsigned id : live) {
1251          Temp tmp(id, program->temp_rc[id]);
1252          PhysReg reg = assignments[id].reg;
1253          for (unsigned i = 0; i < tmp.bytes(); i++)
1254             regs[reg.reg_b + i] = id;
1255       }
1256
1257       for (aco_ptr<Instruction>& instr : block.instructions) {
1258          loc.instr = instr.get();
1259
1260          /* remove killed p_phi operands from regs */
1261          if (instr->opcode == aco_opcode::p_logical_end) {
1262             for (Temp tmp : phi_sgpr_ops[block.index]) {
1263                PhysReg reg = assignments[tmp.id()].reg;
1264                for (unsigned i = 0; i < tmp.bytes(); i++)
1265                   regs[reg.reg_b + i] = 0;
1266             }
1267          }
1268
1269          if (instr->opcode != aco_opcode::p_phi && instr->opcode != aco_opcode::p_linear_phi) {
1270             for (const Operand& op : instr->operands) {
1271                if (!op.isTemp())
1272                   continue;
1273                if (op.isFirstKillBeforeDef()) {
1274                   for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1275                      regs[op.physReg().reg_b + j] = 0;
1276                }
1277             }
1278          }
1279
1280          if (!instr->isBranch() || block.linear_succs.size() != 1)
1281             err |= validate_instr_defs(program, regs, assignments, loc, instr);
1282
1283          if (!is_phi(instr)) {
1284             for (const Operand& op : instr->operands) {
1285                if (!op.isTemp())
1286                   continue;
1287                if (op.isLateKill() && op.isFirstKill()) {
1288                   for (unsigned j = 0; j < op.getTemp().bytes(); j++)
1289                      regs[op.physReg().reg_b + j] = 0;
1290                }
1291             }
1292          } else if (block.linear_preds.size() != 1 ||
1293                     program->blocks[block.linear_preds[0]].linear_succs.size() == 1) {
1294             for (unsigned pred : block.linear_preds) {
1295                aco_ptr<Instruction>& br = program->blocks[pred].instructions.back();
1296                assert(br->isBranch());
1297                err |= validate_instr_defs(program, regs, assignments, loc, br);
1298             }
1299          }
1300       }
1301    }
1302
1303    return err;
1304 }
1305 } // namespace aco