2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
26 #include "util/u_format.h"
27 #include "util/u_memory.h"
28 #include "pipe/p_shader_tokens.h"
29 #include "r600_pipe.h"
31 #include "r600_opcodes.h"
33 #include "r600_formats.h"
36 #define NUM_OF_CYCLES 3
37 #define NUM_OF_COMPONENTS 4
39 static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r600_bc_alu *alu)
44 switch (bc->chiprev) {
48 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
50 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
51 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
52 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
53 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
54 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
55 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
56 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
57 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
58 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
59 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
60 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
61 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
62 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
63 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
64 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
65 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
66 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
67 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
68 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
69 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
70 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
73 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
74 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA:
75 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR:
76 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
77 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
78 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
79 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
80 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
81 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
82 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
83 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
84 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
85 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
86 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
87 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
88 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
89 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
90 case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
93 "Need instruction operand number for 0x%x.\n", alu->inst);
96 case CHIPREV_EVERGREEN:
99 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
101 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD:
102 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD_INT:
103 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE:
104 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT:
105 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE:
106 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE:
107 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL:
108 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT:
109 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX:
110 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN:
111 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE:
112 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE:
113 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT:
114 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE:
115 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
116 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
117 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
118 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
119 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4:
120 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE:
121 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE:
122 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_XY:
123 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INTERP_ZW:
126 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
127 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
128 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
129 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
130 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
131 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
132 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED:
133 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE:
134 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED:
135 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE:
136 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
137 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
138 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
139 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
140 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
141 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
142 case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
145 "Need instruction operand number for 0x%x.\n", alu->inst);
153 int r700_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id);
155 static struct r600_bc_cf *r600_bc_cf(void)
157 struct r600_bc_cf *cf = CALLOC_STRUCT(r600_bc_cf);
161 LIST_INITHEAD(&cf->list);
162 LIST_INITHEAD(&cf->alu);
163 LIST_INITHEAD(&cf->vtx);
164 LIST_INITHEAD(&cf->tex);
168 static struct r600_bc_alu *r600_bc_alu(void)
170 struct r600_bc_alu *alu = CALLOC_STRUCT(r600_bc_alu);
174 LIST_INITHEAD(&alu->list);
178 static struct r600_bc_vtx *r600_bc_vtx(void)
180 struct r600_bc_vtx *vtx = CALLOC_STRUCT(r600_bc_vtx);
184 LIST_INITHEAD(&vtx->list);
188 static struct r600_bc_tex *r600_bc_tex(void)
190 struct r600_bc_tex *tex = CALLOC_STRUCT(r600_bc_tex);
194 LIST_INITHEAD(&tex->list);
/* r600_bc_init: prepare a bytecode container for the given GPU family
 * and derive bc->chiprev from it.
 * NOTE(review): this listing is truncated -- the opening brace, the
 * assignment of bc->family, the per-family case labels, the breaks and
 * the return statements are missing from this excerpt. Only the four
 * chiprev assignments and the error path survive below. */
198 int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
200 	LIST_INITHEAD(&bc->cf);
202 	switch (bc->family) {
/* R600-generation families map to CHIPREV_R600. */
211 		bc->chiprev = CHIPREV_R600;
/* R700-generation families map to CHIPREV_R700. */
217 		bc->chiprev = CHIPREV_R700;
/* Evergreen-generation families map to CHIPREV_EVERGREEN. */
230 		bc->chiprev = CHIPREV_EVERGREEN;
/* Cayman maps to CHIPREV_CAYMAN. */
233 		bc->chiprev = CHIPREV_CAYMAN;
/* Unrecognised family: report the error. */
236 		R600_ERR("unknown family %d\n", bc->family);
242 static int r600_bc_add_cf(struct r600_bc *bc)
244 struct r600_bc_cf *cf = r600_bc_cf();
248 LIST_ADDTAIL(&cf->list, &bc->cf);
250 cf->id = bc->cf_last->id + 2;
254 bc->force_add_cf = 0;
/* r600_bc_add_output: add an export to the bytecode stream, merging it
 * into the previous export CF as a burst when the two exports are
 * compatible (same instruction/type/swizzles, contiguous gpr and
 * array_base ranges, combined burst_count <= 16). Otherwise a new CF
 * node is started for it.
 * NOTE(review): this listing is truncated -- the opening brace, local
 * declarations, several returns and closing braces are missing. */
258 int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output)
/* Merge only when instructions match (EXPORT may be upgraded to
 * EXPORT_DONE) and all output parameters are burst-compatible. */
262 if (bc->cf_last && (bc->cf_last->inst == output->inst ||
263 (bc->cf_last->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT) &&
264 output->inst == BC_INST(bc, V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE))) &&
265 output->type == bc->cf_last->output.type &&
266 output->elem_size == bc->cf_last->output.elem_size &&
267 output->swizzle_x == bc->cf_last->output.swizzle_x &&
268 output->swizzle_y == bc->cf_last->output.swizzle_y &&
269 output->swizzle_z == bc->cf_last->output.swizzle_z &&
270 output->swizzle_w == bc->cf_last->output.swizzle_w &&
271 (output->burst_count + bc->cf_last->output.burst_count) <= 16) {
/* New export immediately precedes the previous burst: extend downward. */
273 if ((output->gpr + output->burst_count) == bc->cf_last->output.gpr &&
274 (output->array_base + output->burst_count) == bc->cf_last->output.array_base) {
276 bc->cf_last->output.end_of_program |= output->end_of_program;
277 bc->cf_last->output.inst = output->inst;
278 bc->cf_last->output.gpr = output->gpr;
279 bc->cf_last->output.array_base = output->array_base;
280 bc->cf_last->output.burst_count += output->burst_count;
/* New export immediately follows the previous burst: extend upward. */
283 } else if (output->gpr == (bc->cf_last->output.gpr + bc->cf_last->output.burst_count) &&
284 output->array_base == (bc->cf_last->output.array_base + bc->cf_last->output.burst_count)) {
286 bc->cf_last->output.end_of_program |= output->end_of_program;
287 bc->cf_last->output.inst = output->inst;
288 bc->cf_last->output.burst_count += output->burst_count;
/* No merge possible: start a new CF node and copy the output into it. */
293 r = r600_bc_add_cf(bc);
296 bc->cf_last->inst = output->inst;
297 memcpy(&bc->cf_last->output, output, sizeof(struct r600_bc_output));
301 /* alu instructions that can only exist once per group */
302 static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
304 switch (bc->chiprev) {
307 return !alu->is_op3 && (
308 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
309 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
310 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
311 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
312 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
313 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
314 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
315 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
316 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
317 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
318 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
319 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
320 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
321 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
322 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
323 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
324 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
325 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
326 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
327 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
328 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
329 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
330 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
331 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
332 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
333 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
334 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
335 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
336 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
337 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
338 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
339 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
340 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
341 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
342 case CHIPREV_EVERGREEN:
345 return !alu->is_op3 && (
346 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
347 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT ||
348 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE ||
349 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE ||
350 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_UINT ||
351 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_UINT ||
352 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE_INT ||
353 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT_INT ||
354 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGE_INT ||
355 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLNE_INT ||
356 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_UINT ||
357 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_UINT ||
358 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE ||
359 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT ||
360 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE ||
361 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE ||
362 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_INV ||
363 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_POP ||
364 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_CLR ||
365 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SET_RESTORE ||
366 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH ||
367 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH ||
368 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH ||
369 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH ||
370 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT ||
371 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_INT ||
372 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_INT ||
373 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_INT ||
374 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_PUSH_INT ||
375 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT_PUSH_INT ||
376 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE_PUSH_INT ||
377 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE_PUSH_INT ||
378 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
379 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
383 static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
385 switch (bc->chiprev) {
388 return !alu->is_op3 && (
389 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
390 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
391 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
392 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
393 case CHIPREV_EVERGREEN:
396 return !alu->is_op3 && (
397 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
398 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4 ||
399 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
400 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
404 static int is_alu_cube_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
406 switch (bc->chiprev) {
409 return !alu->is_op3 &&
410 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
411 case CHIPREV_EVERGREEN:
414 return !alu->is_op3 &&
415 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
419 static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
421 switch (bc->chiprev) {
424 return !alu->is_op3 && (
425 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA ||
426 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
427 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
428 case CHIPREV_EVERGREEN:
431 return !alu->is_op3 && (
432 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
436 /* alu instructions that can only execute on the vector unit */
437 static int is_alu_vec_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
439 return is_alu_reduction_inst(bc, alu) ||
440 is_alu_mova_inst(bc, alu) ||
441 (bc->chiprev == CHIPREV_EVERGREEN &&
442 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR);
445 /* alu instructions that can only execute on the trans unit */
446 static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
448 switch (bc->chiprev) {
452 return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
453 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT ||
454 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
455 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
456 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
457 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
458 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
459 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
460 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
461 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
462 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
463 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
464 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
465 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
466 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
467 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
468 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
469 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
470 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
471 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
472 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
473 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
474 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
475 alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
477 return alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT ||
478 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_D2 ||
479 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
480 alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
481 case CHIPREV_EVERGREEN:
485 /* Note that FLT_TO_INT_* instructions are vector-only instructions
486 * on Evergreen, despite what the documentation says. FLT_TO_INT
487 * can do both vector and scalar. */
488 return alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT ||
489 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT ||
490 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHL_INT ||
491 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LSHR_INT ||
492 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_INT ||
493 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT ||
494 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_INT ||
495 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULLO_UINT ||
496 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_INT ||
497 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_UINT ||
498 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_UINT_TO_FLT ||
499 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS ||
500 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE ||
501 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED ||
502 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE ||
503 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_CLAMPED ||
504 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_FF ||
505 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE ||
506 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED ||
507 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_FF ||
508 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE ||
509 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN ||
510 alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SQRT_IEEE;
512 return alu->inst == EG_V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT;
/* alu instructions that can execute on any unit */
static int is_alu_any_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
{
	return !is_alu_vec_unit_inst(bc, alu) &&
		!is_alu_trans_unit_inst(bc, alu);
}
523 static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
524 struct r600_bc_alu *assignment[5])
526 struct r600_bc_alu *alu;
527 unsigned i, chan, trans;
528 int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
530 for (i = 0; i < max_slots; i++)
531 assignment[i] = NULL;
533 for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
534 chan = alu->dst.chan;
537 else if (is_alu_trans_unit_inst(bc, alu))
539 else if (is_alu_vec_unit_inst(bc, alu))
541 else if (assignment[chan])
542 trans = 1; /* Assume ALU_INST_PREFER_VECTOR. */
548 assert(0); /* ALU.Trans has already been allocated. */
553 if (assignment[chan]) {
554 assert(0); /* ALU.chan has already been allocated. */
557 assignment[chan] = alu;
566 struct alu_bank_swizzle {
567 int hw_gpr[NUM_OF_CYCLES][NUM_OF_COMPONENTS];
568 int hw_cfile_addr[4];
569 int hw_cfile_elem[4];
572 static const unsigned cycle_for_bank_swizzle_vec[][3] = {
573 [SQ_ALU_VEC_012] = { 0, 1, 2 },
574 [SQ_ALU_VEC_021] = { 0, 2, 1 },
575 [SQ_ALU_VEC_120] = { 1, 2, 0 },
576 [SQ_ALU_VEC_102] = { 1, 0, 2 },
577 [SQ_ALU_VEC_201] = { 2, 0, 1 },
578 [SQ_ALU_VEC_210] = { 2, 1, 0 }
581 static const unsigned cycle_for_bank_swizzle_scl[][3] = {
582 [SQ_ALU_SCL_210] = { 2, 1, 0 },
583 [SQ_ALU_SCL_122] = { 1, 2, 2 },
584 [SQ_ALU_SCL_212] = { 2, 1, 2 },
585 [SQ_ALU_SCL_221] = { 2, 2, 1 }
588 static void init_bank_swizzle(struct alu_bank_swizzle *bs)
590 int i, cycle, component;
592 for (cycle = 0; cycle < NUM_OF_CYCLES; cycle++)
593 for (component = 0; component < NUM_OF_COMPONENTS; component++)
594 bs->hw_gpr[cycle][component] = -1;
595 for (i = 0; i < 4; i++)
596 bs->hw_cfile_addr[i] = -1;
597 for (i = 0; i < 4; i++)
598 bs->hw_cfile_elem[i] = -1;
601 static int reserve_gpr(struct alu_bank_swizzle *bs, unsigned sel, unsigned chan, unsigned cycle)
603 if (bs->hw_gpr[cycle][chan] == -1)
604 bs->hw_gpr[cycle][chan] = sel;
605 else if (bs->hw_gpr[cycle][chan] != (int)sel) {
606 /* Another scalar operation has already used the GPR read port for the channel. */
612 static int reserve_cfile(struct r600_bc *bc, struct alu_bank_swizzle *bs, unsigned sel, unsigned chan)
614 int res, num_res = 4;
615 if (bc->chiprev >= CHIPREV_R700) {
619 for (res = 0; res < num_res; ++res) {
620 if (bs->hw_cfile_addr[res] == -1) {
621 bs->hw_cfile_addr[res] = sel;
622 bs->hw_cfile_elem[res] = chan;
624 } else if (bs->hw_cfile_addr[res] == sel &&
625 bs->hw_cfile_elem[res] == chan)
626 return 0; /* Read for this scalar element already reserved, nothing to do here. */
628 /* All cfile read ports are used, cannot reference vector element. */
/* True when sel addresses one of the 128 GPRs (0..127).
 * The original "sel >= 0" test was removed: sel is unsigned, so the
 * comparison was always true (triggers -Wtype-limits). */
static int is_gpr(unsigned sel)
{
	return (sel <= 127);
}
/* CB constants start at 512, and get translated to a kcache index when ALU
 * clauses are constructed. Note that we handle kcache constants the same way
 * as (the now gone) cfile constants, is that really required? */
static int is_cfile(unsigned sel)
{
	return (sel > 255 && sel < 512) ||
		(sel > 511 && sel < 4607) || /* Kcache before translation. */
		(sel > 127 && sel < 192); /* Kcache after translation. */
}
647 static int is_const(int sel)
649 return is_cfile(sel) ||
650 (sel >= V_SQ_ALU_SRC_0 &&
651 sel <= V_SQ_ALU_SRC_LITERAL);
654 static int check_vector(struct r600_bc *bc, struct r600_bc_alu *alu,
655 struct alu_bank_swizzle *bs, int bank_swizzle)
657 int r, src, num_src, sel, elem, cycle;
659 num_src = r600_bc_get_num_operands(bc, alu);
660 for (src = 0; src < num_src; src++) {
661 sel = alu->src[src].sel;
662 elem = alu->src[src].chan;
664 cycle = cycle_for_bank_swizzle_vec[bank_swizzle][src];
665 if (src == 1 && sel == alu->src[0].sel && elem == alu->src[0].chan)
666 /* Nothing to do; special-case optimization,
667 * second source uses first source’s reservation. */
670 r = reserve_gpr(bs, sel, elem, cycle);
674 } else if (is_cfile(sel)) {
675 r = reserve_cfile(bc, bs, sel, elem);
679 /* No restrictions on PV, PS, literal or special constants. */
684 static int check_scalar(struct r600_bc *bc, struct r600_bc_alu *alu,
685 struct alu_bank_swizzle *bs, int bank_swizzle)
687 int r, src, num_src, const_count, sel, elem, cycle;
689 num_src = r600_bc_get_num_operands(bc, alu);
690 for (const_count = 0, src = 0; src < num_src; ++src) {
691 sel = alu->src[src].sel;
692 elem = alu->src[src].chan;
693 if (is_const(sel)) { /* Any constant, including literal and inline constants. */
694 if (const_count >= 2)
695 /* More than two references to a constant in
696 * transcendental operation. */
702 r = reserve_cfile(bc, bs, sel, elem);
707 for (src = 0; src < num_src; ++src) {
708 sel = alu->src[src].sel;
709 elem = alu->src[src].chan;
711 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
712 if (cycle < const_count)
713 /* Cycle for GPR load conflicts with
714 * constant load in transcendental operation. */
716 r = reserve_gpr(bs, sel, elem, cycle);
720 /* PV PS restrictions */
721 if (const_count && (sel == 254 || sel == 255)) {
722 cycle = cycle_for_bank_swizzle_scl[bank_swizzle][src];
723 if (cycle < const_count)
/* check_and_set_bank_swizzle: find a bank-swizzle assignment for the
 * (up to) five slots of an instruction group by brute-force search,
 * honouring any bank_swizzle_force settings. On success the chosen
 * swizzles are written back into the slots; a nonzero result means no
 * working combination exists.
 * NOTE(review): this listing is truncated -- declarations (the
 * bank_swizzle[5] array), else branches, early returns, increments and
 * closing braces are missing; code kept byte-identical. */
730 static int check_and_set_bank_swizzle(struct r600_bc *bc,
731 struct r600_bc_alu *slots[5])
733 struct alu_bank_swizzle bs;
735 int i, r = 0, forced = 1;
/* Cayman groups are scalar-checked differently and have only 4 slots. */
736 boolean scalar_only = bc->chiprev == CHIPREV_CAYMAN ? false : true;
737 int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
/* If every occupied slot has a forced swizzle there is nothing to search. */
739 for (i = 0; i < max_slots; i++) {
741 if (slots[i]->bank_swizzle_force) {
742 slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
748 if (i < 4 && slots[i])
754 /* Just check every possible combination of bank swizzle.
755 * Not very efficient, but works on the first try in most of the cases. */
756 for (i = 0; i < 4; i++)
757 if (!slots[i] || !slots[i]->bank_swizzle_force)
758 bank_swizzle[i] = SQ_ALU_VEC_012;
760 bank_swizzle[i] = slots[i]->bank_swizzle;
/* Iterate the scalar swizzle in the outer position of the search. */
762 bank_swizzle[4] = SQ_ALU_SCL_210;
763 while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
765 if (max_slots == 4) {
766 for (i = 0; i < max_slots; i++) {
767 if (bank_swizzle[i] == SQ_ALU_VEC_210)
771 init_bank_swizzle(&bs);
772 if (scalar_only == false) {
773 for (i = 0; i < 4; i++) {
775 r = check_vector(bc, slots[i], &bs, bank_swizzle[i]);
783 if (!r && slots[4] && max_slots == 5) {
784 r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
/* Found a working combination: commit it to the slots. */
787 for (i = 0; i < max_slots; i++) {
789 slots[i]->bank_swizzle = bank_swizzle[i];
/* Advance to the next candidate combination (odometer-style). */
797 for (i = 0; i < max_slots; i++) {
798 if (!slots[i] || !slots[i]->bank_swizzle_force) {
800 if (bank_swizzle[i] <= SQ_ALU_VEC_210)
803 bank_swizzle[i] = SQ_ALU_VEC_012;
809 /* Couldn't find a working swizzle. */
813 static int replace_gpr_with_pv_ps(struct r600_bc *bc,
814 struct r600_bc_alu *slots[5], struct r600_bc_alu *alu_prev)
816 struct r600_bc_alu *prev[5];
818 int i, j, r, src, num_src;
819 int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
821 r = assign_alu_units(bc, alu_prev, prev);
825 for (i = 0; i < max_slots; ++i) {
826 if (prev[i] && (prev[i]->dst.write || prev[i]->is_op3) && !prev[i]->dst.rel) {
827 gpr[i] = prev[i]->dst.sel;
828 /* cube writes more than PV.X */
829 if (!is_alu_cube_inst(bc, prev[i]) && is_alu_reduction_inst(bc, prev[i]))
832 chan[i] = prev[i]->dst.chan;
837 for (i = 0; i < max_slots; ++i) {
838 struct r600_bc_alu *alu = slots[i];
842 num_src = r600_bc_get_num_operands(bc, alu);
843 for (src = 0; src < num_src; ++src) {
844 if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
847 if (bc->chiprev < CHIPREV_CAYMAN) {
848 if (alu->src[src].sel == gpr[4] &&
849 alu->src[src].chan == chan[4]) {
850 alu->src[src].sel = V_SQ_ALU_SRC_PS;
851 alu->src[src].chan = 0;
856 for (j = 0; j < 4; ++j) {
857 if (alu->src[src].sel == gpr[j] &&
858 alu->src[src].chan == j) {
859 alu->src[src].sel = V_SQ_ALU_SRC_PV;
860 alu->src[src].chan = chan[j];
870 void r600_bc_special_constants(u32 value, unsigned *sel, unsigned *neg)
874 *sel = V_SQ_ALU_SRC_0;
877 *sel = V_SQ_ALU_SRC_1_INT;
880 *sel = V_SQ_ALU_SRC_M_1_INT;
882 case 0x3F800000: /* 1.0f */
883 *sel = V_SQ_ALU_SRC_1;
885 case 0x3F000000: /* 0.5f */
886 *sel = V_SQ_ALU_SRC_0_5;
888 case 0xBF800000: /* -1.0f */
889 *sel = V_SQ_ALU_SRC_1;
892 case 0xBF000000: /* -0.5f */
893 *sel = V_SQ_ALU_SRC_0_5;
897 *sel = V_SQ_ALU_SRC_LITERAL;
902 /* compute how many literal are needed */
903 static int r600_bc_alu_nliterals(struct r600_bc *bc, struct r600_bc_alu *alu,
904 uint32_t literal[4], unsigned *nliteral)
906 unsigned num_src = r600_bc_get_num_operands(bc, alu);
909 for (i = 0; i < num_src; ++i) {
910 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
911 uint32_t value = alu->src[i].value;
913 for (j = 0; j < *nliteral; ++j) {
914 if (literal[j] == value) {
922 literal[(*nliteral)++] = value;
929 static void r600_bc_alu_adjust_literals(struct r600_bc *bc,
930 struct r600_bc_alu *alu,
931 uint32_t literal[4], unsigned nliteral)
933 unsigned num_src = r600_bc_get_num_operands(bc, alu);
936 for (i = 0; i < num_src; ++i) {
937 if (alu->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
938 uint32_t value = alu->src[i].value;
939 for (j = 0; j < nliteral; ++j) {
940 if (literal[j] == value) {
941 alu->src[i].chan = j;
/* merge_inst_groups: try to merge the current instruction group
 * (slots) into the previous one (starting at alu_prev) when their slot
 * usage, literal counts, once-per-group instruction counts and gpr
 * read-after-write dependencies allow it. On success the merged group
 * is re-linked into cf_last->alu and the bs_head pointers advance.
 * NOTE(review): this listing is truncated -- error returns, several
 * else branches, continues/breaks and closing braces are missing; the
 * surviving lines are kept byte-identical. */
949 static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
950 struct r600_bc_alu *alu_prev)
952 struct r600_bc_alu *prev[5];
953 struct r600_bc_alu *result[5] = { NULL };
955 uint32_t literal[4], prev_literal[4];
956 unsigned nliteral = 0, prev_nliteral = 0;
958 int i, j, r, src, num_src;
959 int num_once_inst = 0;
960 int have_mova = 0, have_rel = 0;
961 int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
963 r = assign_alu_units(bc, alu_prev, prev);
967 for (i = 0; i < max_slots; ++i) {
968 struct r600_bc_alu *alu;
955 	/* (see above: literal arrays collect the merged group's literals) */
970 /* check number of literals */
972 if (r600_bc_alu_nliterals(bc, prev[i], literal, &nliteral))
974 if (r600_bc_alu_nliterals(bc, prev[i], prev_literal, &prev_nliteral))
976 if (is_alu_mova_inst(bc, prev[i])) {
981 num_once_inst += is_alu_once_inst(bc, prev[i]);
983 if (slots[i] && r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral))
986 /* Let's check used slots. */
987 if (prev[i] && !slots[i]) {
990 } else if (prev[i] && slots[i]) {
991 if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
992 /* Trans unit is still free try to use it. */
993 if (is_alu_any_unit_inst(bc, slots[i])) {
995 result[4] = slots[i];
996 } else if (is_alu_any_unit_inst(bc, prev[i])) {
997 result[i] = slots[i];
1003 } else if(!slots[i]) {
1006 result[i] = slots[i];
1009 num_once_inst += is_alu_once_inst(bc, alu);
1011 /* Let's check dst gpr. */
1018 /* Let's check source gprs */
1019 num_src = r600_bc_get_num_operands(bc, alu);
1020 for (src = 0; src < num_src; ++src) {
1021 if (alu->src[src].rel) {
1027 /* Constants don't matter. */
1028 if (!is_gpr(alu->src[src].sel))
1031 for (j = 0; j < max_slots; ++j) {
1032 if (!prev[j] || !prev[j]->dst.write)
1035 /* If it's relative then we can't determine which gpr is really used. */
1036 if (prev[j]->dst.chan == alu->src[src].chan &&
1037 (prev[j]->dst.sel == alu->src[src].sel ||
1038 prev[j]->dst.rel || alu->src[src].rel))
1044 /* more than one PRED_ or KILL_ ? */
1045 if (num_once_inst > 1)
1048 /* check if the result can still be swizzled */
1049 r = check_and_set_bank_swizzle(bc, result);
1053 /* looks like everything worked out right, apply the changes */
1055 /* undo adding previous literals */
1056 bc->cf_last->ndw -= align(prev_nliteral, 2);
1058 /* sort instructions */
1059 for (i = 0; i < max_slots; ++i) {
1060 slots[i] = result[i];
1062 LIST_DEL(&result[i]->list);
1063 result[i]->last = 0;
1064 LIST_ADDTAIL(&result[i]->list, &bc->cf_last->alu);
1068 /* determine new last instruction */
1069 LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;
1071 /* determine new first instruction */
1072 for (i = 0; i < max_slots; ++i) {
1074 bc->cf_last->curr_bs_head = result[i];
1079 bc->cf_last->prev_bs_head = bc->cf_last->prev2_bs_head;
1080 bc->cf_last->prev2_bs_head = NULL;
1085 /* This code handles kcache lines as single blocks of 32 constants. We could
1086 * probably do slightly better by recognizing that we actually have two
1087 * consecutive lines of 16 constants, but the resulting code would also be
1088 * somewhat more complicated. */
/* r600_bc_alloc_kcache_lines: lock the kcache lines needed by an ALU
 * instruction's constant-buffer sources into the current CF's two
 * kcache slots, starting a new ALU clause when both are occupied by
 * other lines, then rewrite the sources to kcache-relative selects.
 * NOTE(review): this listing is truncated -- local declarations
 * (i, j, r, line), break/continue statements, free_lines accounting
 * and closing braces are missing; code kept byte-identical. */
1089 static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
1091 struct r600_bc_kcache *kcache = bc->cf_last->kcache;
1092 unsigned int required_lines;
1093 unsigned int free_lines = 0;
1094 unsigned int cache_line[3];
1095 unsigned int count = 0;
1099 /* Collect required cache lines. */
1100 for (i = 0; i < 3; ++i) {
1101 boolean found = false;
/* Sources below 512 are not CB constants and need no kcache line. */
1104 if (alu->src[i].sel < 512)
1107 line = ((alu->src[i].sel - 512) / 32) * 2;
1109 for (j = 0; j < count; ++j) {
1110 if (cache_line[j] == line) {
1117 cache_line[count++] = line;
1120 /* This should never actually happen. */
1121 if (count >= 3) return -ENOMEM;
/* Count the CF's free kcache slots. */
1123 for (i = 0; i < 2; ++i) {
1124 if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
1129 /* Filter lines pulled in by previous instructions. Note that this is
1130 * only for the required_lines count, we can't remove these from the
1131 * cache_line array since we may have to start a new ALU clause. */
1132 for (i = 0, required_lines = count; i < count; ++i) {
1133 for (j = 0; j < 2; ++j) {
1134 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1135 kcache[j].addr == cache_line[i]) {
1142 /* Start a new ALU clause if needed. */
1143 if (required_lines > free_lines) {
1144 if ((r = r600_bc_add_cf(bc))) {
1147 bc->cf_last->inst = (type << 3);
1148 kcache = bc->cf_last->kcache;
1151 /* Setup the kcache lines. */
1152 for (i = 0; i < count; ++i) {
1153 boolean found = false;
/* Skip lines that are already locked in a slot. */
1155 for (j = 0; j < 2; ++j) {
1156 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1157 kcache[j].addr == cache_line[i]) {
1163 if (found) continue;
/* Otherwise lock the line into the first free slot. */
1165 for (j = 0; j < 2; ++j) {
1166 if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
1168 kcache[j].addr = cache_line[i];
1169 kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
1175 /* Alter the src operands to refer to the kcache. */
1176 for (i = 0; i < 3; ++i) {
1177 static const unsigned int base[] = {128, 160, 256, 288};
1180 if (alu->src[i].sel < 512)
1183 alu->src[i].sel -= 512;
1184 line = (alu->src[i].sel / 32) * 2;
1186 for (j = 0; j < 2; ++j) {
1187 if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
1188 kcache[j].addr == line) {
1189 alu->src[i].sel &= 0x1f;
1190 alu->src[i].sel += base[j];
/* Append a copy of 'alu' to the bytecode, opening a new ALU CF node of the
 * given 'type' when required, and run the per-group bank-swizzle
 * bookkeeping once an instruction group completes (alu->last set).
 * NOTE(review): a number of original lines (error checks, closing braces,
 * declarations of r/i/nliteral) are missing from this chunk; comments
 * describe only the visible logic. */
1199 int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
1201 struct r600_bc_alu *nalu = r600_bc_alu();
1202 struct r600_bc_alu *lalu;
1207 memcpy(nalu, alu, sizeof(struct r600_bc_alu));
/* A CF node of a different ALU type normally forces a new node ... */
1209 if (bc->cf_last != NULL && bc->cf_last->inst != (type << 3)) {
1210 /* check if we could add it anyway */
1211 if (bc->cf_last->inst == (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) &&
1212 type == V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE) {
/* ... except ALU -> ALU_PUSH_BEFORE, which may share the node as long
 * as no already-emitted instruction in it writes the predicate. */
1213 LIST_FOR_EACH_ENTRY(lalu, &bc->cf_last->alu, list) {
1214 if (lalu->predicate) {
1215 bc->force_add_cf = 1;
1220 bc->force_add_cf = 1;
1223 /* cf can contain only alu or only vtx or only tex */
1224 if (bc->cf_last == NULL || bc->force_add_cf) {
1225 r = r600_bc_add_cf(bc);
1231 bc->cf_last->inst = (type << 3);
1233 /* Setup the kcache for this ALU instruction. This will start a new
1234 * ALU clause if needed. */
1235 if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
/* First instruction of a new bank-swizzle group. */
1240 if (!bc->cf_last->curr_bs_head) {
1241 bc->cf_last->curr_bs_head = nalu;
1243 /* number of gpr == the last gpr used in any alu */
1244 for (i = 0; i < 3; i++) {
/* sels < 128 are GPRs; larger values are constants/special selectors. */
1245 if (nalu->src[i].sel >= bc->ngpr && nalu->src[i].sel < 128) {
1246 bc->ngpr = nalu->src[i].sel + 1;
/* Fold literal values that match hardware inline constants. */
1248 if (nalu->src[i].sel == V_SQ_ALU_SRC_LITERAL)
1249 r600_bc_special_constants(nalu->src[i].value,
1250 &nalu->src[i].sel, &nalu->src[i].neg);
1252 if (nalu->dst.sel >= bc->ngpr) {
1253 bc->ngpr = nalu->dst.sel + 1;
1255 LIST_ADDTAIL(&nalu->list, &bc->cf_last->alu);
1256 /* each alu uses 2 dwords */
1257 bc->cf_last->ndw += 2;
1260 /* process cur ALU instructions for bank swizzle */
1262 uint32_t literal[4];
1264 struct r600_bc_alu *slots[5];
/* Cayman has no trans unit, so a group holds at most 4 slots. */
1265 int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
1266 r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
1270 if (bc->cf_last->prev_bs_head) {
1271 r = merge_inst_groups(bc, slots, bc->cf_last->prev_bs_head);
1276 if (bc->cf_last->prev_bs_head) {
1277 r = replace_gpr_with_pv_ps(bc, slots, bc->cf_last->prev_bs_head);
1282 r = check_and_set_bank_swizzle(bc, slots);
/* Account for the literal dwords trailing this group (padded to 2). */
1286 for (i = 0, nliteral = 0; i < max_slots; i++) {
1288 r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
1293 bc->cf_last->ndw += align(nliteral, 2);
1295 /* at most 128 slots, one add alu can add 5 slots + 4 constants(2 slots)
1297 if ((bc->cf_last->ndw >> 1) >= 120) {
1298 bc->force_add_cf = 1;
/* Shift the group history: the current group becomes the previous one. */
1301 bc->cf_last->prev2_bs_head = bc->cf_last->prev_bs_head;
1302 bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head;
1303 bc->cf_last->curr_bs_head = NULL;
/* Append an ALU instruction using the plain (non-push/pop) ALU CF type
 * for the current chip family. */
1308 int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu)
1310 return r600_bc_add_alu_type(bc, alu, BC_INST(bc, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU));
/* Maximum number of fetch (TEX/VTX) instructions a single clause may hold
 * for this chip revision; the per-revision return values sit on case-body
 * lines missing from this chunk. Logs an error for unknown revisions. */
1313 static unsigned r600_bc_num_tex_and_vtx_instructions(const struct r600_bc *bc)
1315 switch (bc->chiprev) {
1322 case CHIPREV_EVERGREEN:
1323 case CHIPREV_CAYMAN:
1327 R600_ERR("Unknown chiprev %d.\n", bc->chiprev);
/* Classify the last CF node relative to vertex-fetch clauses (TC on
 * Cayman, VTX/VTX_TC elsewhere). Assumes bc->cf_last is non-NULL.
 * NOTE(review): the return statements are missing from this chunk, and
 * the visible comparisons test for the instruction *not* matching a
 * fetch clause while the name suggests the opposite -- confirm the
 * returns (or whether the helper is effectively "was NOT a vtx fetch")
 * against the full file. */
1332 static inline boolean last_inst_was_vtx_fetch(struct r600_bc *bc)
1334 if (bc->chiprev == CHIPREV_CAYMAN) {
1335 if (bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC)
1338 if (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
1339 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC)
/* Append a copy of 'vtx' to the current vertex-fetch clause, opening a new
 * CF node when there is none, when the clause-type check fires, or (on a
 * line missing from this chunk) presumably when force_add_cf is set. */
1345 int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
1347 struct r600_bc_vtx *nvtx = r600_bc_vtx();
1352 memcpy(nvtx, vtx, sizeof(struct r600_bc_vtx));
1354 /* cf can contain only alu or only vtx or only tex */
1355 if (bc->cf_last == NULL ||
1356 last_inst_was_vtx_fetch(bc) ||
1358 r = r600_bc_add_cf(bc);
/* Cayman fetches vertices through the TC clause type. */
1363 if (bc->chiprev == CHIPREV_CAYMAN)
1364 bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
1366 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
1368 LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
1369 /* each fetch uses 4 dwords */
1370 bc->cf_last->ndw += 4;
/* Force a new clause once this one reaches the per-chip fetch limit. */
1372 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1373 bc->force_add_cf = 1;
/* Append a copy of 'tex' to the current texture clause. A new clause is
 * forced when an earlier instruction in this clause writes the GPR this
 * one reads as its address, for SET_GRADIENTS_H (keeps gradient setup in
 * the same CF as its sample), on clause-type mismatch, or when full. */
1377 int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex)
1379 struct r600_bc_tex *ntex = r600_bc_tex();
1384 memcpy(ntex, tex, sizeof(struct r600_bc_tex));
1386 /* we can't fetch data and use it as texture lookup address in the same TEX clause */
1387 if (bc->cf_last != NULL &&
1388 bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
1389 struct r600_bc_tex *ttex;
1390 LIST_FOR_EACH_ENTRY(ttex, &bc->cf_last->tex, list) {
1391 if (ttex->dst_gpr == ntex->src_gpr) {
1392 bc->force_add_cf = 1;
1396 /* slight hack to make gradients always go into same cf */
1397 if (ntex->inst == SQ_TEX_INST_SET_GRADIENTS_H)
1398 bc->force_add_cf = 1;
1401 /* cf can contain only alu or only vtx or only tex */
1402 if (bc->cf_last == NULL ||
1403 bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_TEX ||
1405 r = r600_bc_add_cf(bc);
1410 bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_TEX;
/* Track the highest GPR touched so bc->ngpr covers it. */
1412 if (ntex->src_gpr >= bc->ngpr) {
1413 bc->ngpr = ntex->src_gpr + 1;
1415 if (ntex->dst_gpr >= bc->ngpr) {
1416 bc->ngpr = ntex->dst_gpr + 1;
1418 LIST_ADDTAIL(&ntex->list, &bc->cf_last->tex);
1419 /* each texture fetch uses 4 dwords */
1420 bc->cf_last->ndw += 4;
/* Force a new clause once this one reaches the per-chip fetch limit. */
1422 if ((bc->cf_last->ndw / 4) >= r600_bc_num_tex_and_vtx_instructions(bc))
1423 bc->force_add_cf = 1;
/* Append a standalone CF instruction (jump/loop/call/etc.) on a fresh CF
 * node, defaulting its condition to ACTIVE. */
1427 int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
1430 r = r600_bc_add_cf(bc);
1434 bc->cf_last->cond = V_SQ_CF_COND_ACTIVE;
1435 bc->cf_last->inst = inst;
/* Cayman terminates a program with an explicit CF_END instruction. */
1439 int cm_bc_add_cf_end(struct r600_bc *bc)
1441 return r600_bc_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
1444 /* common to all 3 families */
/* Encode one vertex-fetch instruction (VTX_WORD0..2 plus a zero pad
 * dword) into bc->bytecode starting at 'id'. Cayman has no mega-fetch
 * fields, hence the chiprev guards. */
1445 static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
1447 bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
1448 S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
1449 S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
1450 S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
1451 if (bc->chiprev < CHIPREV_CAYMAN)
1452 bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
1454 bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
1455 S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
1456 S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
1457 S_SQ_VTX_WORD1_DST_SEL_W(vtx->dst_sel_w) |
1458 S_SQ_VTX_WORD1_USE_CONST_FIELDS(vtx->use_const_fields) |
1459 S_SQ_VTX_WORD1_DATA_FORMAT(vtx->data_format) |
1460 S_SQ_VTX_WORD1_NUM_FORMAT_ALL(vtx->num_format_all) |
1461 S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
1462 S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
1463 S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
1464 bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
1465 S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
1466 if (bc->chiprev < CHIPREV_CAYMAN)
1467 bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
/* Fetch instructions are 4 dwords; the last one is always zero. */
1469 bc->bytecode[id++] = 0;
1473 /* common to all 3 families */
/* Encode one texture-fetch instruction (TEX_WORD0..2 plus a zero pad
 * dword) into bc->bytecode starting at 'id'. */
1474 static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsigned id)
1476 bc->bytecode[id++] = S_SQ_TEX_WORD0_TEX_INST(tex->inst) |
1477 S_SQ_TEX_WORD0_RESOURCE_ID(tex->resource_id) |
1478 S_SQ_TEX_WORD0_SRC_GPR(tex->src_gpr) |
1479 S_SQ_TEX_WORD0_SRC_REL(tex->src_rel);
1480 bc->bytecode[id++] = S_SQ_TEX_WORD1_DST_GPR(tex->dst_gpr) |
1481 S_SQ_TEX_WORD1_DST_REL(tex->dst_rel) |
1482 S_SQ_TEX_WORD1_DST_SEL_X(tex->dst_sel_x) |
1483 S_SQ_TEX_WORD1_DST_SEL_Y(tex->dst_sel_y) |
1484 S_SQ_TEX_WORD1_DST_SEL_Z(tex->dst_sel_z) |
1485 S_SQ_TEX_WORD1_DST_SEL_W(tex->dst_sel_w) |
1486 S_SQ_TEX_WORD1_LOD_BIAS(tex->lod_bias) |
1487 S_SQ_TEX_WORD1_COORD_TYPE_X(tex->coord_type_x) |
1488 S_SQ_TEX_WORD1_COORD_TYPE_Y(tex->coord_type_y) |
1489 S_SQ_TEX_WORD1_COORD_TYPE_Z(tex->coord_type_z) |
1490 S_SQ_TEX_WORD1_COORD_TYPE_W(tex->coord_type_w);
1491 bc->bytecode[id++] = S_SQ_TEX_WORD2_OFFSET_X(tex->offset_x) |
1492 S_SQ_TEX_WORD2_OFFSET_Y(tex->offset_y) |
1493 S_SQ_TEX_WORD2_OFFSET_Z(tex->offset_z) |
1494 S_SQ_TEX_WORD2_SAMPLER_ID(tex->sampler_id) |
1495 S_SQ_TEX_WORD2_SRC_SEL_X(tex->src_sel_x) |
1496 S_SQ_TEX_WORD2_SRC_SEL_Y(tex->src_sel_y) |
1497 S_SQ_TEX_WORD2_SRC_SEL_Z(tex->src_sel_z) |
1498 S_SQ_TEX_WORD2_SRC_SEL_W(tex->src_sel_w);
/* Fetch instructions are 4 dwords; the last one is always zero. */
1499 bc->bytecode[id++] = 0;
1503 /* r600 only, r700/eg bits in r700_asm.c */
/* Encode one ALU instruction as 2 dwords. Two WORD1 layouts exist: the
 * OP3 form (three sources, no abs/omod/write-mask fields) and the OP2
 * form; the if/else selecting between them falls on lines missing from
 * this chunk -- presumably keyed on the instruction being a 3-source op.
 * NOTE: both UPDATE_EXECUTE_MASK and UPDATE_PRED are driven by the same
 * alu->predicate flag. */
1504 static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
1506 /* don't replace gpr by pv or ps for destination register */
1507 bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) |
1508 S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) |
1509 S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) |
1510 S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) |
1511 S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) |
1512 S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) |
1513 S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) |
1514 S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) |
1515 S_SQ_ALU_WORD0_LAST(alu->last);
1518 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1519 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1520 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1521 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1522 S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) |
1523 S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) |
1524 S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) |
1525 S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) |
1526 S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) |
1527 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle);
1529 bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) |
1530 S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) |
1531 S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) |
1532 S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) |
1533 S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) |
1534 S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) |
1535 S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) |
1536 S_SQ_ALU_WORD1_OP2_OMOD(alu->omod) |
1537 S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) |
1538 S_SQ_ALU_WORD1_BANK_SWIZZLE(alu->bank_swizzle) |
1539 S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) |
1540 S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate);
/* Encode the 2-dword r600 CF header for a fetch clause: clause start
 * address (in instruction pairs) and its instruction count (ndw/4 fetch
 * instructions, encoded as count - 1). */
1545 static void r600_bc_cf_vtx_build(uint32_t *bytecode, const struct r600_bc_cf *cf)
1547 *bytecode++ = S_SQ_CF_WORD0_ADDR(cf->addr >> 1);
1548 *bytecode++ = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1549 S_SQ_CF_WORD1_BARRIER(1) |
1550 S_SQ_CF_WORD1_COUNT((cf->ndw / 4) - 1);
1553 /* common for r600/r700 - eg in eg_asm.c */
/* Encode one CF node's 2 control-flow dwords at its precomputed slot
 * (cf->id) according to the node kind: ALU clause header, fetch clause
 * header, export, or plain CF (jump/loop/call). Errors out on anything
 * else. NOTE(review): the switch statement itself and the break/return
 * lines are missing from this chunk. */
1554 static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
1556 unsigned id = cf->id;
/* ALU clause header: kcache setup + clause address and slot count. */
1559 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1560 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1561 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1562 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1563 bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1) |
1564 S_SQ_CF_ALU_WORD0_KCACHE_MODE0(cf->kcache[0].mode) |
1565 S_SQ_CF_ALU_WORD0_KCACHE_BANK0(cf->kcache[0].bank) |
1566 S_SQ_CF_ALU_WORD0_KCACHE_BANK1(cf->kcache[1].bank);
1568 bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) |
1569 S_SQ_CF_ALU_WORD1_KCACHE_MODE1(cf->kcache[1].mode) |
1570 S_SQ_CF_ALU_WORD1_KCACHE_ADDR0(cf->kcache[0].addr) |
1571 S_SQ_CF_ALU_WORD1_KCACHE_ADDR1(cf->kcache[1].addr) |
1572 S_SQ_CF_ALU_WORD1_BARRIER(1) |
1573 S_SQ_CF_ALU_WORD1_USES_WATERFALL(bc->chiprev == CHIPREV_R600 ? cf->r6xx_uses_waterfall : 0) |
1574 S_SQ_CF_ALU_WORD1_COUNT((cf->ndw / 2) - 1);
/* Fetch clause header: layout differs slightly on r700. */
1576 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1577 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1578 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1579 if (bc->chiprev == CHIPREV_R700)
1580 r700_bc_cf_vtx_build(&bc->bytecode[id], cf);
1582 r600_bc_cf_vtx_build(&bc->bytecode[id], cf);
/* Export: GPR source, target array base/type and swizzles. */
1584 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1585 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1586 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR(cf->output.gpr) |
1587 S_SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE(cf->output.elem_size) |
1588 S_SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE(cf->output.array_base) |
1589 S_SQ_CF_ALLOC_EXPORT_WORD0_TYPE(cf->output.type);
1590 bc->bytecode[id++] = S_SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT(cf->output.burst_count - 1) |
1591 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X(cf->output.swizzle_x) |
1592 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y(cf->output.swizzle_y) |
1593 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z(cf->output.swizzle_z) |
1594 S_SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W(cf->output.swizzle_w) |
1595 S_SQ_CF_ALLOC_EXPORT_WORD1_BARRIER(cf->output.barrier) |
1596 S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) |
1597 S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program);
/* Plain control flow: branch target address, condition, pop count. */
1599 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1600 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1601 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1602 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1603 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1604 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1605 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1606 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1607 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1608 bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1);
1609 bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) |
1610 S_SQ_CF_WORD1_BARRIER(1) |
1611 S_SQ_CF_WORD1_COND(cf->cond) |
1612 S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
1616 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
/* Final assembly: size the stack, lay out each CF node's clause address,
 * allocate the bytecode buffer, then emit CF headers followed by each
 * clause's ALU/VTX/TEX instructions (with trailing literal dwords for
 * ALU groups). NOTE(review): many original lines (switch statements,
 * error checks, breaks, declarations of addr/i/r/nliteral) are missing
 * from this chunk. */
1622 int r600_bc_build(struct r600_bc *bc)
1624 struct r600_bc_cf *cf;
1625 struct r600_bc_alu *alu;
1626 struct r600_bc_vtx *vtx;
1627 struct r600_bc_tex *tex;
1628 uint32_t literal[4];
/* Derive the required hardware stack depth from the deepest call level. */
1633 if (bc->callstack[0].max > 0)
1634 bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2;
1635 if (bc->type == TGSI_PROCESSOR_VERTEX && !bc->nstack) {
1639 /* first pass: compute addr of each CF block */
1640 /* addresses start after all the CF instructions */
1641 addr = bc->cf_last->id + 2;
1642 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1644 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1645 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1646 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1647 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1649 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1650 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1651 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1652 /* fetch node need to be 16 bytes aligned*/
1654 addr &= 0xFFFFFFFCUL;
1656 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1657 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1658 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1659 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1661 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1662 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1663 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1664 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1665 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1666 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1667 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1668 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1669 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1670 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1673 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
/* Total size: end of the last clause (dwords). */
1678 bc->ndw = cf->addr + cf->ndw;
1681 bc->bytecode = calloc(1, bc->ndw * 4);
1682 if (bc->bytecode == NULL)
/* second pass: emit CF headers and clause bodies. */
1684 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1686 if (bc->chiprev >= CHIPREV_EVERGREEN)
1687 r = eg_bc_cf_build(bc, cf);
1689 r = r600_bc_cf_build(bc, cf);
1693 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1694 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1695 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1696 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1698 memset(literal, 0, sizeof(literal));
1699 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1700 r = r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
1703 r600_bc_alu_adjust_literals(bc, alu, literal, nliteral);
1704 switch(bc->chiprev) {
1706 r = r600_bc_alu_build(bc, alu, addr);
1709 case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
1710 case CHIPREV_CAYMAN: /* eg alu is same encoding as r700 */
1711 r = r700_bc_alu_build(bc, alu, addr);
1714 R600_ERR("unknown family %d\n", bc->family);
/* Flush the group's literal dwords (padded to an even count). */
1721 for (i = 0; i < align(nliteral, 2); ++i) {
1722 bc->bytecode[addr++] = literal[i];
1725 memset(literal, 0, sizeof(literal));
1729 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1730 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1731 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1732 r = r600_bc_vtx_build(bc, vtx, addr);
/* On Cayman vertex fetches live in the TC (TEX) clause too. */
1738 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1739 if (bc->chiprev == CHIPREV_CAYMAN) {
1740 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1741 r = r600_bc_vtx_build(bc, vtx, addr);
1747 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1748 r = r600_bc_tex_build(bc, tex, addr);
/* These CF kinds carry no clause body; nothing more to emit. */
1754 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1755 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1756 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1757 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1758 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1759 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1760 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1761 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1762 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1763 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1764 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1765 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1766 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1767 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1770 R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
/* Free the bytecode buffer and every CF node together with its ALU, TEX
 * and VTX instruction lists, leaving 'bc' empty.
 * NOTE(review): the free()/LIST_DEL statements inside each loop body are
 * on lines missing from this chunk; only declarations and the list-reset
 * calls are visible. */
1777 void r600_bc_clear(struct r600_bc *bc)
1779 struct r600_bc_cf *cf = NULL, *next_cf;
1782 bc->bytecode = NULL;
1784 LIST_FOR_EACH_ENTRY_SAFE(cf, next_cf, &bc->cf, list) {
1785 struct r600_bc_alu *alu = NULL, *next_alu;
1786 struct r600_bc_tex *tex = NULL, *next_tex;
/* BUGFIX: vtx/next_vtx were declared as struct r600_bc_tex *, but they
 * walk &cf->vtx, a list of struct r600_bc_vtx. LIST_FOR_EACH_ENTRY_SAFE
 * derives the element pointer from the declared type's 'list' member
 * offset, so a differing offset between the two structs would make the
 * loop iterate over (and free) misaligned pointers. */
1787 struct r600_bc_vtx *vtx = NULL, *next_vtx;
1789 LIST_FOR_EACH_ENTRY_SAFE(alu, next_alu, &cf->alu, list) {
1793 LIST_INITHEAD(&cf->alu);
1795 LIST_FOR_EACH_ENTRY_SAFE(tex, next_tex, &cf->tex, list) {
1799 LIST_INITHEAD(&cf->tex);
1801 LIST_FOR_EACH_ENTRY_SAFE(vtx, next_vtx, &cf->vtx, list) {
1805 LIST_INITHEAD(&cf->vtx);
1810 LIST_INITHEAD(&cf->list);
/* Dump the assembled bytecode to stderr for debugging: one section per CF
 * node (CF header words first, then the clause's ALU/TEX/VTX instruction
 * words decoded field-by-field). Output-only; does not modify 'bc'.
 * NOTE(review): the outer switch statements, loop braces and the id/chip/
 * nliteral declarations fall on lines missing from this chunk. */
1813 void r600_bc_dump(struct r600_bc *bc)
1815 struct r600_bc_cf *cf = NULL;
1816 struct r600_bc_alu *alu = NULL;
1817 struct r600_bc_vtx *vtx = NULL;
1818 struct r600_bc_tex *tex = NULL;
1821 uint32_t literal[4];
1825 switch (bc->chiprev) {
1840 fprintf(stderr, "bytecode %d dw -- %d gprs ---------------------\n", bc->ndw, bc->ngpr);
1841 fprintf(stderr, " %c\n", chip);
/* CF headers: decode the two control-flow dwords of each node. */
1843 LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
1847 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3):
1848 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP_AFTER << 3):
1849 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_POP2_AFTER << 3):
1850 case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3):
1851 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1852 fprintf(stderr, "ADDR:%d ", cf->addr);
1853 fprintf(stderr, "KCACHE_MODE0:%X ", cf->kcache[0].mode);
1854 fprintf(stderr, "KCACHE_BANK0:%X ", cf->kcache[0].bank);
1855 fprintf(stderr, "KCACHE_BANK1:%X\n", cf->kcache[1].bank);
1857 fprintf(stderr, "%04d %08X ALU ", id, bc->bytecode[id]);
1858 fprintf(stderr, "INST:%d ", cf->inst);
1859 fprintf(stderr, "KCACHE_MODE1:%X ", cf->kcache[1].mode);
1860 fprintf(stderr, "KCACHE_ADDR0:%X ", cf->kcache[0].addr);
1861 fprintf(stderr, "KCACHE_ADDR1:%X ", cf->kcache[1].addr);
1862 fprintf(stderr, "COUNT:%d\n", cf->ndw / 2);
1864 case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
1865 case V_SQ_CF_WORD1_SQ_CF_INST_VTX:
1866 case V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC:
1867 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1868 fprintf(stderr, "ADDR:%d\n", cf->addr);
1870 fprintf(stderr, "%04d %08X TEX/VTX ", id, bc->bytecode[id]);
1871 fprintf(stderr, "INST:%d ", cf->inst);
1872 fprintf(stderr, "COUNT:%d\n", cf->ndw / 4);
1874 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1875 case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1876 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT:
1877 case EG_V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE:
1878 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1879 fprintf(stderr, "GPR:%X ", cf->output.gpr);
1880 fprintf(stderr, "ELEM_SIZE:%X ", cf->output.elem_size);
1881 fprintf(stderr, "ARRAY_BASE:%X ", cf->output.array_base);
1882 fprintf(stderr, "TYPE:%X\n", cf->output.type);
1884 fprintf(stderr, "%04d %08X EXPORT ", id, bc->bytecode[id]);
1885 fprintf(stderr, "SWIZ_X:%X ", cf->output.swizzle_x);
1886 fprintf(stderr, "SWIZ_Y:%X ", cf->output.swizzle_y);
1887 fprintf(stderr, "SWIZ_Z:%X ", cf->output.swizzle_z);
1888 fprintf(stderr, "SWIZ_W:%X ", cf->output.swizzle_w);
1889 fprintf(stderr, "BARRIER:%X ", cf->output.barrier);
1890 fprintf(stderr, "INST:%d ", cf->output.inst);
1891 fprintf(stderr, "BURST_COUNT:%d ", cf->output.burst_count);
1892 fprintf(stderr, "EOP:%X\n", cf->output.end_of_program);
1894 case V_SQ_CF_WORD1_SQ_CF_INST_JUMP:
1895 case V_SQ_CF_WORD1_SQ_CF_INST_ELSE:
1896 case V_SQ_CF_WORD1_SQ_CF_INST_POP:
1897 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL:
1898 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END:
1899 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE:
1900 case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
1901 case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
1902 case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
1903 case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
1904 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1905 fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
1907 fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
1908 fprintf(stderr, "INST:%d ", cf->inst);
1909 fprintf(stderr, "COND:%X ", cf->cond);
1910 fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
/* ALU clause bodies: 2 dwords per instruction plus literal dwords. */
1916 LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) {
1917 r600_bc_alu_nliterals(bc, alu, literal, &nliteral);
1919 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1920 fprintf(stderr, "SRC0(SEL:%d ", alu->src[0].sel);
1921 fprintf(stderr, "REL:%d ", alu->src[0].rel);
1922 fprintf(stderr, "CHAN:%d ", alu->src[0].chan);
1923 fprintf(stderr, "NEG:%d) ", alu->src[0].neg);
1924 fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel);
1925 fprintf(stderr, "REL:%d ", alu->src[1].rel);
1926 fprintf(stderr, "CHAN:%d ", alu->src[1].chan);
1927 fprintf(stderr, "NEG:%d) ", alu->src[1].neg);
1928 fprintf(stderr, "LAST:%d)\n", alu->last);
1930 fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' ');
1931 fprintf(stderr, "INST:%d ", alu->inst);
1932 fprintf(stderr, "DST(SEL:%d ", alu->dst.sel);
1933 fprintf(stderr, "CHAN:%d ", alu->dst.chan);
1934 fprintf(stderr, "REL:%d ", alu->dst.rel);
1935 fprintf(stderr, "CLAMP:%d) ", alu->dst.clamp);
1936 fprintf(stderr, "BANK_SWIZZLE:%d ", alu->bank_swizzle);
1938 fprintf(stderr, "SRC2(SEL:%d ", alu->src[2].sel);
1939 fprintf(stderr, "REL:%d ", alu->src[2].rel);
1940 fprintf(stderr, "CHAN:%d ", alu->src[2].chan);
1941 fprintf(stderr, "NEG:%d)\n", alu->src[2].neg);
1943 fprintf(stderr, "SRC0_ABS:%d ", alu->src[0].abs);
1944 fprintf(stderr, "SRC1_ABS:%d ", alu->src[1].abs);
1945 fprintf(stderr, "WRITE_MASK:%d ", alu->dst.write);
1946 fprintf(stderr, "OMOD:%d ", alu->omod);
/* Both encoder fields are driven by the same predicate flag. */
1947 fprintf(stderr, "EXECUTE_MASK:%d ", alu->predicate);
1948 fprintf(stderr, "UPDATE_PRED:%d\n", alu->predicate);
/* Trailing literal constants of the group, printed as floats. */
1953 for (i = 0; i < nliteral; i++, id++) {
1954 float *f = (float*)(bc->bytecode + id);
1955 fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
/* Texture-fetch clause bodies: 4 dwords per instruction. */
1962 LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
1963 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1964 fprintf(stderr, "INST:%d ", tex->inst);
1965 fprintf(stderr, "RESOURCE_ID:%d ", tex->resource_id);
1966 fprintf(stderr, "SRC(GPR:%d ", tex->src_gpr);
1967 fprintf(stderr, "REL:%d)\n", tex->src_rel);
1969 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1970 fprintf(stderr, "DST(GPR:%d ", tex->dst_gpr);
1971 fprintf(stderr, "REL:%d ", tex->dst_rel);
1972 fprintf(stderr, "SEL_X:%d ", tex->dst_sel_x);
1973 fprintf(stderr, "SEL_Y:%d ", tex->dst_sel_y);
1974 fprintf(stderr, "SEL_Z:%d ", tex->dst_sel_z);
1975 fprintf(stderr, "SEL_W:%d) ", tex->dst_sel_w);
1976 fprintf(stderr, "LOD_BIAS:%d ", tex->lod_bias);
1977 fprintf(stderr, "COORD_TYPE_X:%d ", tex->coord_type_x);
1978 fprintf(stderr, "COORD_TYPE_Y:%d ", tex->coord_type_y);
1979 fprintf(stderr, "COORD_TYPE_Z:%d ", tex->coord_type_z);
1980 fprintf(stderr, "COORD_TYPE_W:%d\n", tex->coord_type_w);
1982 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1983 fprintf(stderr, "OFFSET_X:%d ", tex->offset_x);
1984 fprintf(stderr, "OFFSET_Y:%d ", tex->offset_y);
1985 fprintf(stderr, "OFFSET_Z:%d ", tex->offset_z);
1986 fprintf(stderr, "SAMPLER_ID:%d ", tex->sampler_id);
1987 fprintf(stderr, "SRC(SEL_X:%d ", tex->src_sel_x);
1988 fprintf(stderr, "SEL_Y:%d ", tex->src_sel_y);
1989 fprintf(stderr, "SEL_Z:%d ", tex->src_sel_z);
1990 fprintf(stderr, "SEL_W:%d)\n", tex->src_sel_w);
1992 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
/* Vertex-fetch clause bodies: 4 dwords per instruction. */
1996 LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
1997 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
1998 fprintf(stderr, "INST:%d ", vtx->inst);
1999 fprintf(stderr, "FETCH_TYPE:%d ", vtx->fetch_type);
2000 fprintf(stderr, "BUFFER_ID:%d\n", vtx->buffer_id);
2002 /* This assumes that no semantic fetches exist */
2003 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2004 fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
2005 fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
2006 if (bc->chiprev < CHIPREV_CAYMAN)
2007 fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
2009 fprintf(stderr, "SEL_Y:%d) ", 0);
2010 fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
2011 fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
2012 fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);
2013 fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
2014 fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
2015 fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
2016 fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
2017 fprintf(stderr, "NUM:%d ", vtx->num_format_all);
2018 fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
2019 fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
2021 fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
2022 fprintf(stderr, "ENDIAN:%d ", vtx->endian);
2023 fprintf(stderr, "OFFSET:%d\n", vtx->offset);
2026 fprintf(stderr, "%04d %08X \n", id, bc->bytecode[id]);
2031 fprintf(stderr, "--------------------------------------\n");
/* Map a Gallium vertex pipe_format onto the hardware fetch description:
 * FMT_* data format, number format (num_format), component signedness
 * (format_comp) and endian-swap code. Only plain-layout formats whose
 * channels share one type/size are handled; anything else presumably
 * falls through to the unsupported-format error at the bottom.
 * NOTE(review): the out-label plumbing, several size cases and the
 * assignments for num_format/format_comp are on lines missing from this
 * chunk. */
2034 static void r600_vertex_data_type(enum pipe_format pformat, unsigned *format,
2035 unsigned *num_format, unsigned *format_comp, unsigned *endian)
2037 const struct util_format_description *desc;
2043 *endian = ENDIAN_NONE;
2045 desc = util_format_description(pformat);
/* Compressed/subsampled/etc. layouts cannot be fetched directly. */
2046 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
2050 /* Find the first non-VOID channel. */
2051 for (i = 0; i < 4; i++) {
2052 if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) {
2057 *endian = r600_endian_swap(desc->channel[i].size);
2059 switch (desc->channel[i].type) {
2060 /* Half-floats, floats, ints */
2061 case UTIL_FORMAT_TYPE_FLOAT:
2062 switch (desc->channel[i].size) {
2064 switch (desc->nr_channels) {
2066 *format = FMT_16_FLOAT;
2069 *format = FMT_16_16_FLOAT;
2073 *format = FMT_16_16_16_16_FLOAT;
2078 switch (desc->nr_channels) {
2080 *format = FMT_32_FLOAT;
2083 *format = FMT_32_32_FLOAT;
2086 *format = FMT_32_32_32_FLOAT;
2089 *format = FMT_32_32_32_32_FLOAT;
/* Unsigned and signed integers share the same FMT_* selection. */
2098 case UTIL_FORMAT_TYPE_UNSIGNED:
2100 case UTIL_FORMAT_TYPE_SIGNED:
2101 switch (desc->channel[i].size) {
2103 switch (desc->nr_channels) {
2112 *format = FMT_8_8_8_8;
2117 switch (desc->nr_channels) {
2122 *format = FMT_16_16;
2126 *format = FMT_16_16_16_16;
2131 switch (desc->nr_channels) {
2136 *format = FMT_32_32;
2139 *format = FMT_32_32_32;
2142 *format = FMT_32_32_32_32;
/* Signedness and normalization refine format_comp/num_format
 * (the actual assignments fall on missing lines). */
2154 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
2157 if (desc->channel[i].normalized) {
2164 R600_ERR("unsupported vertex format %s\n", util_format_name(pformat));
2167 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
2169 static int dump_shaders = -1;
2172 struct r600_bc_vtx vtx;
2173 struct pipe_vertex_element *elements = ve->elements;
2174 const struct util_format_description *desc;
2175 unsigned fetch_resource_start = rctx->family >= CHIP_CEDAR ? 0 : 160;
2176 unsigned format, num_format, format_comp, endian;
2180 /* Vertex element offsets need special handling. If the offset is
2181 * bigger than what we can put in the fetch instruction we need to
2182 * alter the vertex resource offset. In order to simplify code we
2183 * will bind one resource per element in such cases. It's a worst
2185 for (i = 0; i < ve->count; i++) {
2186 ve->vbuffer_offset[i] = C_SQ_VTX_WORD2_OFFSET & elements[i].src_offset;
2187 if (ve->vbuffer_offset[i]) {
2188 ve->vbuffer_need_offset = 1;
2192 memset(&bc, 0, sizeof(bc));
2193 r = r600_bc_init(&bc, r600_get_family(rctx->radeon));
2197 for (i = 0; i < ve->count; i++) {
2198 if (elements[i].instance_divisor > 1) {
2199 struct r600_bc_alu alu;
2201 memset(&alu, 0, sizeof(alu));
2202 alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MULHI_UINT);
2204 alu.src[0].chan = 3;
2206 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2207 alu.src[1].value = (1ll << 32) / elements[i].instance_divisor + 1;
2209 alu.dst.sel = i + 1;
2214 if ((r = r600_bc_add_alu(&bc, &alu))) {
2221 for (i = 0; i < ve->count; i++) {
2222 unsigned vbuffer_index;
2223 r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp, &endian);
2224 desc = util_format_description(ve->elements[i].src_format);
2227 R600_ERR("unknown format %d\n", ve->elements[i].src_format);
2231 /* see above for vbuffer_need_offset explanation */
2232 vbuffer_index = elements[i].vertex_buffer_index;
2233 memset(&vtx, 0, sizeof(vtx));
2234 vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start;
2235 vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
2236 vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
2237 vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
2238 vtx.mega_fetch_count = 0x1F;
2239 vtx.dst_gpr = i + 1;
2240 vtx.dst_sel_x = desc->swizzle[0];
2241 vtx.dst_sel_y = desc->swizzle[1];
2242 vtx.dst_sel_z = desc->swizzle[2];
2243 vtx.dst_sel_w = desc->swizzle[3];
2244 vtx.data_format = format;
2245 vtx.num_format_all = num_format;
2246 vtx.format_comp_all = format_comp;
2247 vtx.srf_mode_all = 1;
2248 vtx.offset = elements[i].src_offset;
2249 vtx.endian = endian;
2251 if ((r = r600_bc_add_vtx(&bc, &vtx))) {
2257 r600_bc_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
2259 if ((r = r600_bc_build(&bc))) {
2264 if (dump_shaders == -1)
2265 dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
2268 fprintf(stderr, "--------------------------------------------------------------\n");
2270 fprintf(stderr, "______________________________________________________________\n");
2273 ve->fs_size = bc.ndw*4;
2275 /* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
2276 ve->fetch_shader = r600_bo(rctx->radeon, ve->fs_size, 256, PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_IMMUTABLE);
2277 if (ve->fetch_shader == NULL) {
2282 bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
2283 if (bytecode == NULL) {
2285 r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
2289 if (R600_BIG_ENDIAN) {
2290 for (i = 0; i < ve->fs_size / 4; ++i) {
2291 bytecode[i] = bswap_32(bc.bytecode[i]);
2294 memcpy(bytecode, bc.bytecode, ve->fs_size);
2297 r600_bo_unmap(rctx->radeon, ve->fetch_shader);
2300 if (rctx->family >= CHIP_CEDAR)
2301 evergreen_fetch_shader(&rctx->context, ve);
2303 r600_fetch_shader(&rctx->context, ve);