Tizen 2.1 base
[sdk/emulator/qemu.git] / gl / mesa / src / gallium / drivers / r300 / compiler / r3xx_vertprog.c
1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program_alu.h"
32 #include "radeon_swizzle.h"
33 #include "radeon_emulate_branches.h"
34 #include "radeon_emulate_loops.h"
35 #include "radeon_remove_constants.h"
36
/* Bookkeeping for one BGNLOOP that is still open during translation. */
struct loop {
	/* Instruction index (code length / 4) recorded when BGNLOOP was seen. */
	int BgnLoop;
};
41
/*
 * Take an already-setup and valid source (operand `x` of the current
 * instruction) and swizzle all four of its components to the selector `y`
 * in order to obtain a constant ZERO or ONE source.
 *
 * The expansion refers to the identifiers `vp` and `vpi`, so both must be
 * in scope wherever the macro is used.  Negation is disabled
 * (RC_MASK_NONE); the relative-addressing bit of the source is kept.
 */
#define __CONST(x, y) \
	(PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
			 t_swizzle(y), t_swizzle(y), \
			 t_swizzle(y), t_swizzle(y), \
			 t_src_class(vpi->SrcReg[x].File), \
			 RC_MASK_NONE) | \
	 (vpi->SrcReg[x].RelAddr << 4))
54
55
56 static unsigned long t_dst_mask(unsigned int mask)
57 {
58         /* RC_MASK_* is equivalent to VSF_FLAG_* */
59         return mask & RC_MASK_XYZW;
60 }
61
62 static unsigned long t_dst_class(rc_register_file file)
63 {
64         switch (file) {
65         default:
66                 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
67                 /* fall-through */
68         case RC_FILE_TEMPORARY:
69                 return PVS_DST_REG_TEMPORARY;
70         case RC_FILE_OUTPUT:
71                 return PVS_DST_REG_OUT;
72         case RC_FILE_ADDRESS:
73                 return PVS_DST_REG_A0;
74         }
75 }
76
77 static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
78                                  struct rc_dst_register *dst)
79 {
80         if (dst->File == RC_FILE_OUTPUT)
81                 return vp->outputs[dst->Index];
82
83         return dst->Index;
84 }
85
86 static unsigned long t_src_class(rc_register_file file)
87 {
88         switch (file) {
89         default:
90                 fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);
91                 /* fall-through */
92         case RC_FILE_NONE:
93         case RC_FILE_TEMPORARY:
94                 return PVS_SRC_REG_TEMPORARY;
95         case RC_FILE_INPUT:
96                 return PVS_SRC_REG_INPUT;
97         case RC_FILE_CONSTANT:
98                 return PVS_SRC_REG_CONSTANT;
99         }
100 }
101
102 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
103 {
104         unsigned long aclass = t_src_class(a.File);
105         unsigned long bclass = t_src_class(b.File);
106
107         if (aclass != bclass)
108                 return 0;
109         if (aclass == PVS_SRC_REG_TEMPORARY)
110                 return 0;
111
112         if (a.RelAddr || b.RelAddr)
113                 return 1;
114         if (a.Index != b.Index)
115                 return 1;
116
117         return 0;
118 }
119
/*
 * Translate a swizzle selector to its hardware encoding.  This is in fact
 * a NOP, as the Mesa RC_SWIZZLE_* values are all identical to
 * VSF_IN_COMPONENT_*; the value is only widened to unsigned long.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
	return (unsigned long)swizzle;
}
125
126 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
127                                  struct rc_src_register *src)
128 {
129         if (src->File == RC_FILE_INPUT) {
130                 assert(vp->inputs[src->Index] != -1);
131                 return vp->inputs[src->Index];
132         } else {
133                 if (src->Index < 0) {
134                         fprintf(stderr,
135                                 "negative offsets for indirect addressing do not work.\n");
136                         return 0;
137                 }
138                 return src->Index;
139         }
140 }
141
142 /* these two functions should probably be merged... */
143
144 static unsigned long t_src(struct r300_vertex_program_code *vp,
145                            struct rc_src_register *src)
146 {
147         /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
148          * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
149          */
150         return PVS_SRC_OPERAND(t_src_index(vp, src),
151                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
152                                t_swizzle(GET_SWZ(src->Swizzle, 1)),
153                                t_swizzle(GET_SWZ(src->Swizzle, 2)),
154                                t_swizzle(GET_SWZ(src->Swizzle, 3)),
155                                t_src_class(src->File),
156                                src->Negate) |
157                (src->RelAddr << 4) | (src->Abs << 3);
158 }
159
160 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
161                                   struct rc_src_register *src)
162 {
163         /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
164          * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
165          */
166         unsigned int swz = rc_get_scalar_src_swz(src->Swizzle);
167
168         return PVS_SRC_OPERAND(t_src_index(vp, src),
169                                t_swizzle(swz),
170                                t_swizzle(swz),
171                                t_swizzle(swz),
172                                t_swizzle(swz),
173                                t_src_class(src->File),
174                                src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
175                (src->RelAddr << 4) | (src->Abs << 3);
176 }
177
178 static int valid_dst(struct r300_vertex_program_code *vp,
179                            struct rc_dst_register *dst)
180 {
181         if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
182                 return 0;
183         } else if (dst->File == RC_FILE_ADDRESS) {
184                 assert(dst->Index == 0);
185         }
186
187         return 1;
188 }
189
190 static void ei_vector1(struct r300_vertex_program_code *vp,
191                                 unsigned int hw_opcode,
192                                 struct rc_sub_instruction *vpi,
193                                 unsigned int * inst)
194 {
195         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
196                                      0,
197                                      0,
198                                      t_dst_index(vp, &vpi->DstReg),
199                                      t_dst_mask(vpi->DstReg.WriteMask),
200                                      t_dst_class(vpi->DstReg.File));
201         inst[1] = t_src(vp, &vpi->SrcReg[0]);
202         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
203         inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
204 }
205
206 static void ei_vector2(struct r300_vertex_program_code *vp,
207                                 unsigned int hw_opcode,
208                                 struct rc_sub_instruction *vpi,
209                                 unsigned int * inst)
210 {
211         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
212                                      0,
213                                      0,
214                                      t_dst_index(vp, &vpi->DstReg),
215                                      t_dst_mask(vpi->DstReg.WriteMask),
216                                      t_dst_class(vpi->DstReg.File));
217         inst[1] = t_src(vp, &vpi->SrcReg[0]);
218         inst[2] = t_src(vp, &vpi->SrcReg[1]);
219         inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
220 }
221
222 static void ei_math1(struct r300_vertex_program_code *vp,
223                                 unsigned int hw_opcode,
224                                 struct rc_sub_instruction *vpi,
225                                 unsigned int * inst)
226 {
227         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
228                                      1,
229                                      0,
230                                      t_dst_index(vp, &vpi->DstReg),
231                                      t_dst_mask(vpi->DstReg.WriteMask),
232                                      t_dst_class(vpi->DstReg.File));
233         inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
234         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
235         inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
236 }
237
238 static void ei_lit(struct r300_vertex_program_code *vp,
239                                       struct rc_sub_instruction *vpi,
240                                       unsigned int * inst)
241 {
242         //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
243
244         inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
245                                      1,
246                                      0,
247                                      t_dst_index(vp, &vpi->DstReg),
248                                      t_dst_mask(vpi->DstReg.WriteMask),
249                                      t_dst_class(vpi->DstReg.File));
250         /* NOTE: Users swizzling might not work. */
251         inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),      // X
252                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
253                                   PVS_SRC_SELECT_FORCE_0,       // Z
254                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),        // Y
255                                   t_src_class(vpi->SrcReg[0].File),
256                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
257             (vpi->SrcReg[0].RelAddr << 4);
258         inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
259                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
260                                   PVS_SRC_SELECT_FORCE_0,       // Z
261                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
262                                   t_src_class(vpi->SrcReg[0].File),
263                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
264             (vpi->SrcReg[0].RelAddr << 4);
265         inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
266                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
267                                   PVS_SRC_SELECT_FORCE_0,       // Z
268                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
269                                   t_src_class(vpi->SrcReg[0].File),
270                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
271             (vpi->SrcReg[0].RelAddr << 4);
272 }
273
274 static void ei_mad(struct r300_vertex_program_code *vp,
275                                       struct rc_sub_instruction *vpi,
276                                       unsigned int * inst)
277 {
278         unsigned int i;
279         /* Remarks about hardware limitations of MAD
280          * (please preserve this comment, as this information is _NOT_
281          * in the documentation provided by AMD).
282          *
283          * As described in the documentation, MAD with three unique temporary
284          * source registers requires the use of the macro version.
285          *
286          * However (and this is not mentioned in the documentation), apparently
287          * the macro version is _NOT_ a full superset of the normal version.
288          * In particular, the macro version does not always work when relative
289          * addressing is used in the source operands.
290          *
291          * This limitation caused incorrect rendering in Sauerbraten's OpenGL
292          * assembly shader path when using medium quality animations
293          * (i.e. animations with matrix blending instead of quaternion blending).
294          *
295          * Unfortunately, I (nha) have been unable to extract a Piglit regression
296          * test for this issue - for some reason, it is possible to have vertex
297          * programs whose prefix is *exactly* the same as the prefix of the
298          * offending program in Sauerbraten up to the offending instruction
299          * without causing any trouble.
300          *
301          * Bottom line: Only use the macro version only when really necessary;
302          * according to AMD docs, this should improve performance by one clock
303          * as a nice side bonus.
304          */
305         if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
306             vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
307             vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
308             vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
309             vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
310             vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
311                 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
312                                 0,
313                                 1,
314                                 t_dst_index(vp, &vpi->DstReg),
315                                 t_dst_mask(vpi->DstReg.WriteMask),
316                                 t_dst_class(vpi->DstReg.File));
317         } else {
318                 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
319                                 0,
320                                 0,
321                                 t_dst_index(vp, &vpi->DstReg),
322                                 t_dst_mask(vpi->DstReg.WriteMask),
323                                 t_dst_class(vpi->DstReg.File));
324
325                 /* Arguments with constant swizzles still count as a unique
326                  * temporary, so we should make sure these arguments share a
327                  * register index with one of the other arguments. */
328                 for (i = 0; i < 3; i++) {
329                         unsigned int j;
330                         if (vpi->SrcReg[i].File != RC_FILE_NONE)
331                                 continue;
332
333                         for (j = 0; j < 3; j++) {
334                                 if (i != j) {
335                                         vpi->SrcReg[i].Index =
336                                                 vpi->SrcReg[j].Index;
337                                         break;
338                                 }
339                         }
340                 }
341         }
342         inst[1] = t_src(vp, &vpi->SrcReg[0]);
343         inst[2] = t_src(vp, &vpi->SrcReg[1]);
344         inst[3] = t_src(vp, &vpi->SrcReg[2]);
345 }
346
347 static void ei_pow(struct r300_vertex_program_code *vp,
348                                       struct rc_sub_instruction *vpi,
349                                       unsigned int * inst)
350 {
351         inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
352                                      1,
353                                      0,
354                                      t_dst_index(vp, &vpi->DstReg),
355                                      t_dst_mask(vpi->DstReg.WriteMask),
356                                      t_dst_class(vpi->DstReg.File));
357         inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
358         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
359         inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
360 }
361
362 static void mark_write(void * userdata, struct rc_instruction * inst,
363                 rc_register_file file,  unsigned int index, unsigned int mask)
364 {
365         unsigned int * writemasks = userdata;
366
367         if (file != RC_FILE_TEMPORARY)
368                 return;
369
370         if (index >= R300_VS_MAX_TEMPS)
371                 return;
372
373         writemasks[index] |= mask;
374 }
375
376 static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
377 {
378         return PVS_SRC_OPERAND(compiler->PredicateIndex,
379                 t_swizzle(RC_SWIZZLE_ZERO),
380                 t_swizzle(RC_SWIZZLE_ZERO),
381                 t_swizzle(RC_SWIZZLE_ZERO),
382                 t_swizzle(RC_SWIZZLE_W),
383                 t_src_class(RC_FILE_TEMPORARY),
384                 0);
385 }
386
387 static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
388                                         unsigned int hw_opcode, int is_math)
389 {
390         return PVS_OP_DST_OPERAND(hw_opcode,
391              is_math,
392              0,
393              compiler->PredicateIndex,
394              RC_MASK_W,
395              t_dst_class(RC_FILE_TEMPORARY));
396
397 }
398
399 static void ei_if(struct r300_vertex_program_compiler * compiler,
400                                         struct rc_instruction *rci,
401                                         unsigned int * inst,
402                                         unsigned int branch_depth)
403 {
404         unsigned int predicate_opcode;
405         int is_math = 0;
406
407         if (!compiler->Base.is_r500) {
408                 rc_error(&compiler->Base,"Opcode IF not supported\n");
409                 return;
410         }
411
412         /* Reserve a temporary to use as our predicate stack counter, if we
413          * don't already have one. */
414         if (!compiler->PredicateMask) {
415                 unsigned int writemasks[RC_REGISTER_MAX_INDEX];
416                 struct rc_instruction * inst;
417                 unsigned int i;
418                 memset(writemasks, 0, sizeof(writemasks));
419                 for(inst = compiler->Base.Program.Instructions.Next;
420                                 inst != &compiler->Base.Program.Instructions;
421                                                         inst = inst->Next) {
422                         rc_for_all_writes_mask(inst, mark_write, writemasks);
423                 }
424                 for(i = 0; i < compiler->Base.max_temp_regs; i++) {
425                         unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
426                         /* Only the W component can be used fo the predicate
427                          * stack counter. */
428                         if (mask & RC_MASK_W) {
429                                 compiler->PredicateMask = RC_MASK_W;
430                                 compiler->PredicateIndex = i;
431                                 break;
432                         }
433                 }
434                 if (i == compiler->Base.max_temp_regs) {
435                         rc_error(&compiler->Base, "No free temporary to use for"
436                                         " predicate stack counter.\n");
437                         return;
438                 }
439         }
440         predicate_opcode =
441                         branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
442
443         rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
444         if (branch_depth == 0) {
445                 is_math = 1;
446                 predicate_opcode = ME_PRED_SET_NEQ;
447                 inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
448                 inst[2] = 0;
449         } else {
450                 predicate_opcode = VE_PRED_SET_NEQ_PUSH;
451                 inst[1] = t_pred_src(compiler);
452                 inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
453         }
454
455         inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
456         inst[3] = 0;
457
458 }
459
460 static void ei_else(struct r300_vertex_program_compiler * compiler,
461                                                         unsigned int * inst)
462 {
463         if (!compiler->Base.is_r500) {
464                 rc_error(&compiler->Base,"Opcode ELSE not supported\n");
465                 return;
466         }
467         inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
468         inst[1] = t_pred_src(compiler);
469         inst[2] = 0;
470         inst[3] = 0;
471 }
472
473 static void ei_endif(struct r300_vertex_program_compiler *compiler,
474                                                         unsigned int * inst)
475 {
476         if (!compiler->Base.is_r500) {
477                 rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
478                 return;
479         }
480         inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
481         inst[1] = t_pred_src(compiler);
482         inst[2] = 0;
483         inst[3] = 0;
484 }
485
486 static void translate_vertex_program(struct radeon_compiler *c, void *user)
487 {
488         struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
489         struct rc_instruction *rci;
490
491         struct loop * loops = NULL;
492         int current_loop_depth = 0;
493         int loops_reserved = 0;
494
495         unsigned int branch_depth = 0;
496
497         compiler->code->pos_end = 0;    /* Not supported yet */
498         compiler->code->length = 0;
499         compiler->code->num_temporaries = 0;
500
501         compiler->SetHwInputOutput(compiler);
502
503         for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
504                 struct rc_sub_instruction *vpi = &rci->U.I;
505                 unsigned int *inst = compiler->code->body.d + compiler->code->length;
506                 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
507
508                 /* Skip instructions writing to non-existing destination */
509                 if (!valid_dst(compiler->code, &vpi->DstReg))
510                         continue;
511
512                 if (info->HasDstReg) {
513                         /* Neither is Saturate. */
514                         if (vpi->SaturateMode != RC_SATURATE_NONE) {
515                                 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
516                                          "modifier (yet).\n");
517                         }
518                 }
519
520                 if (compiler->code->length >= c->max_alu_insts * 4) {
521                         rc_error(&compiler->Base, "Vertex program has too many instructions\n");
522                         return;
523                 }
524
525                 assert(compiler->Base.is_r500 ||
526                        (vpi->Opcode != RC_OPCODE_SEQ &&
527                         vpi->Opcode != RC_OPCODE_SNE));
528
529                 switch (vpi->Opcode) {
530                 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
531                 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
532                 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
533                 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
534                 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
535                 case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
536                 case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
537                 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
538                 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
539                 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
540                 case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
541                 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
542                 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
543                 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
544                 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
545                 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
546                 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
547                 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
548                 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
549                 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
550                 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
551                 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
552                 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
553                 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
554                 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
555                 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
556                 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
557                 case RC_OPCODE_BGNLOOP:
558                 {
559                         struct loop * l;
560
561                         if ((!compiler->Base.is_r500
562                                 && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
563                                 || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
564                                 rc_error(&compiler->Base,
565                                                 "Loops are nested too deep.");
566                                 return;
567                         }
568                         memory_pool_array_reserve(&compiler->Base.Pool,
569                                         struct loop, loops, current_loop_depth,
570                                         loops_reserved, 1);
571                         l = &loops[current_loop_depth++];
572                         memset(l , 0, sizeof(struct loop));
573                         l->BgnLoop = (compiler->code->length / 4);
574                         continue;
575                 }
576                 case RC_OPCODE_ENDLOOP:
577                 {
578                         struct loop * l;
579                         unsigned int act_addr;
580                         unsigned int last_addr;
581                         unsigned int ret_addr;
582
583                         assert(loops);
584                         l = &loops[current_loop_depth - 1];
585                         act_addr = l->BgnLoop - 1;
586                         last_addr = (compiler->code->length / 4) - 1;
587                         ret_addr = l->BgnLoop;
588
589                         if (loops_reserved >= R300_VS_MAX_FC_OPS) {
590                                 rc_error(&compiler->Base,
591                                         "Too many flow control instructions.");
592                                 return;
593                         }
594                         if (compiler->Base.is_r500) {
595                                 compiler->code->fc_op_addrs.r500
596                                         [compiler->code->num_fc_ops].lw =
597                                         R500_PVS_FC_ACT_ADRS(act_addr)
598                                         | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
599                                         ;
600                                 compiler->code->fc_op_addrs.r500
601                                         [compiler->code->num_fc_ops].uw =
602                                         R500_PVS_FC_LAST_INST(last_addr)
603                                         | R500_PVS_FC_RTN_INST(ret_addr)
604                                         ;
605                         } else {
606                                 compiler->code->fc_op_addrs.r300
607                                         [compiler->code->num_fc_ops] =
608                                         R300_PVS_FC_ACT_ADRS(act_addr)
609                                         | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
610                                         | R300_PVS_FC_LAST_INST(last_addr)
611                                         | R300_PVS_FC_RTN_INST(ret_addr)
612                                         ;
613                         }
614                         compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
615                                 R300_PVS_FC_LOOP_INIT_VAL(0x0)
616                                 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
617                                 ;
618                         compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
619                                                 compiler->code->num_fc_ops);
620                         compiler->code->num_fc_ops++;
621                         current_loop_depth--;
622                         continue;
623                 }
624
625                 default:
626                         rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
627                         return;
628                 }
629
630                 /* Non-flow control instructions that are inside an if statement
631                  * need to pay attention to the predicate bit. */
632                 if (branch_depth
633                         && vpi->Opcode != RC_OPCODE_IF
634                         && vpi->Opcode != RC_OPCODE_ELSE
635                         && vpi->Opcode != RC_OPCODE_ENDIF) {
636
637                         inst[0] |= (PVS_DST_PRED_ENABLE_MASK
638                                                 << PVS_DST_PRED_ENABLE_SHIFT);
639                         inst[0] |= (PVS_DST_PRED_SENSE_MASK
640                                                 << PVS_DST_PRED_SENSE_SHIFT);
641                 }
642
643                 /* Update the number of temporaries. */
644                 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
645                     vpi->DstReg.Index >= compiler->code->num_temporaries)
646                         compiler->code->num_temporaries = vpi->DstReg.Index + 1;
647
648                 for (unsigned i = 0; i < info->NumSrcRegs; i++)
649                         if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
650                             vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
651                                 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
652
653                 if (compiler->PredicateMask)
654                         if (compiler->PredicateIndex >= compiler->code->num_temporaries)
655                                 compiler->code->num_temporaries = compiler->PredicateIndex + 1;
656
657                 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
658                         rc_error(&compiler->Base, "Too many temporaries.\n");
659                         return;
660                 }
661
662                 compiler->code->length += 4;
663
664                 if (compiler->Base.Error)
665                         return;
666         }
667 }
668
/**
 * Per-original-temporary bookkeeping used by allocate_temporary_registers().
 */
struct temporary_allocation {
        /** Set once a hardware temporary has been picked for this register. */
        unsigned int Allocated:1;
        /** Index of the hardware temporary picked for this register. */
        unsigned int HwTemp:15;
        /** Instruction recorded as the final reader of this register. */
        struct rc_instruction *LastRead;
};
674
675 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
676 {
677         struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
678         struct rc_instruction *inst;
679         struct rc_instruction *end_loop = NULL;
680         unsigned int num_orig_temps = 0;
681         char hwtemps[RC_REGISTER_MAX_INDEX];
682         struct temporary_allocation * ta;
683         unsigned int i, j;
684
685         memset(hwtemps, 0, sizeof(hwtemps));
686
687         rc_recompute_ips(c);
688
689         /* Pass 1: Count original temporaries. */
690         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
691                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
692
693                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
694                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
695                                 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
696                                         num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
697                         }
698                 }
699
700                 if (opcode->HasDstReg) {
701                         if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
702                                 if (inst->U.I.DstReg.Index >= num_orig_temps)
703                                         num_orig_temps = inst->U.I.DstReg.Index + 1;
704                         }
705                 }
706         }
707
708         ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
709                         sizeof(struct temporary_allocation) * num_orig_temps);
710         memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
711
712         /* Pass 2: Determine original temporary lifetimes */
713         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
714                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
715                 /* Instructions inside of loops need to use the ENDLOOP
716                  * instruction as their LastRead. */
717                 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
718                         int endloops = 1;
719                         struct rc_instruction * ptr;
720                         for(ptr = inst->Next;
721                                 ptr != &compiler->Base.Program.Instructions;
722                                                         ptr = ptr->Next){
723                                 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
724                                         endloops++;
725                                 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
726                                         endloops--;
727                                         if (endloops <= 0) {
728                                                 end_loop = ptr;
729                                                 break;
730                                         }
731                                 }
732                         }
733                 }
734
735                 if (inst == end_loop) {
736                         end_loop = NULL;
737                         continue;
738                 }
739
740                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
741                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
742                                 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
743                         }
744                 }
745         }
746
747         /* Pass 3: Register allocation */
748         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
749                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
750
751                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
752                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
753                                 unsigned int orig = inst->U.I.SrcReg[i].Index;
754                                 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
755
756                                 if (ta[orig].Allocated && inst == ta[orig].LastRead)
757                                         hwtemps[ta[orig].HwTemp] = 0;
758                         }
759                 }
760
761                 if (opcode->HasDstReg) {
762                         if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
763                                 unsigned int orig = inst->U.I.DstReg.Index;
764
765                                 if (!ta[orig].Allocated) {
766                                         for(j = 0; j < c->max_temp_regs; ++j) {
767                                                 if (!hwtemps[j])
768                                                         break;
769                                         }
770                                         ta[orig].Allocated = 1;
771                                         ta[orig].HwTemp = j;
772                                         hwtemps[ta[orig].HwTemp] = 1;
773                                 }
774
775                                 inst->U.I.DstReg.Index = ta[orig].HwTemp;
776                         }
777                 }
778         }
779 }
780
781 /**
782  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
783  * and the Saturate opcode modifier. Only Absolute is currently transformed.
784  */
785 static int transform_nonnative_modifiers(
786         struct radeon_compiler *c,
787         struct rc_instruction *inst,
788         void* unused)
789 {
790         const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
791         unsigned i;
792
793         /* Transform ABS(a) to MAX(a, -a). */
794         for (i = 0; i < opcode->NumSrcRegs; i++) {
795                 if (inst->U.I.SrcReg[i].Abs) {
796                         struct rc_instruction *new_inst;
797                         unsigned temp;
798
799                         inst->U.I.SrcReg[i].Abs = 0;
800
801                         temp = rc_find_free_temporary(c);
802
803                         new_inst = rc_insert_new_instruction(c, inst->Prev);
804                         new_inst->U.I.Opcode = RC_OPCODE_MAX;
805                         new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
806                         new_inst->U.I.DstReg.Index = temp;
807                         new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
808                         new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
809                         new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
810
811                         memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
812                         inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
813                         inst->U.I.SrcReg[i].Index = temp;
814                         inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
815                 }
816         }
817         return 1;
818 }
819
820 /**
821  * Vertex engine cannot read two inputs or two constants at the same time.
822  * Introduce intermediate MOVs to temporary registers to account for this.
823  */
824 static int transform_source_conflicts(
825         struct radeon_compiler *c,
826         struct rc_instruction* inst,
827         void* unused)
828 {
829         const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
830
831         if (opcode->NumSrcRegs == 3) {
832                 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
833                     || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
834                         int tmpreg = rc_find_free_temporary(c);
835                         struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
836                         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
837                         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
838                         inst_mov->U.I.DstReg.Index = tmpreg;
839                         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
840
841                         reset_srcreg(&inst->U.I.SrcReg[2]);
842                         inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
843                         inst->U.I.SrcReg[2].Index = tmpreg;
844                 }
845         }
846
847         if (opcode->NumSrcRegs >= 2) {
848                 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
849                         int tmpreg = rc_find_free_temporary(c);
850                         struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
851                         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
852                         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
853                         inst_mov->U.I.DstReg.Index = tmpreg;
854                         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
855
856                         reset_srcreg(&inst->U.I.SrcReg[1]);
857                         inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
858                         inst->U.I.SrcReg[1].Index = tmpreg;
859                 }
860         }
861
862         return 1;
863 }
864
865 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
866 {
867         struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
868         int i;
869
870         for(i = 0; i < 32; ++i) {
871                 if ((compiler->RequiredOutputs & (1 << i)) &&
872                     !(compiler->Base.Program.OutputsWritten & (1 << i))) {
873                         struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
874                         inst->U.I.Opcode = RC_OPCODE_MOV;
875
876                         inst->U.I.DstReg.File = RC_FILE_OUTPUT;
877                         inst->U.I.DstReg.Index = i;
878                         inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
879
880                         inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
881                         inst->U.I.SrcReg[0].Index = 0;
882                         inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
883
884                         compiler->Base.Program.OutputsWritten |= 1 << i;
885                 }
886         }
887 }
888
889 static void dataflow_outputs_mark_used(void * userdata, void * data,
890                 void (*callback)(void *, unsigned int, unsigned int))
891 {
892         struct r300_vertex_program_compiler * c = userdata;
893         int i;
894
895         for(i = 0; i < 32; ++i) {
896                 if (c->RequiredOutputs & (1 << i))
897                         callback(data, i, RC_MASK_XYZW);
898         }
899 }
900
901 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
902 {
903         (void) opcode;
904         (void) reg;
905
906         return 1;
907 }
908
909 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
910                                           struct rc_instruction *arl,
911                                           struct rc_instruction *end,
912                                           int min_offset)
913 {
914         struct rc_instruction *inst, *add;
915         unsigned const_swizzle;
916
917         /* Transform ARL */
918         add = rc_insert_new_instruction(&c->Base, arl->Prev);
919         add->U.I.Opcode = RC_OPCODE_ADD;
920         add->U.I.DstReg.File = RC_FILE_TEMPORARY;
921         add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
922         add->U.I.DstReg.WriteMask = RC_MASK_X;
923         add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
924         add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
925         add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
926                                                                      min_offset, &const_swizzle);
927         add->U.I.SrcReg[1].Swizzle = const_swizzle;
928
929         arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
930         arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
931         arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
932
933         /* Rewrite offsets up to and excluding inst. */
934         for (inst = arl->Next; inst != end; inst = inst->Next) {
935                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
936
937                 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
938                         if (inst->U.I.SrcReg[i].RelAddr)
939                                 inst->U.I.SrcReg[i].Index -= min_offset;
940         }
941 }
942
943 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
944 {
945         struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
946         struct rc_instruction *inst, *lastARL = NULL;
947         int min_offset = 0;
948
949         for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
950                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
951
952                 if (inst->U.I.Opcode == RC_OPCODE_ARL) {
953                         if (lastARL != NULL && min_offset < 0)
954                                 transform_negative_addressing(c, lastARL, inst, min_offset);
955
956                         lastARL = inst;
957                         min_offset = 0;
958                         continue;
959                 }
960
961                 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
962                         if (inst->U.I.SrcReg[i].RelAddr &&
963                             inst->U.I.SrcReg[i].Index < 0) {
964                                 /* ARL must precede any indirect addressing. */
965                                 if (lastARL == NULL) {
966                                         rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
967                                         return;
968                                 }
969
970                                 if (inst->U.I.SrcReg[i].Index < min_offset)
971                                         min_offset = inst->U.I.SrcReg[i].Index;
972                         }
973                 }
974         }
975
976         if (lastARL != NULL && min_offset < 0)
977                 transform_negative_addressing(c, lastARL, inst, min_offset);
978 }
979
980 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
981         .IsNative = &swizzle_is_native,
982         .Split = 0 /* should never be called */
983 };
984
985 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
986 {
987         int is_r500 = c->Base.is_r500;
988         int opt = !c->Base.disable_optimizations;
989
990         /* Lists of instruction transformations. */
991         struct radeon_program_transformation alu_rewrite_r500[] = {
992                 { &r300_transform_vertex_alu, 0 },
993                 { &r300_transform_trig_scale_vertex, 0 },
994                 { 0, 0 }
995         };
996
997         struct radeon_program_transformation alu_rewrite_r300[] = {
998                 { &r300_transform_vertex_alu, 0 },
999                 { &r300_transform_trig_simple, 0 },
1000                 { 0, 0 }
1001         };
1002
1003         /* Note: These passes have to be done seperately from ALU rewrite,
1004          * otherwise non-native ALU instructions with source conflits
1005          * or non-native modifiers will not be treated properly.
1006          */
1007         struct radeon_program_transformation emulate_modifiers[] = {
1008                 { &transform_nonnative_modifiers, 0 },
1009                 { 0, 0 }
1010         };
1011
1012         struct radeon_program_transformation resolve_src_conflicts[] = {
1013                 { &transform_source_conflicts, 0 },
1014                 { 0, 0 }
1015         };
1016
1017         /* List of compiler passes. */
1018         struct radeon_compiler_pass vs_list[] = {
1019                 /* NAME                         DUMP PREDICATE  FUNCTION                        PARAM */
1020                 {"add artificial outputs",      0, 1,           rc_vs_add_artificial_outputs,   NULL},
1021                 {"transform loops",             1, 1,           rc_transform_loops,             NULL},
1022                 {"emulate branches",            1, !is_r500,    rc_emulate_branches,            NULL},
1023                 {"emulate negative addressing", 1, 1,           rc_emulate_negative_addressing, NULL},
1024                 {"native rewrite",              1, is_r500,     rc_local_transform,             alu_rewrite_r500},
1025                 {"native rewrite",              1, !is_r500,    rc_local_transform,             alu_rewrite_r300},
1026                 {"emulate modifiers",           1, !is_r500,    rc_local_transform,             emulate_modifiers},
1027                 {"deadcode",                    1, opt,         rc_dataflow_deadcode,           dataflow_outputs_mark_used},
1028                 {"dataflow optimize",           1, opt,         rc_optimize,                    NULL},
1029                 /* This pass must be done after optimizations. */
1030                 {"source conflict resolve",     1, 1,           rc_local_transform,             resolve_src_conflicts},
1031                 {"register allocation",         1, opt,         allocate_temporary_registers,   NULL},
1032                 {"dead constants",              1, 1,           rc_remove_unused_constants,     &c->code->constants_remap_table},
1033                 {"final code validation",       0, 1,           rc_validate_final_shader,       NULL},
1034                 {"machine code generation",     0, 1,           translate_vertex_program,       NULL},
1035                 {"dump machine code",           0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,        NULL},
1036                 {NULL, 0, 0, NULL, NULL}
1037         };
1038
1039         c->Base.type = RC_VERTEX_PROGRAM;
1040         c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
1041
1042         rc_run_compiler(&c->Base, vs_list);
1043
1044         c->code->InputsRead = c->Base.Program.InputsRead;
1045         c->code->OutputsWritten = c->Base.Program.OutputsWritten;
1046         rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
1047 }