Tizen 2.0 Release
[profile/ivi/osmesa.git] / src / mesa / drivers / dri / r300 / compiler / r3xx_vertprog.c
1 /*
2  * Copyright 2009 Nicolai Hähnle <nhaehnle@gmail.com>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * on the rights to use, copy, modify, merge, publish, distribute, sub
8  * license, and/or sell copies of the Software, and to permit persons to whom
9  * the Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
22
23 #include "radeon_compiler.h"
24
25 #include <stdio.h>
26
27 #include "../r300_reg.h"
28
29 #include "radeon_compiler_util.h"
30 #include "radeon_dataflow.h"
31 #include "radeon_program_alu.h"
32 #include "radeon_swizzle.h"
33 #include "radeon_emulate_branches.h"
34 #include "radeon_emulate_loops.h"
35 #include "radeon_remove_constants.h"
36
/* Bookkeeping for one loop that is open while the program is translated. */
struct loop {
        /* Set when BGNLOOP is seen: the emitted code length divided by 4. */
        int BgnLoop;
};
41
/*
 * Build a source operand that reads a constant (ZERO or ONE) swizzle.
 *
 * Source slot x of the current instruction, which must already be set up and
 * valid, supplies the register index, the register class and the RelAddr
 * flag (shifted to bit 4).  All four components select the swizzle y and no
 * component is negated.
 *
 * The expansion refers to `vp` and `vpi`, so both names must be in scope
 * wherever the macro is used.
 */
#define __CONST(x, y) \
        (PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[x]), \
                         t_swizzle(y), \
                         t_swizzle(y), \
                         t_swizzle(y), \
                         t_swizzle(y), \
                         t_src_class(vpi->SrcReg[x].File), \
                         RC_MASK_NONE) | \
         (vpi->SrcReg[x].RelAddr << 4))
54
55
/* Convert a destination write mask into the value placed in the hardware word. */
static unsigned long t_dst_mask(unsigned int mask)
{
        /* RC_MASK_* and VSF_FLAG_* are equivalent, so only clip to XYZW. */
        const unsigned long hw_mask = mask & RC_MASK_XYZW;

        return hw_mask;
}
61
/*
 * Map a compiler register file to the hardware destination register class.
 *
 * Output and address registers have their own classes.  Every other file is
 * treated as a temporary; a file that is not actually RC_FILE_TEMPORARY is
 * reported on stderr first.
 */
static unsigned long t_dst_class(rc_register_file file)
{
        if (file == RC_FILE_OUTPUT)
                return PVS_DST_REG_OUT;

        if (file == RC_FILE_ADDRESS)
                return PVS_DST_REG_A0;

        if (file != RC_FILE_TEMPORARY)
                fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

        return PVS_DST_REG_TEMPORARY;
}
76
/*
 * Return the hardware index of a destination register.
 *
 * Output registers are looked up in vp->outputs[]; every other file uses the
 * register's own index unchanged.
 */
static unsigned long t_dst_index(struct r300_vertex_program_code *vp,
                                 struct rc_dst_register *dst)
{
        unsigned long hw_index;

        if (dst->File == RC_FILE_OUTPUT)
                hw_index = vp->outputs[dst->Index];
        else
                hw_index = dst->Index;

        return hw_index;
}
85
/*
 * Map a compiler register file to the hardware source register class.
 *
 * Inputs and constants have their own classes.  RC_FILE_NONE and
 * RC_FILE_TEMPORARY map to the temporary class, and so does any other file
 * after it has been reported on stderr.
 */
static unsigned long t_src_class(rc_register_file file)
{
        if (file == RC_FILE_INPUT)
                return PVS_SRC_REG_INPUT;

        if (file == RC_FILE_CONSTANT)
                return PVS_SRC_REG_CONSTANT;

        if (file != RC_FILE_NONE && file != RC_FILE_TEMPORARY)
                fprintf(stderr, "%s: Bad register file %i\n", __FUNCTION__, file);

        return PVS_SRC_REG_TEMPORARY;
}
101
102 static int t_src_conflict(struct rc_src_register a, struct rc_src_register b)
103 {
104         unsigned long aclass = t_src_class(a.File);
105         unsigned long bclass = t_src_class(b.File);
106
107         if (aclass != bclass)
108                 return 0;
109         if (aclass == PVS_SRC_REG_TEMPORARY)
110                 return 0;
111
112         if (a.RelAddr || b.RelAddr)
113                 return 1;
114         if (a.Index != b.Index)
115                 return 1;
116
117         return 0;
118 }
119
/*
 * Translate a swizzle selector to its hardware encoding.
 *
 * The Mesa RC_SWIZZLE_* values are all identical to VSF_IN_COMPONENT_*, so
 * the value is passed through unchanged.
 */
static inline unsigned long t_swizzle(unsigned int swizzle)
{
        const unsigned long hw_swizzle = swizzle;

        return hw_swizzle;
}
125
126 static unsigned long t_src_index(struct r300_vertex_program_code *vp,
127                                  struct rc_src_register *src)
128 {
129         if (src->File == RC_FILE_INPUT) {
130                 assert(vp->inputs[src->Index] != -1);
131                 return vp->inputs[src->Index];
132         } else {
133                 if (src->Index < 0) {
134                         fprintf(stderr,
135                                 "negative offsets for indirect addressing do not work.\n");
136                         return 0;
137                 }
138                 return src->Index;
139         }
140 }
141
142 /* these two functions should probably be merged... */
143
144 static unsigned long t_src(struct r300_vertex_program_code *vp,
145                            struct rc_src_register *src)
146 {
147         /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
148          * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
149          */
150         return PVS_SRC_OPERAND(t_src_index(vp, src),
151                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
152                                t_swizzle(GET_SWZ(src->Swizzle, 1)),
153                                t_swizzle(GET_SWZ(src->Swizzle, 2)),
154                                t_swizzle(GET_SWZ(src->Swizzle, 3)),
155                                t_src_class(src->File),
156                                src->Negate) |
157                (src->RelAddr << 4) | (src->Abs << 3);
158 }
159
160 static unsigned long t_src_scalar(struct r300_vertex_program_code *vp,
161                                   struct rc_src_register *src)
162 {
163         /* src->Negate uses the RC_MASK_ flags from program_instruction.h,
164          * which equal our VSF_FLAGS_ values, so it's safe to just pass it here.
165          */
166         return PVS_SRC_OPERAND(t_src_index(vp, src),
167                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
168                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
169                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
170                                t_swizzle(GET_SWZ(src->Swizzle, 0)),
171                                t_src_class(src->File),
172                                src->Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
173                (src->RelAddr << 4) | (src->Abs << 3);
174 }
175
176 static int valid_dst(struct r300_vertex_program_code *vp,
177                            struct rc_dst_register *dst)
178 {
179         if (dst->File == RC_FILE_OUTPUT && vp->outputs[dst->Index] == -1) {
180                 return 0;
181         } else if (dst->File == RC_FILE_ADDRESS) {
182                 assert(dst->Index == 0);
183         }
184
185         return 1;
186 }
187
188 static void ei_vector1(struct r300_vertex_program_code *vp,
189                                 unsigned int hw_opcode,
190                                 struct rc_sub_instruction *vpi,
191                                 unsigned int * inst)
192 {
193         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
194                                      0,
195                                      0,
196                                      t_dst_index(vp, &vpi->DstReg),
197                                      t_dst_mask(vpi->DstReg.WriteMask),
198                                      t_dst_class(vpi->DstReg.File));
199         inst[1] = t_src(vp, &vpi->SrcReg[0]);
200         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
201         inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
202 }
203
204 static void ei_vector2(struct r300_vertex_program_code *vp,
205                                 unsigned int hw_opcode,
206                                 struct rc_sub_instruction *vpi,
207                                 unsigned int * inst)
208 {
209         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
210                                      0,
211                                      0,
212                                      t_dst_index(vp, &vpi->DstReg),
213                                      t_dst_mask(vpi->DstReg.WriteMask),
214                                      t_dst_class(vpi->DstReg.File));
215         inst[1] = t_src(vp, &vpi->SrcReg[0]);
216         inst[2] = t_src(vp, &vpi->SrcReg[1]);
217         inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
218 }
219
220 static void ei_math1(struct r300_vertex_program_code *vp,
221                                 unsigned int hw_opcode,
222                                 struct rc_sub_instruction *vpi,
223                                 unsigned int * inst)
224 {
225         inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
226                                      1,
227                                      0,
228                                      t_dst_index(vp, &vpi->DstReg),
229                                      t_dst_mask(vpi->DstReg.WriteMask),
230                                      t_dst_class(vpi->DstReg.File));
231         inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
232         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
233         inst[3] = __CONST(0, RC_SWIZZLE_ZERO);
234 }
235
236 static void ei_lit(struct r300_vertex_program_code *vp,
237                                       struct rc_sub_instruction *vpi,
238                                       unsigned int * inst)
239 {
240         //LIT TMP 1.Y Z TMP 1{} {X W Z Y} TMP 1{} {Y W Z X} TMP 1{} {Y X Z W}
241
242         inst[0] = PVS_OP_DST_OPERAND(ME_LIGHT_COEFF_DX,
243                                      1,
244                                      0,
245                                      t_dst_index(vp, &vpi->DstReg),
246                                      t_dst_mask(vpi->DstReg.WriteMask),
247                                      t_dst_class(vpi->DstReg.File));
248         /* NOTE: Users swizzling might not work. */
249         inst[1] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),      // X
250                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
251                                   PVS_SRC_SELECT_FORCE_0,       // Z
252                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),        // Y
253                                   t_src_class(vpi->SrcReg[0].File),
254                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
255             (vpi->SrcReg[0].RelAddr << 4);
256         inst[2] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
257                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
258                                   PVS_SRC_SELECT_FORCE_0,       // Z
259                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
260                                   t_src_class(vpi->SrcReg[0].File),
261                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
262             (vpi->SrcReg[0].RelAddr << 4);
263         inst[3] = PVS_SRC_OPERAND(t_src_index(vp, &vpi->SrcReg[0]), t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 1)),      // Y
264                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 0)),        // X
265                                   PVS_SRC_SELECT_FORCE_0,       // Z
266                                   t_swizzle(GET_SWZ(vpi->SrcReg[0].Swizzle, 3)),        // W
267                                   t_src_class(vpi->SrcReg[0].File),
268                                   vpi->SrcReg[0].Negate ? RC_MASK_XYZW : RC_MASK_NONE) |
269             (vpi->SrcReg[0].RelAddr << 4);
270 }
271
272 static void ei_mad(struct r300_vertex_program_code *vp,
273                                       struct rc_sub_instruction *vpi,
274                                       unsigned int * inst)
275 {
276         unsigned int i;
277         /* Remarks about hardware limitations of MAD
278          * (please preserve this comment, as this information is _NOT_
279          * in the documentation provided by AMD).
280          *
281          * As described in the documentation, MAD with three unique temporary
282          * source registers requires the use of the macro version.
283          *
284          * However (and this is not mentioned in the documentation), apparently
285          * the macro version is _NOT_ a full superset of the normal version.
286          * In particular, the macro version does not always work when relative
287          * addressing is used in the source operands.
288          *
289          * This limitation caused incorrect rendering in Sauerbraten's OpenGL
290          * assembly shader path when using medium quality animations
291          * (i.e. animations with matrix blending instead of quaternion blending).
292          *
293          * Unfortunately, I (nha) have been unable to extract a Piglit regression
294          * test for this issue - for some reason, it is possible to have vertex
295          * programs whose prefix is *exactly* the same as the prefix of the
296          * offending program in Sauerbraten up to the offending instruction
297          * without causing any trouble.
298          *
299          * Bottom line: Only use the macro version only when really necessary;
300          * according to AMD docs, this should improve performance by one clock
301          * as a nice side bonus.
302          */
303         if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
304             vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
305             vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
306             vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
307             vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
308             vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
309                 inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
310                                 0,
311                                 1,
312                                 t_dst_index(vp, &vpi->DstReg),
313                                 t_dst_mask(vpi->DstReg.WriteMask),
314                                 t_dst_class(vpi->DstReg.File));
315         } else {
316                 inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
317                                 0,
318                                 0,
319                                 t_dst_index(vp, &vpi->DstReg),
320                                 t_dst_mask(vpi->DstReg.WriteMask),
321                                 t_dst_class(vpi->DstReg.File));
322
323                 /* Arguments with constant swizzles still count as a unique
324                  * temporary, so we should make sure these arguments share a
325                  * register index with one of the other arguments. */
326                 for (i = 0; i < 3; i++) {
327                         unsigned int j;
328                         if (vpi->SrcReg[i].File != RC_FILE_NONE)
329                                 continue;
330
331                         for (j = 0; j < 3; j++) {
332                                 if (i != j) {
333                                         vpi->SrcReg[i].Index =
334                                                 vpi->SrcReg[j].Index;
335                                         break;
336                                 }
337                         }
338                 }
339         }
340         inst[1] = t_src(vp, &vpi->SrcReg[0]);
341         inst[2] = t_src(vp, &vpi->SrcReg[1]);
342         inst[3] = t_src(vp, &vpi->SrcReg[2]);
343 }
344
345 static void ei_pow(struct r300_vertex_program_code *vp,
346                                       struct rc_sub_instruction *vpi,
347                                       unsigned int * inst)
348 {
349         inst[0] = PVS_OP_DST_OPERAND(ME_POWER_FUNC_FF,
350                                      1,
351                                      0,
352                                      t_dst_index(vp, &vpi->DstReg),
353                                      t_dst_mask(vpi->DstReg.WriteMask),
354                                      t_dst_class(vpi->DstReg.File));
355         inst[1] = t_src_scalar(vp, &vpi->SrcReg[0]);
356         inst[2] = __CONST(0, RC_SWIZZLE_ZERO);
357         inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
358 }
359
360 static void mark_write(void * userdata, struct rc_instruction * inst,
361                 rc_register_file file,  unsigned int index, unsigned int mask)
362 {
363         unsigned int * writemasks = userdata;
364
365         if (file != RC_FILE_TEMPORARY)
366                 return;
367
368         if (index >= R300_VS_MAX_TEMPS)
369                 return;
370
371         writemasks[index] |= mask;
372 }
373
374 static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
375 {
376         return PVS_SRC_OPERAND(compiler->PredicateIndex,
377                 t_swizzle(RC_SWIZZLE_ZERO),
378                 t_swizzle(RC_SWIZZLE_ZERO),
379                 t_swizzle(RC_SWIZZLE_ZERO),
380                 t_swizzle(RC_SWIZZLE_W),
381                 t_src_class(RC_FILE_TEMPORARY),
382                 0);
383 }
384
385 static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
386                                         unsigned int hw_opcode, int is_math)
387 {
388         return PVS_OP_DST_OPERAND(hw_opcode,
389              is_math,
390              0,
391              compiler->PredicateIndex,
392              RC_MASK_W,
393              t_dst_class(RC_FILE_TEMPORARY));
394
395 }
396
397 static void ei_if(struct r300_vertex_program_compiler * compiler,
398                                         struct rc_instruction *rci,
399                                         unsigned int * inst,
400                                         unsigned int branch_depth)
401 {
402         unsigned int predicate_opcode;
403         int is_math = 0;
404
405         if (!compiler->Base.is_r500) {
406                 rc_error(&compiler->Base,"Opcode IF not supported\n");
407                 return;
408         }
409
410         /* Reserve a temporary to use as our predicate stack counter, if we
411          * don't already have one. */
412         if (!compiler->PredicateMask) {
413                 unsigned int writemasks[RC_REGISTER_MAX_INDEX];
414                 struct rc_instruction * inst;
415                 unsigned int i;
416                 memset(writemasks, 0, sizeof(writemasks));
417                 for(inst = compiler->Base.Program.Instructions.Next;
418                                 inst != &compiler->Base.Program.Instructions;
419                                                         inst = inst->Next) {
420                         rc_for_all_writes_mask(inst, mark_write, writemasks);
421                 }
422                 for(i = 0; i < compiler->Base.max_temp_regs; i++) {
423                         unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
424                         /* Only the W component can be used fo the predicate
425                          * stack counter. */
426                         if (mask & RC_MASK_W) {
427                                 compiler->PredicateMask = RC_MASK_W;
428                                 compiler->PredicateIndex = i;
429                                 break;
430                         }
431                 }
432                 if (i == compiler->Base.max_temp_regs) {
433                         rc_error(&compiler->Base, "No free temporary to use for"
434                                         " predicate stack counter.\n");
435                         return;
436                 }
437         }
438         predicate_opcode =
439                         branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
440
441         rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
442         if (branch_depth == 0) {
443                 is_math = 1;
444                 predicate_opcode = ME_PRED_SET_NEQ;
445                 inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
446                 inst[2] = 0;
447         } else {
448                 predicate_opcode = VE_PRED_SET_NEQ_PUSH;
449                 inst[1] = t_pred_src(compiler);
450                 inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
451         }
452
453         inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
454         inst[3] = 0;
455
456 }
457
458 static void ei_else(struct r300_vertex_program_compiler * compiler,
459                                                         unsigned int * inst)
460 {
461         if (!compiler->Base.is_r500) {
462                 rc_error(&compiler->Base,"Opcode ELSE not supported\n");
463                 return;
464         }
465         inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
466         inst[1] = t_pred_src(compiler);
467         inst[2] = 0;
468         inst[3] = 0;
469 }
470
471 static void ei_endif(struct r300_vertex_program_compiler *compiler,
472                                                         unsigned int * inst)
473 {
474         if (!compiler->Base.is_r500) {
475                 rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
476                 return;
477         }
478         inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
479         inst[1] = t_pred_src(compiler);
480         inst[2] = 0;
481         inst[3] = 0;
482 }
483
484 static void translate_vertex_program(struct radeon_compiler *c, void *user)
485 {
486         struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
487         struct rc_instruction *rci;
488
489         struct loop * loops = NULL;
490         int current_loop_depth = 0;
491         int loops_reserved = 0;
492
493         unsigned int branch_depth = 0;
494
495         compiler->code->pos_end = 0;    /* Not supported yet */
496         compiler->code->length = 0;
497         compiler->code->num_temporaries = 0;
498
499         compiler->SetHwInputOutput(compiler);
500
501         for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
502                 struct rc_sub_instruction *vpi = &rci->U.I;
503                 unsigned int *inst = compiler->code->body.d + compiler->code->length;
504                 const struct rc_opcode_info *info = rc_get_opcode_info(vpi->Opcode);
505
506                 /* Skip instructions writing to non-existing destination */
507                 if (!valid_dst(compiler->code, &vpi->DstReg))
508                         continue;
509
510                 if (info->HasDstReg) {
511                         /* Neither is Saturate. */
512                         if (vpi->SaturateMode != RC_SATURATE_NONE) {
513                                 rc_error(&compiler->Base, "Vertex program does not support the Saturate "
514                                          "modifier (yet).\n");
515                         }
516                 }
517
518                 if (compiler->code->length >= c->max_alu_insts * 4) {
519                         rc_error(&compiler->Base, "Vertex program has too many instructions\n");
520                         return;
521                 }
522
523                 assert(compiler->Base.is_r500 ||
524                        (vpi->Opcode != RC_OPCODE_SEQ &&
525                         vpi->Opcode != RC_OPCODE_SNE));
526
527                 switch (vpi->Opcode) {
528                 case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
529                 case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
530                 case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
531                 case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
532                 case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
533                 case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
534                 case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
535                 case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
536                 case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
537                 case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
538                 case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
539                 case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
540                 case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
541                 case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
542                 case RC_OPCODE_MAD: ei_mad(compiler->code, vpi, inst); break;
543                 case RC_OPCODE_MAX: ei_vector2(compiler->code, VE_MAXIMUM, vpi, inst); break;
544                 case RC_OPCODE_MIN: ei_vector2(compiler->code, VE_MINIMUM, vpi, inst); break;
545                 case RC_OPCODE_MOV: ei_vector1(compiler->code, VE_ADD, vpi, inst); break;
546                 case RC_OPCODE_MUL: ei_vector2(compiler->code, VE_MULTIPLY, vpi, inst); break;
547                 case RC_OPCODE_POW: ei_pow(compiler->code, vpi, inst); break;
548                 case RC_OPCODE_RCP: ei_math1(compiler->code, ME_RECIP_DX, vpi, inst); break;
549                 case RC_OPCODE_RSQ: ei_math1(compiler->code, ME_RECIP_SQRT_DX, vpi, inst); break;
550                 case RC_OPCODE_SEQ: ei_vector2(compiler->code, VE_SET_EQUAL, vpi, inst); break;
551                 case RC_OPCODE_SGE: ei_vector2(compiler->code, VE_SET_GREATER_THAN_EQUAL, vpi, inst); break;
552                 case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
553                 case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
554                 case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
555                 case RC_OPCODE_BGNLOOP:
556                 {
557                         struct loop * l;
558
559                         if ((!compiler->Base.is_r500
560                                 && loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
561                                 || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
562                                 rc_error(&compiler->Base,
563                                                 "Loops are nested too deep.");
564                                 return;
565                         }
566                         memory_pool_array_reserve(&compiler->Base.Pool,
567                                         struct loop, loops, current_loop_depth,
568                                         loops_reserved, 1);
569                         l = &loops[current_loop_depth++];
570                         memset(l , 0, sizeof(struct loop));
571                         l->BgnLoop = (compiler->code->length / 4);
572                         continue;
573                 }
574                 case RC_OPCODE_ENDLOOP:
575                 {
576                         struct loop * l;
577                         unsigned int act_addr;
578                         unsigned int last_addr;
579                         unsigned int ret_addr;
580
581                         assert(loops);
582                         l = &loops[current_loop_depth - 1];
583                         act_addr = l->BgnLoop - 1;
584                         last_addr = (compiler->code->length / 4) - 1;
585                         ret_addr = l->BgnLoop;
586
587                         if (loops_reserved >= R300_VS_MAX_FC_OPS) {
588                                 rc_error(&compiler->Base,
589                                         "Too many flow control instructions.");
590                                 return;
591                         }
592                         if (compiler->Base.is_r500) {
593                                 compiler->code->fc_op_addrs.r500
594                                         [compiler->code->num_fc_ops].lw =
595                                         R500_PVS_FC_ACT_ADRS(act_addr)
596                                         | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
597                                         ;
598                                 compiler->code->fc_op_addrs.r500
599                                         [compiler->code->num_fc_ops].uw =
600                                         R500_PVS_FC_LAST_INST(last_addr)
601                                         | R500_PVS_FC_RTN_INST(ret_addr)
602                                         ;
603                         } else {
604                                 compiler->code->fc_op_addrs.r300
605                                         [compiler->code->num_fc_ops] =
606                                         R300_PVS_FC_ACT_ADRS(act_addr)
607                                         | R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
608                                         | R300_PVS_FC_LAST_INST(last_addr)
609                                         | R300_PVS_FC_RTN_INST(ret_addr)
610                                         ;
611                         }
612                         compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
613                                 R300_PVS_FC_LOOP_INIT_VAL(0x0)
614                                 | R300_PVS_FC_LOOP_STEP_VAL(0x1)
615                                 ;
616                         compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
617                                                 compiler->code->num_fc_ops);
618                         compiler->code->num_fc_ops++;
619                         current_loop_depth--;
620                         continue;
621                 }
622
623                 default:
624                         rc_error(&compiler->Base, "Unknown opcode %s\n", info->Name);
625                         return;
626                 }
627
628                 /* Non-flow control instructions that are inside an if statement
629                  * need to pay attention to the predicate bit. */
630                 if (branch_depth
631                         && vpi->Opcode != RC_OPCODE_IF
632                         && vpi->Opcode != RC_OPCODE_ELSE
633                         && vpi->Opcode != RC_OPCODE_ENDIF) {
634
635                         inst[0] |= (PVS_DST_PRED_ENABLE_MASK
636                                                 << PVS_DST_PRED_ENABLE_SHIFT);
637                         inst[0] |= (PVS_DST_PRED_SENSE_MASK
638                                                 << PVS_DST_PRED_SENSE_SHIFT);
639                 }
640
641                 /* Update the number of temporaries. */
642                 if (info->HasDstReg && vpi->DstReg.File == RC_FILE_TEMPORARY &&
643                     vpi->DstReg.Index >= compiler->code->num_temporaries)
644                         compiler->code->num_temporaries = vpi->DstReg.Index + 1;
645
646                 for (unsigned i = 0; i < info->NumSrcRegs; i++)
647                         if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
648                             vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
649                                 compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
650
651                 if (compiler->PredicateMask)
652                         if (compiler->PredicateIndex >= compiler->code->num_temporaries)
653                                 compiler->code->num_temporaries = compiler->PredicateIndex + 1;
654
655                 if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
656                         rc_error(&compiler->Base, "Too many temporaries.\n");
657                         return;
658                 }
659
660                 compiler->code->length += 4;
661
662                 if (compiler->Base.Error)
663                         return;
664         }
665 }
666
/**
 * Bookkeeping kept for each original temporary while
 * allocate_temporary_registers() remaps it to a hardware temporary.
 */
struct temporary_allocation {
        /* Non-zero once a hardware temporary has been assigned. */
        unsigned int Allocated : 1;
        /* Index of the assigned hardware temporary. */
        unsigned int HwTemp : 15;
        /* Instruction recorded as the last reader of this temporary. */
        struct rc_instruction *LastRead;
};
672
673 static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
674 {
675         struct r300_vertex_program_compiler *compiler = (struct r300_vertex_program_compiler*)c;
676         struct rc_instruction *inst;
677         struct rc_instruction *end_loop = NULL;
678         unsigned int num_orig_temps = 0;
679         char hwtemps[RC_REGISTER_MAX_INDEX];
680         struct temporary_allocation * ta;
681         unsigned int i, j;
682
683         memset(hwtemps, 0, sizeof(hwtemps));
684
685         rc_recompute_ips(c);
686
687         /* Pass 1: Count original temporaries. */
688         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
689                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
690
691                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
692                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
693                                 if (inst->U.I.SrcReg[i].Index >= num_orig_temps)
694                                         num_orig_temps = inst->U.I.SrcReg[i].Index + 1;
695                         }
696                 }
697
698                 if (opcode->HasDstReg) {
699                         if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
700                                 if (inst->U.I.DstReg.Index >= num_orig_temps)
701                                         num_orig_temps = inst->U.I.DstReg.Index + 1;
702                         }
703                 }
704         }
705
706         ta = (struct temporary_allocation*)memory_pool_malloc(&compiler->Base.Pool,
707                         sizeof(struct temporary_allocation) * num_orig_temps);
708         memset(ta, 0, sizeof(struct temporary_allocation) * num_orig_temps);
709
710         /* Pass 2: Determine original temporary lifetimes */
711         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
712                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
713                 /* Instructions inside of loops need to use the ENDLOOP
714                  * instruction as their LastRead. */
715                 if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
716                         int endloops = 1;
717                         struct rc_instruction * ptr;
718                         for(ptr = inst->Next;
719                                 ptr != &compiler->Base.Program.Instructions;
720                                                         ptr = ptr->Next){
721                                 if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
722                                         endloops++;
723                                 } else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
724                                         endloops--;
725                                         if (endloops <= 0) {
726                                                 end_loop = ptr;
727                                                 break;
728                                         }
729                                 }
730                         }
731                 }
732
733                 if (inst == end_loop) {
734                         end_loop = NULL;
735                         continue;
736                 }
737
738                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
739                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
740                                 ta[inst->U.I.SrcReg[i].Index].LastRead = end_loop ? end_loop : inst;
741                         }
742                 }
743         }
744
745         /* Pass 3: Register allocation */
746         for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
747                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
748
749                 for (i = 0; i < opcode->NumSrcRegs; ++i) {
750                         if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
751                                 unsigned int orig = inst->U.I.SrcReg[i].Index;
752                                 inst->U.I.SrcReg[i].Index = ta[orig].HwTemp;
753
754                                 if (ta[orig].Allocated && inst == ta[orig].LastRead)
755                                         hwtemps[ta[orig].HwTemp] = 0;
756                         }
757                 }
758
759                 if (opcode->HasDstReg) {
760                         if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY) {
761                                 unsigned int orig = inst->U.I.DstReg.Index;
762
763                                 if (!ta[orig].Allocated) {
764                                         for(j = 0; j < c->max_temp_regs; ++j) {
765                                                 if (!hwtemps[j])
766                                                         break;
767                                         }
768                                         ta[orig].Allocated = 1;
769                                         ta[orig].HwTemp = j;
770                                         hwtemps[ta[orig].HwTemp] = 1;
771                                 }
772
773                                 inst->U.I.DstReg.Index = ta[orig].HwTemp;
774                         }
775                 }
776         }
777 }
778
779 /**
780  * R3xx-R4xx vertex engine does not support the Absolute source operand modifier
781  * and the Saturate opcode modifier. Only Absolute is currently transformed.
782  */
783 static int transform_nonnative_modifiers(
784         struct radeon_compiler *c,
785         struct rc_instruction *inst,
786         void* unused)
787 {
788         const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->U.I.Opcode);
789         unsigned i;
790
791         /* Transform ABS(a) to MAX(a, -a). */
792         for (i = 0; i < opcode->NumSrcRegs; i++) {
793                 if (inst->U.I.SrcReg[i].Abs) {
794                         struct rc_instruction *new_inst;
795                         unsigned temp;
796
797                         inst->U.I.SrcReg[i].Abs = 0;
798
799                         temp = rc_find_free_temporary(c);
800
801                         new_inst = rc_insert_new_instruction(c, inst->Prev);
802                         new_inst->U.I.Opcode = RC_OPCODE_MAX;
803                         new_inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
804                         new_inst->U.I.DstReg.Index = temp;
805                         new_inst->U.I.SrcReg[0] = inst->U.I.SrcReg[i];
806                         new_inst->U.I.SrcReg[1] = inst->U.I.SrcReg[i];
807                         new_inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
808
809                         memset(&inst->U.I.SrcReg[i], 0, sizeof(inst->U.I.SrcReg[i]));
810                         inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
811                         inst->U.I.SrcReg[i].Index = temp;
812                         inst->U.I.SrcReg[i].Swizzle = RC_SWIZZLE_XYZW;
813                 }
814         }
815         return 1;
816 }
817
818 /**
819  * Vertex engine cannot read two inputs or two constants at the same time.
820  * Introduce intermediate MOVs to temporary registers to account for this.
821  */
822 static int transform_source_conflicts(
823         struct radeon_compiler *c,
824         struct rc_instruction* inst,
825         void* unused)
826 {
827         const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
828
829         if (opcode->NumSrcRegs == 3) {
830                 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[2])
831                     || t_src_conflict(inst->U.I.SrcReg[0], inst->U.I.SrcReg[2])) {
832                         int tmpreg = rc_find_free_temporary(c);
833                         struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
834                         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
835                         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
836                         inst_mov->U.I.DstReg.Index = tmpreg;
837                         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
838
839                         reset_srcreg(&inst->U.I.SrcReg[2]);
840                         inst->U.I.SrcReg[2].File = RC_FILE_TEMPORARY;
841                         inst->U.I.SrcReg[2].Index = tmpreg;
842                 }
843         }
844
845         if (opcode->NumSrcRegs >= 2) {
846                 if (t_src_conflict(inst->U.I.SrcReg[1], inst->U.I.SrcReg[0])) {
847                         int tmpreg = rc_find_free_temporary(c);
848                         struct rc_instruction * inst_mov = rc_insert_new_instruction(c, inst->Prev);
849                         inst_mov->U.I.Opcode = RC_OPCODE_MOV;
850                         inst_mov->U.I.DstReg.File = RC_FILE_TEMPORARY;
851                         inst_mov->U.I.DstReg.Index = tmpreg;
852                         inst_mov->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
853
854                         reset_srcreg(&inst->U.I.SrcReg[1]);
855                         inst->U.I.SrcReg[1].File = RC_FILE_TEMPORARY;
856                         inst->U.I.SrcReg[1].Index = tmpreg;
857                 }
858         }
859
860         return 1;
861 }
862
863 static void rc_vs_add_artificial_outputs(struct radeon_compiler *c, void *user)
864 {
865         struct r300_vertex_program_compiler * compiler = (struct r300_vertex_program_compiler*)c;
866         int i;
867
868         for(i = 0; i < 32; ++i) {
869                 if ((compiler->RequiredOutputs & (1 << i)) &&
870                     !(compiler->Base.Program.OutputsWritten & (1 << i))) {
871                         struct rc_instruction * inst = rc_insert_new_instruction(&compiler->Base, compiler->Base.Program.Instructions.Prev);
872                         inst->U.I.Opcode = RC_OPCODE_MOV;
873
874                         inst->U.I.DstReg.File = RC_FILE_OUTPUT;
875                         inst->U.I.DstReg.Index = i;
876                         inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
877
878                         inst->U.I.SrcReg[0].File = RC_FILE_CONSTANT;
879                         inst->U.I.SrcReg[0].Index = 0;
880                         inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
881
882                         compiler->Base.Program.OutputsWritten |= 1 << i;
883                 }
884         }
885 }
886
887 static void dataflow_outputs_mark_used(void * userdata, void * data,
888                 void (*callback)(void *, unsigned int, unsigned int))
889 {
890         struct r300_vertex_program_compiler * c = userdata;
891         int i;
892
893         for(i = 0; i < 32; ++i) {
894                 if (c->RequiredOutputs & (1 << i))
895                         callback(data, i, RC_MASK_XYZW);
896         }
897 }
898
899 static int swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
900 {
901         (void) opcode;
902         (void) reg;
903
904         return 1;
905 }
906
907 static void transform_negative_addressing(struct r300_vertex_program_compiler *c,
908                                           struct rc_instruction *arl,
909                                           struct rc_instruction *end,
910                                           int min_offset)
911 {
912         struct rc_instruction *inst, *add;
913         unsigned const_swizzle;
914
915         /* Transform ARL */
916         add = rc_insert_new_instruction(&c->Base, arl->Prev);
917         add->U.I.Opcode = RC_OPCODE_ADD;
918         add->U.I.DstReg.File = RC_FILE_TEMPORARY;
919         add->U.I.DstReg.Index = rc_find_free_temporary(&c->Base);
920         add->U.I.DstReg.WriteMask = RC_MASK_X;
921         add->U.I.SrcReg[0] = arl->U.I.SrcReg[0];
922         add->U.I.SrcReg[1].File = RC_FILE_CONSTANT;
923         add->U.I.SrcReg[1].Index = rc_constants_add_immediate_scalar(&c->Base.Program.Constants,
924                                                                      min_offset, &const_swizzle);
925         add->U.I.SrcReg[1].Swizzle = const_swizzle;
926
927         arl->U.I.SrcReg[0].File = RC_FILE_TEMPORARY;
928         arl->U.I.SrcReg[0].Index = add->U.I.DstReg.Index;
929         arl->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XXXX;
930
931         /* Rewrite offsets up to and excluding inst. */
932         for (inst = arl->Next; inst != end; inst = inst->Next) {
933                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
934
935                 for (unsigned i = 0; i < opcode->NumSrcRegs; i++)
936                         if (inst->U.I.SrcReg[i].RelAddr)
937                                 inst->U.I.SrcReg[i].Index -= min_offset;
938         }
939 }
940
941 static void rc_emulate_negative_addressing(struct radeon_compiler *compiler, void *user)
942 {
943         struct r300_vertex_program_compiler * c = (struct r300_vertex_program_compiler*)compiler;
944         struct rc_instruction *inst, *lastARL = NULL;
945         int min_offset = 0;
946
947         for (inst = c->Base.Program.Instructions.Next; inst != &c->Base.Program.Instructions; inst = inst->Next) {
948                 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
949
950                 if (inst->U.I.Opcode == RC_OPCODE_ARL) {
951                         if (lastARL != NULL && min_offset < 0)
952                                 transform_negative_addressing(c, lastARL, inst, min_offset);
953
954                         lastARL = inst;
955                         min_offset = 0;
956                         continue;
957                 }
958
959                 for (unsigned i = 0; i < opcode->NumSrcRegs; i++) {
960                         if (inst->U.I.SrcReg[i].RelAddr &&
961                             inst->U.I.SrcReg[i].Index < 0) {
962                                 /* ARL must precede any indirect addressing. */
963                                 if (lastARL == NULL) {
964                                         rc_error(&c->Base, "Vertex shader: Found relative addressing without ARL.");
965                                         return;
966                                 }
967
968                                 if (inst->U.I.SrcReg[i].Index < min_offset)
969                                         min_offset = inst->U.I.SrcReg[i].Index;
970                         }
971                 }
972         }
973
974         if (lastARL != NULL && min_offset < 0)
975                 transform_negative_addressing(c, lastARL, inst, min_offset);
976 }
977
978 static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
979         .IsNative = &swizzle_is_native,
980         .Split = 0 /* should never be called */
981 };
982
983 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
984 {
985         int is_r500 = c->Base.is_r500;
986         int opt = !c->Base.disable_optimizations;
987
988         /* Lists of instruction transformations. */
989         struct radeon_program_transformation alu_rewrite_r500[] = {
990                 { &r300_transform_vertex_alu, 0 },
991                 { &r300_transform_trig_scale_vertex, 0 },
992                 { 0, 0 }
993         };
994
995         struct radeon_program_transformation alu_rewrite_r300[] = {
996                 { &r300_transform_vertex_alu, 0 },
997                 { &r300_transform_trig_simple, 0 },
998                 { 0, 0 }
999         };
1000
1001         /* Note: These passes have to be done seperately from ALU rewrite,
1002          * otherwise non-native ALU instructions with source conflits
1003          * or non-native modifiers will not be treated properly.
1004          */
1005         struct radeon_program_transformation emulate_modifiers[] = {
1006                 { &transform_nonnative_modifiers, 0 },
1007                 { 0, 0 }
1008         };
1009
1010         struct radeon_program_transformation resolve_src_conflicts[] = {
1011                 { &transform_source_conflicts, 0 },
1012                 { 0, 0 }
1013         };
1014
1015         /* List of compiler passes. */
1016         struct radeon_compiler_pass vs_list[] = {
1017                 /* NAME                         DUMP PREDICATE  FUNCTION                        PARAM */
1018                 {"add artificial outputs",      0, 1,           rc_vs_add_artificial_outputs,   NULL},
1019                 {"transform loops",             1, 1,           rc_transform_loops,             NULL},
1020                 {"emulate branches",            1, !is_r500,    rc_emulate_branches,            NULL},
1021                 {"emulate negative addressing", 1, 1,           rc_emulate_negative_addressing, NULL},
1022                 {"native rewrite",              1, is_r500,     rc_local_transform,             alu_rewrite_r500},
1023                 {"native rewrite",              1, !is_r500,    rc_local_transform,             alu_rewrite_r300},
1024                 {"emulate modifiers",           1, !is_r500,    rc_local_transform,             emulate_modifiers},
1025                 {"deadcode",                    1, opt,         rc_dataflow_deadcode,           dataflow_outputs_mark_used},
1026                 {"dataflow optimize",           1, opt,         rc_optimize,                    NULL},
1027                 /* This pass must be done after optimizations. */
1028                 {"source conflict resolve",     1, 1,           rc_local_transform,             resolve_src_conflicts},
1029                 {"register allocation",         1, opt,         allocate_temporary_registers,   NULL},
1030                 {"dead constants",              1, 1,           rc_remove_unused_constants,     &c->code->constants_remap_table},
1031                 {"final code validation",       0, 1,           rc_validate_final_shader,       NULL},
1032                 {"machine code generation",     0, 1,           translate_vertex_program,       NULL},
1033                 {"dump machine code",           0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,        NULL},
1034                 {NULL, 0, 0, NULL, NULL}
1035         };
1036
1037         c->Base.type = RC_VERTEX_PROGRAM;
1038         c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
1039
1040         rc_run_compiler(&c->Base, vs_list);
1041
1042         c->code->InputsRead = c->Base.Program.InputsRead;
1043         c->code->OutputsWritten = c->Base.Program.OutputsWritten;
1044         rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
1045 }