Tizen 2.1 base
[sdk/emulator/qemu.git] / gl / mesa / src / gallium / drivers / r300 / compiler / radeon_program_alu.c
1 /*
2  * Copyright (C) 2008 Nicolai Haehnle.
3  *
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial
16  * portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  */
27
28 /**
29  * @file
30  *
31  * Shareable transformations that transform "special" ALU instructions
32  * into ALU instructions that are supported by hardware.
33  *
34  */
35
36 #include "radeon_program_alu.h"
37
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40
41
42 static struct rc_instruction *emit1(
43         struct radeon_compiler * c, struct rc_instruction * after,
44         rc_opcode Opcode, struct rc_sub_instruction * base,
45         struct rc_dst_register DstReg, struct rc_src_register SrcReg)
46 {
47         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
49         if (base) {
50                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
51         }
52
53         fpi->U.I.Opcode = Opcode;
54         fpi->U.I.DstReg = DstReg;
55         fpi->U.I.SrcReg[0] = SrcReg;
56         return fpi;
57 }
58
59 static struct rc_instruction *emit2(
60         struct radeon_compiler * c, struct rc_instruction * after,
61         rc_opcode Opcode, struct rc_sub_instruction * base,
62         struct rc_dst_register DstReg,
63         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
64 {
65         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
66
67         if (base) {
68                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
69         }
70
71         fpi->U.I.Opcode = Opcode;
72         fpi->U.I.DstReg = DstReg;
73         fpi->U.I.SrcReg[0] = SrcReg0;
74         fpi->U.I.SrcReg[1] = SrcReg1;
75         return fpi;
76 }
77
78 static struct rc_instruction *emit3(
79         struct radeon_compiler * c, struct rc_instruction * after,
80         rc_opcode Opcode, struct rc_sub_instruction * base,
81         struct rc_dst_register DstReg,
82         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
83         struct rc_src_register SrcReg2)
84 {
85         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
86
87         if (base) {
88                 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
89         }
90
91         fpi->U.I.Opcode = Opcode;
92         fpi->U.I.DstReg = DstReg;
93         fpi->U.I.SrcReg[0] = SrcReg0;
94         fpi->U.I.SrcReg[1] = SrcReg1;
95         fpi->U.I.SrcReg[2] = SrcReg2;
96         return fpi;
97 }
98
99 static struct rc_dst_register dstregtmpmask(int index, int mask)
100 {
101         struct rc_dst_register dst = {0, 0, 0};
102         dst.File = RC_FILE_TEMPORARY;
103         dst.Index = index;
104         dst.WriteMask = mask;
105         return dst;
106 }
107
108 static const struct rc_src_register builtin_zero = {
109         .File = RC_FILE_NONE,
110         .Index = 0,
111         .Swizzle = RC_SWIZZLE_0000
112 };
113 static const struct rc_src_register builtin_one = {
114         .File = RC_FILE_NONE,
115         .Index = 0,
116         .Swizzle = RC_SWIZZLE_1111
117 };
118
119 static const struct rc_src_register builtin_half = {
120         .File = RC_FILE_NONE,
121         .Index = 0,
122         .Swizzle = RC_SWIZZLE_HHHH
123 };
124
125 static const struct rc_src_register srcreg_undefined = {
126         .File = RC_FILE_NONE,
127         .Index = 0,
128         .Swizzle = RC_SWIZZLE_XYZW
129 };
130
131 static struct rc_src_register srcreg(int file, int index)
132 {
133         struct rc_src_register src = srcreg_undefined;
134         src.File = file;
135         src.Index = index;
136         return src;
137 }
138
139 static struct rc_src_register srcregswz(int file, int index, int swz)
140 {
141         struct rc_src_register src = srcreg_undefined;
142         src.File = file;
143         src.Index = index;
144         src.Swizzle = swz;
145         return src;
146 }
147
148 static struct rc_src_register absolute(struct rc_src_register reg)
149 {
150         struct rc_src_register newreg = reg;
151         newreg.Abs = 1;
152         newreg.Negate = RC_MASK_NONE;
153         return newreg;
154 }
155
156 static struct rc_src_register negate(struct rc_src_register reg)
157 {
158         struct rc_src_register newreg = reg;
159         newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
160         return newreg;
161 }
162
163 static struct rc_src_register swizzle(struct rc_src_register reg,
164                 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
165 {
166         struct rc_src_register swizzled = reg;
167         swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
168         return swizzled;
169 }
170
171 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
172                 rc_swizzle x)
173 {
174         return swizzle(reg, x, x, x, x);
175 }
176
177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
178 {
179         return swizzle_smear(reg, RC_SWIZZLE_X);
180 }
181
182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
183 {
184         return swizzle_smear(reg, RC_SWIZZLE_Y);
185 }
186
187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
188 {
189         return swizzle_smear(reg, RC_SWIZZLE_Z);
190 }
191
192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
193 {
194         return swizzle_smear(reg, RC_SWIZZLE_W);
195 }
196
197 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
198 {
199         const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
200         unsigned i;
201
202         assert(info->HasDstReg);
203
204         if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
205                 return 0;
206
207         for (i = 0; i < info->NumSrcRegs; i++) {
208                 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209                     inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
210                         return 0;
211         }
212
213         return 1;
214 }
215
216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217                                                struct rc_instruction *inst)
218 {
219         unsigned tmp;
220
221         if (is_dst_safe_to_reuse(inst))
222                 tmp = inst->U.I.DstReg.Index;
223         else
224                 tmp = rc_find_free_temporary(c);
225
226         return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
227 }
228
229 static void transform_ABS(struct radeon_compiler* c,
230         struct rc_instruction* inst)
231 {
232         struct rc_src_register src = inst->U.I.SrcReg[0];
233         src.Abs = 1;
234         src.Negate = RC_MASK_NONE;
235         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236         rc_remove_instruction(inst);
237 }
238
239 static void transform_CEIL(struct radeon_compiler* c,
240         struct rc_instruction* inst)
241 {
242         /* Assuming:
243          *     ceil(x) = -floor(-x)
244          *
245          * After inlining floor:
246          *     ceil(x) = -(-x-frac(-x))
247          *
248          * After simplification:
249          *     ceil(x) = x+frac(-x)
250          */
251
252         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254         emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256         rc_remove_instruction(inst);
257 }
258
259 static void transform_CLAMP(struct radeon_compiler *c,
260         struct rc_instruction *inst)
261 {
262         /* CLAMP dst, src, min, max
263          *    into:
264          * MIN tmp, src, max
265          * MAX dst, tmp, min
266          */
267         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268         emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269                 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270         emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271                 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272         rc_remove_instruction(inst);
273 }
274
275 static void transform_DP2(struct radeon_compiler* c,
276         struct rc_instruction* inst)
277 {
278         struct rc_src_register src0 = inst->U.I.SrcReg[0];
279         struct rc_src_register src1 = inst->U.I.SrcReg[1];
280         src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281         src0.Swizzle &= ~(63 << (3 * 2));
282         src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283         src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284         src1.Swizzle &= ~(63 << (3 * 2));
285         src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286         emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287         rc_remove_instruction(inst);
288 }
289
290 static void transform_DPH(struct radeon_compiler* c,
291         struct rc_instruction* inst)
292 {
293         struct rc_src_register src0 = inst->U.I.SrcReg[0];
294         src0.Negate &= ~RC_MASK_W;
295         src0.Swizzle &= ~(7 << (3 * 3));
296         src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297         emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298         rc_remove_instruction(inst);
299 }
300
301 /**
302  * [1, src0.y*src1.y, src0.z, src1.w]
303  * So basically MUL with lotsa swizzling.
304  */
305 static void transform_DST(struct radeon_compiler* c,
306         struct rc_instruction* inst)
307 {
308         emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311         rc_remove_instruction(inst);
312 }
313
314 static void transform_FLR(struct radeon_compiler* c,
315         struct rc_instruction* inst)
316 {
317         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319         emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320                 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321         rc_remove_instruction(inst);
322 }
323
324 /**
325  * Definition of LIT (from ARB_fragment_program):
326  *
327  *  tmp = VectorLoad(op0);
328  *  if (tmp.x < 0) tmp.x = 0;
329  *  if (tmp.y < 0) tmp.y = 0;
330  *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
331  *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
332  *  result.x = 1.0;
333  *  result.y = tmp.x;
334  *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
335  *  result.w = 1.0;
336  *
337  * The longest path of computation is the one leading to result.z,
338  * consisting of 5 operations. This implementation of LIT takes
339  * 5 slots, if the subsequent optimization passes are clever enough
340  * to pair instructions correctly.
341  */
342 static void transform_LIT(struct radeon_compiler* c,
343         struct rc_instruction* inst)
344 {
345         unsigned int constant;
346         unsigned int constant_swizzle;
347         unsigned int temp;
348         struct rc_src_register srctemp;
349
350         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
351
352         if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
353                 struct rc_instruction * inst_mov;
354
355                 inst_mov = emit1(c, inst,
356                         RC_OPCODE_MOV, 0, inst->U.I.DstReg,
357                         srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
358
359                 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
360                 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
361                 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
362         }
363
364         temp = inst->U.I.DstReg.Index;
365         srctemp = srcreg(RC_FILE_TEMPORARY, temp);
366
367         /* tmp.x = max(0.0, Src.x); */
368         /* tmp.y = max(0.0, Src.y); */
369         /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
370         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
371                 dstregtmpmask(temp, RC_MASK_XYW),
372                 inst->U.I.SrcReg[0],
373                 swizzle(srcreg(RC_FILE_CONSTANT, constant),
374                         RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
375         emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
376                 dstregtmpmask(temp, RC_MASK_Z),
377                 swizzle_wwww(srctemp),
378                 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
379
380         /* tmp.w = Pow(tmp.y, tmp.w) */
381         emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
382                 dstregtmpmask(temp, RC_MASK_W),
383                 swizzle_yyyy(srctemp));
384         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
385                 dstregtmpmask(temp, RC_MASK_W),
386                 swizzle_wwww(srctemp),
387                 swizzle_zzzz(srctemp));
388         emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
389                 dstregtmpmask(temp, RC_MASK_W),
390                 swizzle_wwww(srctemp));
391
392         /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
393         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
394                 dstregtmpmask(temp, RC_MASK_Z),
395                 negate(swizzle_xxxx(srctemp)),
396                 swizzle_wwww(srctemp),
397                 builtin_zero);
398
399         /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
400         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
401                 dstregtmpmask(temp, RC_MASK_XYW),
402                 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
403
404         rc_remove_instruction(inst);
405 }
406
407 static void transform_LRP(struct radeon_compiler* c,
408         struct rc_instruction* inst)
409 {
410         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
411
412         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
413                 dst,
414                 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
415         emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
416                 inst->U.I.DstReg,
417                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
418
419         rc_remove_instruction(inst);
420 }
421
422 static void transform_POW(struct radeon_compiler* c,
423         struct rc_instruction* inst)
424 {
425         struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
426         struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
427         tempdst.WriteMask = RC_MASK_W;
428         tempsrc.Swizzle = RC_SWIZZLE_WWWW;
429
430         emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
431         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
432         emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
433
434         rc_remove_instruction(inst);
435 }
436
437 /* dst = ROUND(src) :
438  *   add = src + .5
439  *   frac = FRC(add)
440  *   dst = add - frac
441  *
442  * According to the GLSL spec, the implementor can decide which way to round
443  * when the fraction is .5.  We round down for .5.
444  *
445  */
446 static void transform_ROUND(struct radeon_compiler* c,
447         struct rc_instruction* inst)
448 {
449         unsigned int mask = inst->U.I.DstReg.WriteMask;
450         unsigned int frac_index, add_index;
451         struct rc_dst_register frac_dst, add_dst;
452         struct rc_src_register frac_src, add_src;
453
454         /* add = src + .5 */
455         add_index = rc_find_free_temporary(c);
456         add_dst = dstregtmpmask(add_index, mask);
457         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
458                                                                 builtin_half);
459         add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
460
461
462         /* frac = FRC(add) */
463         frac_index = rc_find_free_temporary(c);
464         frac_dst = dstregtmpmask(frac_index, mask);
465         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
466         frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
467
468         /* dst = add - frac */
469         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
470                                                 add_src, negate(frac_src));
471         rc_remove_instruction(inst);
472 }
473
474 static void transform_RSQ(struct radeon_compiler* c,
475         struct rc_instruction* inst)
476 {
477         inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
478 }
479
480 static void transform_SEQ(struct radeon_compiler* c,
481         struct rc_instruction* inst)
482 {
483         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
484
485         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
486         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
487                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
488
489         rc_remove_instruction(inst);
490 }
491
492 static void transform_SFL(struct radeon_compiler* c,
493         struct rc_instruction* inst)
494 {
495         emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
496         rc_remove_instruction(inst);
497 }
498
499 static void transform_SGE(struct radeon_compiler* c,
500         struct rc_instruction* inst)
501 {
502         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
503
504         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
505         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
506                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
507
508         rc_remove_instruction(inst);
509 }
510
511 static void transform_SGT(struct radeon_compiler* c,
512         struct rc_instruction* inst)
513 {
514         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
515
516         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
517         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
518                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
519
520         rc_remove_instruction(inst);
521 }
522
523 static void transform_SLE(struct radeon_compiler* c,
524         struct rc_instruction* inst)
525 {
526         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
527
528         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
529         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
530                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
531
532         rc_remove_instruction(inst);
533 }
534
535 static void transform_SLT(struct radeon_compiler* c,
536         struct rc_instruction* inst)
537 {
538         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
539
540         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
541         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
542                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
543
544         rc_remove_instruction(inst);
545 }
546
547 static void transform_SNE(struct radeon_compiler* c,
548         struct rc_instruction* inst)
549 {
550         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
551
552         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
553         emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
554                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
555
556         rc_remove_instruction(inst);
557 }
558
559 static void transform_SSG(struct radeon_compiler* c,
560         struct rc_instruction* inst)
561 {
562         /* result = sign(x)
563          *
564          *   CMP tmp0, -x, 1, 0
565          *   CMP tmp1, x, 1, 0
566          *   ADD result, tmp0, -tmp1;
567          */
568         struct rc_dst_register dst0;
569         unsigned tmp1;
570
571         /* 0 < x */
572         dst0 = try_to_reuse_dst(c, inst);
573         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
574               dst0,
575               negate(inst->U.I.SrcReg[0]),
576               builtin_one,
577               builtin_zero);
578
579         /* x < 0 */
580         tmp1 = rc_find_free_temporary(c);
581         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
582               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
583               inst->U.I.SrcReg[0],
584               builtin_one,
585               builtin_zero);
586
587         /* Either both are zero, or one of them is one and the other is zero. */
588         /* result = tmp0 - tmp1 */
589         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
590               inst->U.I.DstReg,
591               srcreg(RC_FILE_TEMPORARY, dst0.Index),
592               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
593
594         rc_remove_instruction(inst);
595 }
596
597 static void transform_SUB(struct radeon_compiler* c,
598         struct rc_instruction* inst)
599 {
600         inst->U.I.Opcode = RC_OPCODE_ADD;
601         inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
602 }
603
604 static void transform_SWZ(struct radeon_compiler* c,
605         struct rc_instruction* inst)
606 {
607         inst->U.I.Opcode = RC_OPCODE_MOV;
608 }
609
610 static void transform_XPD(struct radeon_compiler* c,
611         struct rc_instruction* inst)
612 {
613         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
614
615         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
616                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
617                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
618         emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
619                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
620                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
621                 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
622
623         rc_remove_instruction(inst);
624 }
625
626
627 /**
628  * Can be used as a transformation for @ref radeonClauseLocalTransform,
629  * no userData necessary.
630  *
631  * Eliminates the following ALU instructions:
632  *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
633  * using:
634  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
635  *
636  * Transforms RSQ to Radeon's native RSQ by explicitly setting
637  * absolute value.
638  *
639  * @note should be applicable to R300 and R500 fragment programs.
640  */
641 int radeonTransformALU(
642         struct radeon_compiler * c,
643         struct rc_instruction* inst,
644         void* unused)
645 {
646         switch(inst->U.I.Opcode) {
647         case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
648         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
649         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
650         case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
651         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
652         case RC_OPCODE_DST: transform_DST(c, inst); return 1;
653         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
654         case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
655         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
656         case RC_OPCODE_POW: transform_POW(c, inst); return 1;
657         case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
658         case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
659         case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
660         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
661         case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
662         case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
663         case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
664         case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
665         case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
666         case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
667         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
668         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
669         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
670         default:
671                 return 0;
672         }
673 }
674
675
676 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
677         struct rc_instruction* inst)
678 {
679         /* Note: r500 can take absolute values, but r300 cannot. */
680         inst->U.I.Opcode = RC_OPCODE_MAX;
681         inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
682         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
683 }
684
685 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
686         struct rc_instruction* inst)
687 {
688         /* There is no decent CMP available, so let's rig one up.
689          * CMP is defined as dst = src0 < 0.0 ? src1 : src2
690          * The following sequence consumes zero to two temps and two extra slots
691          * (the second temp and the second slot is consumed by transform_LRP),
692          * but should be equivalent:
693          *
694          * SLT tmp0, src0, 0.0
695          * LRP dst, tmp0, src1, src2
696          *
697          * Yes, I know, I'm a mad scientist. ~ C. & M. */
698         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
699
700         /* SLT tmp0, src0, 0.0 */
701         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
702                 dst,
703                 inst->U.I.SrcReg[0], builtin_zero);
704
705         /* LRP dst, tmp0, src1, src2 */
706         transform_LRP(c,
707                 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
708                       inst->U.I.DstReg,
709                       srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
710
711         rc_remove_instruction(inst);
712 }
713
714 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
715         struct rc_instruction* inst)
716 {
717         struct rc_instruction *next_inst = inst->Next;
718         transform_DP2(c, inst);
719         next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
720 }
721
722 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
723         struct rc_instruction* inst)
724 {
725         struct rc_src_register src0 = inst->U.I.SrcReg[0];
726         struct rc_src_register src1 = inst->U.I.SrcReg[1];
727         src0.Negate &= ~RC_MASK_W;
728         src0.Swizzle &= ~(7 << (3 * 3));
729         src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
730         src1.Negate &= ~RC_MASK_W;
731         src1.Swizzle &= ~(7 << (3 * 3));
732         src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
733         emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
734         rc_remove_instruction(inst);
735 }
736
737 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
738         struct rc_instruction* inst)
739 {
740         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
741         unsigned constant_swizzle;
742         int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
743                                                          0.0000000000000000001,
744                                                          &constant_swizzle);
745
746         /* MOV dst, src */
747         dst.WriteMask = RC_MASK_XYZW;
748         emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
749                 dst,
750                 inst->U.I.SrcReg[0]);
751
752         /* MAX dst.y, src, 0.00...001 */
753         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
754                 dstregtmpmask(dst.Index, RC_MASK_Y),
755                 srcreg(RC_FILE_TEMPORARY, dst.Index),
756                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
757
758         inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
759 }
760
761 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
762         struct rc_instruction *inst)
763 {
764         /* x = y  <==>  x >= y && y >= x */
765         int tmp = rc_find_free_temporary(c);
766
767         /* x <= y */
768         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
769               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
770               inst->U.I.SrcReg[0],
771               inst->U.I.SrcReg[1]);
772
773         /* y <= x */
774         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
775               inst->U.I.DstReg,
776               inst->U.I.SrcReg[1],
777               inst->U.I.SrcReg[0]);
778
779         /* x && y  =  x * y */
780         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
781               inst->U.I.DstReg,
782               srcreg(RC_FILE_TEMPORARY, tmp),
783               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
784
785         rc_remove_instruction(inst);
786 }
787
788 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
789         struct rc_instruction *inst)
790 {
791         /* x != y  <==>  x < y || y < x */
792         int tmp = rc_find_free_temporary(c);
793
794         /* x < y */
795         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
796               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
797               inst->U.I.SrcReg[0],
798               inst->U.I.SrcReg[1]);
799
800         /* y < x */
801         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
802               inst->U.I.DstReg,
803               inst->U.I.SrcReg[1],
804               inst->U.I.SrcReg[0]);
805
806         /* x || y  =  max(x, y) */
807         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
808               inst->U.I.DstReg,
809               srcreg(RC_FILE_TEMPORARY, tmp),
810               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
811
812         rc_remove_instruction(inst);
813 }
814
815 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
816         struct rc_instruction* inst)
817 {
818         /* x > y  <==>  -x < -y */
819         inst->U.I.Opcode = RC_OPCODE_SLT;
820         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
821         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
822 }
823
824 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
825         struct rc_instruction* inst)
826 {
827         /* x <= y  <==>  -x >= -y */
828         inst->U.I.Opcode = RC_OPCODE_SGE;
829         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
830         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
831 }
832
833 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
834         struct rc_instruction* inst)
835 {
836         /* result = sign(x)
837          *
838          *   SLT tmp0, 0, x;
839          *   SLT tmp1, x, 0;
840          *   ADD result, tmp0, -tmp1;
841          */
842         struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
843         unsigned tmp1;
844
845         /* 0 < x */
846         dst0 = try_to_reuse_dst(c, inst);
847         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
848               dst0,
849               builtin_zero,
850               inst->U.I.SrcReg[0]);
851
852         /* x < 0 */
853         tmp1 = rc_find_free_temporary(c);
854         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
855               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
856               inst->U.I.SrcReg[0],
857               builtin_zero);
858
859         /* Either both are zero, or one of them is one and the other is zero. */
860         /* result = tmp0 - tmp1 */
861         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
862               inst->U.I.DstReg,
863               srcreg(RC_FILE_TEMPORARY, dst0.Index),
864               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
865
866         rc_remove_instruction(inst);
867 }
868
869 /**
870  * For use with rc_local_transform, this transforms non-native ALU
871  * instructions of the r300 up to r500 vertex engine.
872  */
873 int r300_transform_vertex_alu(
874         struct radeon_compiler * c,
875         struct rc_instruction* inst,
876         void* unused)
877 {
878         switch(inst->U.I.Opcode) {
879         case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
880         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
881         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
882         case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
883         case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
884         case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
885         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
886         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
887         case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
888         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
889         case RC_OPCODE_SEQ:
890                 if (!c->is_r500) {
891                         transform_r300_vertex_SEQ(c, inst);
892                         return 1;
893                 }
894                 return 0;
895         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
896         case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
897         case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
898         case RC_OPCODE_SNE:
899                 if (!c->is_r500) {
900                         transform_r300_vertex_SNE(c, inst);
901                         return 1;
902                 }
903                 return 0;
904         case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
905         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
906         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
907         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
908         default:
909                 return 0;
910         }
911 }
912
913 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
914 {
915         static const float SinCosConsts[2][4] = {
916                 {
917                         1.273239545,            /* 4/PI */
918                         -0.405284735,           /* -4/(PI*PI) */
919                         3.141592654,            /* PI */
920                         0.2225                  /* weight */
921                 },
922                 {
923                         0.75,
924                         0.5,
925                         0.159154943,            /* 1/(2*PI) */
926                         6.283185307             /* 2*PI */
927                 }
928         };
929         int i;
930
931         for(i = 0; i < 2; ++i)
932                 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
933 }
934
935 /**
936  * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
937  *
938  * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
939  * MAD tmp.x, tmp.y, |src|, tmp.x
940  * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
941  * MAD dest, tmp.y, weight, tmp.x
942  */
943 static void sin_approx(
944         struct radeon_compiler* c, struct rc_instruction * inst,
945         struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
946 {
947         unsigned int tempreg = rc_find_free_temporary(c);
948
949         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
950                 swizzle_xxxx(src),
951                 srcreg(RC_FILE_CONSTANT, constants[0]));
952         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
953                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
954                 absolute(swizzle_xxxx(src)),
955                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
956         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
957                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
958                 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
959                 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
960         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
961                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
962                 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
963                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
964 }
965
966 /**
967  * Translate the trigonometric functions COS, SIN, and SCS
968  * using only the basic instructions
969  *  MOV, ADD, MUL, MAD, FRC
970  */
971 int r300_transform_trig_simple(struct radeon_compiler* c,
972         struct rc_instruction* inst,
973         void* unused)
974 {
975         unsigned int constants[2];
976         unsigned int tempreg;
977
978         if (inst->U.I.Opcode != RC_OPCODE_COS &&
979             inst->U.I.Opcode != RC_OPCODE_SIN &&
980             inst->U.I.Opcode != RC_OPCODE_SCS)
981                 return 0;
982
983         tempreg = rc_find_free_temporary(c);
984
985         sincos_constants(c, constants);
986
987         if (inst->U.I.Opcode == RC_OPCODE_COS) {
988                 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
989                 /* FRC tmp.x, tmp.x */
990                 /* MAD tmp.z, tmp.x, 2*PI, -PI */
991                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
992                         swizzle_xxxx(inst->U.I.SrcReg[0]),
993                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
994                         swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
995                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
996                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
997                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
998                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
999                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1000                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1001
1002                 sin_approx(c, inst, inst->U.I.DstReg,
1003                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1004                         constants);
1005         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1006                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1007                         swizzle_xxxx(inst->U.I.SrcReg[0]),
1008                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1009                         swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1010                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1011                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1012                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1013                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1014                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1015                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1016
1017                 sin_approx(c, inst, inst->U.I.DstReg,
1018                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1019                         constants);
1020         } else {
1021                 struct rc_dst_register dst;
1022
1023                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1024                         swizzle_xxxx(inst->U.I.SrcReg[0]),
1025                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1026                         swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1027                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1028                         srcreg(RC_FILE_TEMPORARY, tempreg));
1029                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1030                         srcreg(RC_FILE_TEMPORARY, tempreg),
1031                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1032                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1033
1034                 dst = inst->U.I.DstReg;
1035
1036                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1037                 sin_approx(c, inst, dst,
1038                         swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1039                         constants);
1040
1041                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1042                 sin_approx(c, inst, dst,
1043                         swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1044                         constants);
1045         }
1046
1047         rc_remove_instruction(inst);
1048
1049         return 1;
1050 }
1051
1052 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1053         struct rc_instruction *inst,
1054         unsigned srctmp)
1055 {
1056         if (inst->U.I.Opcode == RC_OPCODE_COS) {
1057                 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1058                         srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1059         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1060                 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1061                         inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1062         } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1063                 struct rc_dst_register moddst = inst->U.I.DstReg;
1064
1065                 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1066                         moddst.WriteMask = RC_MASK_X;
1067                         emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1068                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1069                 }
1070                 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1071                         moddst.WriteMask = RC_MASK_Y;
1072                         emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1073                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1074                 }
1075         }
1076
1077         rc_remove_instruction(inst);
1078 }
1079
1080
1081 /**
1082  * Transform the trigonometric functions COS, SIN, and SCS
1083  * to include pre-scaling by 1/(2*PI) and taking the fractional
1084  * part, so that the input to COS and SIN is always in the range [0,1).
1085  * SCS is replaced by one COS and one SIN instruction.
1086  *
1087  * @warning This transformation implicitly changes the semantics of SIN and COS!
1088  */
1089 int radeonTransformTrigScale(struct radeon_compiler* c,
1090         struct rc_instruction* inst,
1091         void* unused)
1092 {
1093         static const float RCP_2PI = 0.15915494309189535;
1094         unsigned int temp;
1095         unsigned int constant;
1096         unsigned int constant_swizzle;
1097
1098         if (inst->U.I.Opcode != RC_OPCODE_COS &&
1099             inst->U.I.Opcode != RC_OPCODE_SIN &&
1100             inst->U.I.Opcode != RC_OPCODE_SCS)
1101                 return 0;
1102
1103         temp = rc_find_free_temporary(c);
1104         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1105
1106         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1107                 swizzle_xxxx(inst->U.I.SrcReg[0]),
1108                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1109         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1110                 srcreg(RC_FILE_TEMPORARY, temp));
1111
1112         r300_transform_SIN_COS_SCS(c, inst, temp);
1113         return 1;
1114 }
1115
1116 /**
1117  * Transform the trigonometric functions COS, SIN, and SCS
1118  * so that the input to COS and SIN is always in the range [-PI, PI].
1119  * SCS is replaced by one COS and one SIN instruction.
1120  */
1121 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1122         struct rc_instruction *inst,
1123         void *unused)
1124 {
1125         static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1126         unsigned int temp;
1127         unsigned int constant;
1128
1129         if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130             inst->U.I.Opcode != RC_OPCODE_SIN &&
1131             inst->U.I.Opcode != RC_OPCODE_SCS)
1132                 return 0;
1133
1134         /* Repeat x in the range [-PI, PI]:
1135          *
1136          *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1137          */
1138
1139         temp = rc_find_free_temporary(c);
1140         constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1141
1142         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1143                 swizzle_xxxx(inst->U.I.SrcReg[0]),
1144                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1145                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1146         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1147                 srcreg(RC_FILE_TEMPORARY, temp));
1148         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1149                 srcreg(RC_FILE_TEMPORARY, temp),
1150                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1151                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1152
1153         r300_transform_SIN_COS_SCS(c, inst, temp);
1154         return 1;
1155 }
1156
1157 /**
1158  * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1159  * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1160  * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1161  *
1162  * @warning This explicitly changes the form of DDX and DDY!
1163  */
1164
1165 int radeonTransformDeriv(struct radeon_compiler* c,
1166         struct rc_instruction* inst,
1167         void* unused)
1168 {
1169         if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1170                 return 0;
1171
1172         inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1173         inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1174
1175         return 1;
1176 }
1177
1178 /**
1179  * IF Temp[0].x -> IF Temp[0].x
1180  * ...          -> ...
1181  * KILP         -> KIL -abs(Temp[0].x)
1182  * ...          -> ...
1183  * ENDIF        -> ENDIF
1184  *
1185  * === OR ===
1186  *
1187  * IF Temp[0].x -\
1188  * KILP         - > KIL -abs(Temp[0].x)
1189  * ENDIF        -/
1190  *
1191  * === OR ===
1192  *
1193  * IF Temp[0].x -> IF Temp[0].x
1194  * ...          -> ...
1195  * ELSE         -> ELSE
1196  * ...          -> ...
1197  * KILP         -> KIL -abs(Temp[0].x)
1198  * ...          -> ...
1199  * ENDIF        -> ENDIF
1200  *
1201  * === OR ===
1202  *
1203  * KILP         -> KIL -none.1111
1204  *
1205  * This needs to be done in its own pass, because it might modify the
1206  * instructions before and after KILP.
1207  */
1208 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1209 {
1210         struct rc_instruction * inst;
1211         for (inst = c->Program.Instructions.Next;
1212                         inst != &c->Program.Instructions; inst = inst->Next) {
1213                 struct rc_instruction * if_inst;
1214                 unsigned in_if = 0;
1215
1216                 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1217                         continue;
1218
1219                 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1220                                                 if_inst = if_inst->Prev) {
1221
1222                         if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1223                                 in_if = 1;
1224                                 break;
1225                         }
1226                 }
1227
1228                 inst->U.I.Opcode = RC_OPCODE_KIL;
1229
1230                 if (!in_if) {
1231                         inst->U.I.SrcReg[0] = negate(builtin_one);
1232                 } else {
1233                         /* This should work even if the KILP is inside the ELSE
1234                          * block, because -0.0 is considered negative. */
1235                         inst->U.I.SrcReg[0] =
1236                                 negate(absolute(if_inst->U.I.SrcReg[0]));
1237
1238                         if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1239                                 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1240
1241                                 /* Optimize the special case:
1242                                  * IF Temp[0].x
1243                                  * KILP
1244                                  * ENDIF
1245                                  */
1246
1247                                 /* Remove IF */
1248                                 rc_remove_instruction(inst->Prev);
1249                                 /* Remove ENDIF */
1250                                 rc_remove_instruction(inst->Next);
1251                         }
1252                 }
1253         }
1254 }