Tizen 2.0 Release
[profile/ivi/osmesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_program_alu.c
1 /*
2  * Copyright (C) 2008 Nicolai Haehnle.
3  *
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sublicense, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial
16  * portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  */
27
28 /**
29  * @file
30  *
31  * Shareable transformations that transform "special" ALU instructions
32  * into ALU instructions that are supported by hardware.
33  *
34  */
35
36 #include "radeon_program_alu.h"
37
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40
41
42 static struct rc_instruction *emit1(
43         struct radeon_compiler * c, struct rc_instruction * after,
44         rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
45         struct rc_src_register SrcReg)
46 {
47         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
48
49         fpi->U.I.Opcode = Opcode;
50         fpi->U.I.SaturateMode = Saturate;
51         fpi->U.I.DstReg = DstReg;
52         fpi->U.I.SrcReg[0] = SrcReg;
53         return fpi;
54 }
55
56 static struct rc_instruction *emit2(
57         struct radeon_compiler * c, struct rc_instruction * after,
58         rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
59         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
60 {
61         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
62
63         fpi->U.I.Opcode = Opcode;
64         fpi->U.I.SaturateMode = Saturate;
65         fpi->U.I.DstReg = DstReg;
66         fpi->U.I.SrcReg[0] = SrcReg0;
67         fpi->U.I.SrcReg[1] = SrcReg1;
68         return fpi;
69 }
70
71 static struct rc_instruction *emit3(
72         struct radeon_compiler * c, struct rc_instruction * after,
73         rc_opcode Opcode, rc_saturate_mode Saturate, struct rc_dst_register DstReg,
74         struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
75         struct rc_src_register SrcReg2)
76 {
77         struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
78
79         fpi->U.I.Opcode = Opcode;
80         fpi->U.I.SaturateMode = Saturate;
81         fpi->U.I.DstReg = DstReg;
82         fpi->U.I.SrcReg[0] = SrcReg0;
83         fpi->U.I.SrcReg[1] = SrcReg1;
84         fpi->U.I.SrcReg[2] = SrcReg2;
85         return fpi;
86 }
87
88 static struct rc_dst_register dstregtmpmask(int index, int mask)
89 {
90         struct rc_dst_register dst = {0};
91         dst.File = RC_FILE_TEMPORARY;
92         dst.Index = index;
93         dst.WriteMask = mask;
94         return dst;
95 }
96
97 static const struct rc_src_register builtin_zero = {
98         .File = RC_FILE_NONE,
99         .Index = 0,
100         .Swizzle = RC_SWIZZLE_0000
101 };
102 static const struct rc_src_register builtin_one = {
103         .File = RC_FILE_NONE,
104         .Index = 0,
105         .Swizzle = RC_SWIZZLE_1111
106 };
107 static const struct rc_src_register srcreg_undefined = {
108         .File = RC_FILE_NONE,
109         .Index = 0,
110         .Swizzle = RC_SWIZZLE_XYZW
111 };
112
113 static struct rc_src_register srcreg(int file, int index)
114 {
115         struct rc_src_register src = srcreg_undefined;
116         src.File = file;
117         src.Index = index;
118         return src;
119 }
120
121 static struct rc_src_register srcregswz(int file, int index, int swz)
122 {
123         struct rc_src_register src = srcreg_undefined;
124         src.File = file;
125         src.Index = index;
126         src.Swizzle = swz;
127         return src;
128 }
129
130 static struct rc_src_register absolute(struct rc_src_register reg)
131 {
132         struct rc_src_register newreg = reg;
133         newreg.Abs = 1;
134         newreg.Negate = RC_MASK_NONE;
135         return newreg;
136 }
137
138 static struct rc_src_register negate(struct rc_src_register reg)
139 {
140         struct rc_src_register newreg = reg;
141         newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
142         return newreg;
143 }
144
145 static struct rc_src_register swizzle(struct rc_src_register reg,
146                 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
147 {
148         struct rc_src_register swizzled = reg;
149         swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
150         return swizzled;
151 }
152
153 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
154                 rc_swizzle x)
155 {
156         return swizzle(reg, x, x, x, x);
157 }
158
159 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
160 {
161         return swizzle_smear(reg, RC_SWIZZLE_X);
162 }
163
164 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
165 {
166         return swizzle_smear(reg, RC_SWIZZLE_Y);
167 }
168
169 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
170 {
171         return swizzle_smear(reg, RC_SWIZZLE_Z);
172 }
173
174 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
175 {
176         return swizzle_smear(reg, RC_SWIZZLE_W);
177 }
178
179 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
180 {
181         const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
182         unsigned i;
183
184         assert(info->HasDstReg);
185
186         if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
187                 return 0;
188
189         for (i = 0; i < info->NumSrcRegs; i++) {
190                 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
191                     inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
192                         return 0;
193         }
194
195         return 1;
196 }
197
198 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
199                                                struct rc_instruction *inst)
200 {
201         unsigned tmp;
202
203         if (is_dst_safe_to_reuse(inst))
204                 tmp = inst->U.I.DstReg.Index;
205         else
206                 tmp = rc_find_free_temporary(c);
207
208         return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
209 }
210
211 static void transform_ABS(struct radeon_compiler* c,
212         struct rc_instruction* inst)
213 {
214         struct rc_src_register src = inst->U.I.SrcReg[0];
215         src.Abs = 1;
216         src.Negate = RC_MASK_NONE;
217         emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, src);
218         rc_remove_instruction(inst);
219 }
220
221 static void transform_CEIL(struct radeon_compiler* c,
222         struct rc_instruction* inst)
223 {
224         /* Assuming:
225          *     ceil(x) = -floor(-x)
226          *
227          * After inlining floor:
228          *     ceil(x) = -(-x-frac(-x))
229          *
230          * After simplification:
231          *     ceil(x) = x+frac(-x)
232          */
233
234         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
235         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
236         emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
237                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
238         rc_remove_instruction(inst);
239 }
240
241 static void transform_CLAMP(struct radeon_compiler *c,
242         struct rc_instruction *inst)
243 {
244         /* CLAMP dst, src, min, max
245          *    into:
246          * MIN tmp, src, max
247          * MAX dst, tmp, min
248          */
249         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
250         emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
251                 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
252         emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
253                 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
254         rc_remove_instruction(inst);
255 }
256
257 static void transform_DP2(struct radeon_compiler* c,
258         struct rc_instruction* inst)
259 {
260         struct rc_src_register src0 = inst->U.I.SrcReg[0];
261         struct rc_src_register src1 = inst->U.I.SrcReg[1];
262         src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
263         src0.Swizzle &= ~(63 << (3 * 2));
264         src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
265         src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
266         src1.Swizzle &= ~(63 << (3 * 2));
267         src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
268         emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
269         rc_remove_instruction(inst);
270 }
271
272 static void transform_DPH(struct radeon_compiler* c,
273         struct rc_instruction* inst)
274 {
275         struct rc_src_register src0 = inst->U.I.SrcReg[0];
276         src0.Negate &= ~RC_MASK_W;
277         src0.Swizzle &= ~(7 << (3 * 3));
278         src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
279         emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
280         rc_remove_instruction(inst);
281 }
282
283 /**
284  * [1, src0.y*src1.y, src0.z, src1.w]
285  * So basically MUL with lotsa swizzling.
286  */
287 static void transform_DST(struct radeon_compiler* c,
288         struct rc_instruction* inst)
289 {
290         emit2(c, inst->Prev, RC_OPCODE_MUL, inst->U.I.SaturateMode, inst->U.I.DstReg,
291                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
292                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
293         rc_remove_instruction(inst);
294 }
295
296 static void transform_FLR(struct radeon_compiler* c,
297         struct rc_instruction* inst)
298 {
299         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
300         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
301         emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
302                 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
303         rc_remove_instruction(inst);
304 }
305
306 /**
307  * Definition of LIT (from ARB_fragment_program):
308  *
309  *  tmp = VectorLoad(op0);
310  *  if (tmp.x < 0) tmp.x = 0;
311  *  if (tmp.y < 0) tmp.y = 0;
312  *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
313  *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
314  *  result.x = 1.0;
315  *  result.y = tmp.x;
316  *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
317  *  result.w = 1.0;
318  *
319  * The longest path of computation is the one leading to result.z,
320  * consisting of 5 operations. This implementation of LIT takes
321  * 5 slots, if the subsequent optimization passes are clever enough
322  * to pair instructions correctly.
323  */
324 static void transform_LIT(struct radeon_compiler* c,
325         struct rc_instruction* inst)
326 {
327         unsigned int constant;
328         unsigned int constant_swizzle;
329         unsigned int temp;
330         struct rc_src_register srctemp;
331
332         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
333
334         if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
335                 struct rc_instruction * inst_mov;
336
337                 inst_mov = emit1(c, inst,
338                         RC_OPCODE_MOV, 0, inst->U.I.DstReg,
339                         srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
340
341                 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
342                 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
343                 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
344         }
345
346         temp = inst->U.I.DstReg.Index;
347         srctemp = srcreg(RC_FILE_TEMPORARY, temp);
348
349         /* tmp.x = max(0.0, Src.x); */
350         /* tmp.y = max(0.0, Src.y); */
351         /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
352         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
353                 dstregtmpmask(temp, RC_MASK_XYW),
354                 inst->U.I.SrcReg[0],
355                 swizzle(srcreg(RC_FILE_CONSTANT, constant),
356                         RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
357         emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
358                 dstregtmpmask(temp, RC_MASK_Z),
359                 swizzle_wwww(srctemp),
360                 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
361
362         /* tmp.w = Pow(tmp.y, tmp.w) */
363         emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
364                 dstregtmpmask(temp, RC_MASK_W),
365                 swizzle_yyyy(srctemp));
366         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
367                 dstregtmpmask(temp, RC_MASK_W),
368                 swizzle_wwww(srctemp),
369                 swizzle_zzzz(srctemp));
370         emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
371                 dstregtmpmask(temp, RC_MASK_W),
372                 swizzle_wwww(srctemp));
373
374         /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
375         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode,
376                 dstregtmpmask(temp, RC_MASK_Z),
377                 negate(swizzle_xxxx(srctemp)),
378                 swizzle_wwww(srctemp),
379                 builtin_zero);
380
381         /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
382         emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode,
383                 dstregtmpmask(temp, RC_MASK_XYW),
384                 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
385
386         rc_remove_instruction(inst);
387 }
388
389 static void transform_LRP(struct radeon_compiler* c,
390         struct rc_instruction* inst)
391 {
392         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
393
394         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
395                 dst,
396                 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
397         emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
398                 inst->U.I.DstReg,
399                 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
400
401         rc_remove_instruction(inst);
402 }
403
404 static void transform_POW(struct radeon_compiler* c,
405         struct rc_instruction* inst)
406 {
407         struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
408         struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
409         tempdst.WriteMask = RC_MASK_W;
410         tempsrc.Swizzle = RC_SWIZZLE_WWWW;
411
412         emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
413         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
414         emit1(c, inst->Prev, RC_OPCODE_EX2, inst->U.I.SaturateMode, inst->U.I.DstReg, tempsrc);
415
416         rc_remove_instruction(inst);
417 }
418
419 static void transform_RSQ(struct radeon_compiler* c,
420         struct rc_instruction* inst)
421 {
422         inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
423 }
424
425 static void transform_SEQ(struct radeon_compiler* c,
426         struct rc_instruction* inst)
427 {
428         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
429
430         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
431         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
432                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
433
434         rc_remove_instruction(inst);
435 }
436
437 static void transform_SFL(struct radeon_compiler* c,
438         struct rc_instruction* inst)
439 {
440         emit1(c, inst->Prev, RC_OPCODE_MOV, inst->U.I.SaturateMode, inst->U.I.DstReg, builtin_zero);
441         rc_remove_instruction(inst);
442 }
443
444 static void transform_SGE(struct radeon_compiler* c,
445         struct rc_instruction* inst)
446 {
447         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
448
449         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
450         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
451                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
452
453         rc_remove_instruction(inst);
454 }
455
456 static void transform_SGT(struct radeon_compiler* c,
457         struct rc_instruction* inst)
458 {
459         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
460
461         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
462         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
463                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
464
465         rc_remove_instruction(inst);
466 }
467
468 static void transform_SLE(struct radeon_compiler* c,
469         struct rc_instruction* inst)
470 {
471         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
472
473         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
474         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
475                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
476
477         rc_remove_instruction(inst);
478 }
479
480 static void transform_SLT(struct radeon_compiler* c,
481         struct rc_instruction* inst)
482 {
483         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
484
485         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
486         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
487                 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
488
489         rc_remove_instruction(inst);
490 }
491
492 static void transform_SNE(struct radeon_compiler* c,
493         struct rc_instruction* inst)
494 {
495         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
496
497         emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
498         emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
499                 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
500
501         rc_remove_instruction(inst);
502 }
503
504 static void transform_SSG(struct radeon_compiler* c,
505         struct rc_instruction* inst)
506 {
507         /* result = sign(x)
508          *
509          *   CMP tmp0, -x, 1, 0
510          *   CMP tmp1, x, 1, 0
511          *   ADD result, tmp0, -tmp1;
512          */
513         struct rc_dst_register dst0;
514         unsigned tmp1;
515
516         /* 0 < x */
517         dst0 = try_to_reuse_dst(c, inst);
518         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
519               dst0,
520               negate(inst->U.I.SrcReg[0]),
521               builtin_one,
522               builtin_zero);
523
524         /* x < 0 */
525         tmp1 = rc_find_free_temporary(c);
526         emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
527               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
528               inst->U.I.SrcReg[0],
529               builtin_one,
530               builtin_zero);
531
532         /* Either both are zero, or one of them is one and the other is zero. */
533         /* result = tmp0 - tmp1 */
534         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
535               inst->U.I.DstReg,
536               srcreg(RC_FILE_TEMPORARY, dst0.Index),
537               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
538
539         rc_remove_instruction(inst);
540 }
541
542 static void transform_SUB(struct radeon_compiler* c,
543         struct rc_instruction* inst)
544 {
545         inst->U.I.Opcode = RC_OPCODE_ADD;
546         inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
547 }
548
549 static void transform_SWZ(struct radeon_compiler* c,
550         struct rc_instruction* inst)
551 {
552         inst->U.I.Opcode = RC_OPCODE_MOV;
553 }
554
555 static void transform_XPD(struct radeon_compiler* c,
556         struct rc_instruction* inst)
557 {
558         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
559
560         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
561                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
562                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
563         emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
564                 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
565                 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
566                 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
567
568         rc_remove_instruction(inst);
569 }
570
571
572 /**
573  * Can be used as a transformation for @ref radeonClauseLocalTransform,
574  * no userData necessary.
575  *
576  * Eliminates the following ALU instructions:
577  *  ABS, CEIL, DPH, DST, FLR, LIT, LRP, POW, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SUB, SWZ, XPD
578  * using:
579  *  MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
580  *
581  * Transforms RSQ to Radeon's native RSQ by explicitly setting
582  * absolute value.
583  *
584  * @note should be applicable to R300 and R500 fragment programs.
585  */
586 int radeonTransformALU(
587         struct radeon_compiler * c,
588         struct rc_instruction* inst,
589         void* unused)
590 {
591         switch(inst->U.I.Opcode) {
592         case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
593         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
594         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
595         case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
596         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
597         case RC_OPCODE_DST: transform_DST(c, inst); return 1;
598         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
599         case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
600         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
601         case RC_OPCODE_POW: transform_POW(c, inst); return 1;
602         case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
603         case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
604         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
605         case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
606         case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
607         case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
608         case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
609         case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
610         case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
611         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
612         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
613         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
614         default:
615                 return 0;
616         }
617 }
618
619
620 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
621         struct rc_instruction* inst)
622 {
623         /* Note: r500 can take absolute values, but r300 cannot. */
624         inst->U.I.Opcode = RC_OPCODE_MAX;
625         inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
626         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
627 }
628
629 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
630         struct rc_instruction* inst)
631 {
632         /* There is no decent CMP available, so let's rig one up.
633          * CMP is defined as dst = src0 < 0.0 ? src1 : src2
634          * The following sequence consumes zero to two temps and two extra slots
635          * (the second temp and the second slot is consumed by transform_LRP),
636          * but should be equivalent:
637          *
638          * SLT tmp0, src0, 0.0
639          * LRP dst, tmp0, src1, src2
640          *
641          * Yes, I know, I'm a mad scientist. ~ C. & M. */
642         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
643
644         /* SLT tmp0, src0, 0.0 */
645         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
646                 dst,
647                 inst->U.I.SrcReg[0], builtin_zero);
648
649         /* LRP dst, tmp0, src1, src2 */
650         transform_LRP(c,
651                 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
652                       inst->U.I.DstReg,
653                       srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
654
655         rc_remove_instruction(inst);
656 }
657
658 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
659         struct rc_instruction* inst)
660 {
661         struct rc_instruction *next_inst = inst->Next;
662         transform_DP2(c, inst);
663         next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
664 }
665
666 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
667         struct rc_instruction* inst)
668 {
669         struct rc_src_register src0 = inst->U.I.SrcReg[0];
670         struct rc_src_register src1 = inst->U.I.SrcReg[1];
671         src0.Negate &= ~RC_MASK_W;
672         src0.Swizzle &= ~(7 << (3 * 3));
673         src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
674         src1.Negate &= ~RC_MASK_W;
675         src1.Swizzle &= ~(7 << (3 * 3));
676         src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
677         emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
678         rc_remove_instruction(inst);
679 }
680
681 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
682         struct rc_instruction* inst)
683 {
684         struct rc_dst_register dst = try_to_reuse_dst(c, inst);
685         unsigned constant_swizzle;
686         int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
687                                                          0.0000000000000000001,
688                                                          &constant_swizzle);
689
690         /* MOV dst, src */
691         dst.WriteMask = RC_MASK_XYZW;
692         emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
693                 dst,
694                 inst->U.I.SrcReg[0]);
695
696         /* MAX dst.y, src, 0.00...001 */
697         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
698                 dstregtmpmask(dst.Index, RC_MASK_Y),
699                 srcreg(RC_FILE_TEMPORARY, dst.Index),
700                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
701
702         inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
703 }
704
705 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
706         struct rc_instruction *inst)
707 {
708         /* x = y  <==>  x >= y && y >= x */
709         int tmp = rc_find_free_temporary(c);
710
711         /* x <= y */
712         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
713               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
714               inst->U.I.SrcReg[0],
715               inst->U.I.SrcReg[1]);
716
717         /* y <= x */
718         emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
719               inst->U.I.DstReg,
720               inst->U.I.SrcReg[1],
721               inst->U.I.SrcReg[0]);
722
723         /* x && y  =  x * y */
724         emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
725               inst->U.I.DstReg,
726               srcreg(RC_FILE_TEMPORARY, tmp),
727               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
728
729         rc_remove_instruction(inst);
730 }
731
732 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
733         struct rc_instruction *inst)
734 {
735         /* x != y  <==>  x < y || y < x */
736         int tmp = rc_find_free_temporary(c);
737
738         /* x < y */
739         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
740               dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
741               inst->U.I.SrcReg[0],
742               inst->U.I.SrcReg[1]);
743
744         /* y < x */
745         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
746               inst->U.I.DstReg,
747               inst->U.I.SrcReg[1],
748               inst->U.I.SrcReg[0]);
749
750         /* x || y  =  max(x, y) */
751         emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
752               inst->U.I.DstReg,
753               srcreg(RC_FILE_TEMPORARY, tmp),
754               srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
755
756         rc_remove_instruction(inst);
757 }
758
759 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
760         struct rc_instruction* inst)
761 {
762         /* x > y  <==>  -x < -y */
763         inst->U.I.Opcode = RC_OPCODE_SLT;
764         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
765         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
766 }
767
768 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
769         struct rc_instruction* inst)
770 {
771         /* x <= y  <==>  -x >= -y */
772         inst->U.I.Opcode = RC_OPCODE_SGE;
773         inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
774         inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
775 }
776
777 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
778         struct rc_instruction* inst)
779 {
780         /* result = sign(x)
781          *
782          *   SLT tmp0, 0, x;
783          *   SLT tmp1, x, 0;
784          *   ADD result, tmp0, -tmp1;
785          */
786         struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
787         unsigned tmp1;
788
789         /* 0 < x */
790         dst0 = try_to_reuse_dst(c, inst);
791         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
792               dst0,
793               builtin_zero,
794               inst->U.I.SrcReg[0]);
795
796         /* x < 0 */
797         tmp1 = rc_find_free_temporary(c);
798         emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
799               dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
800               inst->U.I.SrcReg[0],
801               builtin_zero);
802
803         /* Either both are zero, or one of them is one and the other is zero. */
804         /* result = tmp0 - tmp1 */
805         emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
806               inst->U.I.DstReg,
807               srcreg(RC_FILE_TEMPORARY, dst0.Index),
808               negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
809
810         rc_remove_instruction(inst);
811 }
812
813 /**
814  * For use with rc_local_transform, this transforms non-native ALU
815  * instructions of the r300 up to r500 vertex engine.
816  */
817 int r300_transform_vertex_alu(
818         struct radeon_compiler * c,
819         struct rc_instruction* inst,
820         void* unused)
821 {
822         switch(inst->U.I.Opcode) {
823         case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
824         case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
825         case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
826         case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
827         case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
828         case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
829         case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
830         case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
831         case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
832         case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
833         case RC_OPCODE_SEQ:
834                 if (!c->is_r500) {
835                         transform_r300_vertex_SEQ(c, inst);
836                         return 1;
837                 }
838                 return 0;
839         case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
840         case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
841         case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
842         case RC_OPCODE_SNE:
843                 if (!c->is_r500) {
844                         transform_r300_vertex_SNE(c, inst);
845                         return 1;
846                 }
847                 return 0;
848         case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
849         case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
850         case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
851         case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
852         default:
853                 return 0;
854         }
855 }
856
857 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
858 {
859         static const float SinCosConsts[2][4] = {
860                 {
861                         1.273239545,            /* 4/PI */
862                         -0.405284735,           /* -4/(PI*PI) */
863                         3.141592654,            /* PI */
864                         0.2225                  /* weight */
865                 },
866                 {
867                         0.75,
868                         0.5,
869                         0.159154943,            /* 1/(2*PI) */
870                         6.283185307             /* 2*PI */
871                 }
872         };
873         int i;
874
875         for(i = 0; i < 2; ++i)
876                 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
877 }
878
879 /**
880  * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
881  *
882  * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
883  * MAD tmp.x, tmp.y, |src|, tmp.x
884  * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
885  * MAD dest, tmp.y, weight, tmp.x
886  */
887 static void sin_approx(
888         struct radeon_compiler* c, struct rc_instruction * inst,
889         struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
890 {
891         unsigned int tempreg = rc_find_free_temporary(c);
892
893         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
894                 swizzle_xxxx(src),
895                 srcreg(RC_FILE_CONSTANT, constants[0]));
896         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
897                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
898                 absolute(swizzle_xxxx(src)),
899                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
900         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
901                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
902                 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
903                 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
904         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
905                 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
906                 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
907                 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
908 }
909
910 /**
911  * Translate the trigonometric functions COS, SIN, and SCS
912  * using only the basic instructions
913  *  MOV, ADD, MUL, MAD, FRC
914  */
915 int r300_transform_trig_simple(struct radeon_compiler* c,
916         struct rc_instruction* inst,
917         void* unused)
918 {
919         unsigned int constants[2];
920         unsigned int tempreg;
921
922         if (inst->U.I.Opcode != RC_OPCODE_COS &&
923             inst->U.I.Opcode != RC_OPCODE_SIN &&
924             inst->U.I.Opcode != RC_OPCODE_SCS)
925                 return 0;
926
927         tempreg = rc_find_free_temporary(c);
928
929         sincos_constants(c, constants);
930
931         if (inst->U.I.Opcode == RC_OPCODE_COS) {
932                 /* MAD tmp.x, src, 1/(2*PI), 0.75 */
933                 /* FRC tmp.x, tmp.x */
934                 /* MAD tmp.z, tmp.x, 2*PI, -PI */
935                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
936                         swizzle_xxxx(inst->U.I.SrcReg[0]),
937                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
938                         swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
939                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
940                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
941                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
942                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
943                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
944                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
945
946                 sin_approx(c, inst, inst->U.I.DstReg,
947                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
948                         constants);
949         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
950                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
951                         swizzle_xxxx(inst->U.I.SrcReg[0]),
952                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
953                         swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
954                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
955                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
956                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
957                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
958                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
959                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
960
961                 sin_approx(c, inst, inst->U.I.DstReg,
962                         swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
963                         constants);
964         } else {
965                 struct rc_dst_register dst;
966
967                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
968                         swizzle_xxxx(inst->U.I.SrcReg[0]),
969                         swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
970                         swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
971                 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
972                         srcreg(RC_FILE_TEMPORARY, tempreg));
973                 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
974                         srcreg(RC_FILE_TEMPORARY, tempreg),
975                         swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
976                         negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
977
978                 dst = inst->U.I.DstReg;
979
980                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
981                 sin_approx(c, inst, dst,
982                         swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
983                         constants);
984
985                 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
986                 sin_approx(c, inst, dst,
987                         swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
988                         constants);
989         }
990
991         rc_remove_instruction(inst);
992
993         return 1;
994 }
995
996 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
997         struct rc_instruction *inst,
998         unsigned srctmp)
999 {
1000         if (inst->U.I.Opcode == RC_OPCODE_COS) {
1001                 emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, inst->U.I.DstReg,
1002                         srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1003         } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1004                 emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode,
1005                         inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1006         } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1007                 struct rc_dst_register moddst = inst->U.I.DstReg;
1008
1009                 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1010                         moddst.WriteMask = RC_MASK_X;
1011                         emit1(c, inst->Prev, RC_OPCODE_COS, inst->U.I.SaturateMode, moddst,
1012                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1013                 }
1014                 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1015                         moddst.WriteMask = RC_MASK_Y;
1016                         emit1(c, inst->Prev, RC_OPCODE_SIN, inst->U.I.SaturateMode, moddst,
1017                                 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1018                 }
1019         }
1020
1021         rc_remove_instruction(inst);
1022 }
1023
1024
1025 /**
1026  * Transform the trigonometric functions COS, SIN, and SCS
1027  * to include pre-scaling by 1/(2*PI) and taking the fractional
1028  * part, so that the input to COS and SIN is always in the range [0,1).
1029  * SCS is replaced by one COS and one SIN instruction.
1030  *
1031  * @warning This transformation implicitly changes the semantics of SIN and COS!
1032  */
1033 int radeonTransformTrigScale(struct radeon_compiler* c,
1034         struct rc_instruction* inst,
1035         void* unused)
1036 {
1037         static const float RCP_2PI = 0.15915494309189535;
1038         unsigned int temp;
1039         unsigned int constant;
1040         unsigned int constant_swizzle;
1041
1042         if (inst->U.I.Opcode != RC_OPCODE_COS &&
1043             inst->U.I.Opcode != RC_OPCODE_SIN &&
1044             inst->U.I.Opcode != RC_OPCODE_SCS)
1045                 return 0;
1046
1047         temp = rc_find_free_temporary(c);
1048         constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
1049
1050         emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1051                 swizzle_xxxx(inst->U.I.SrcReg[0]),
1052                 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
1053         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1054                 srcreg(RC_FILE_TEMPORARY, temp));
1055
1056         r300_transform_SIN_COS_SCS(c, inst, temp);
1057         return 1;
1058 }
1059
1060 /**
1061  * Transform the trigonometric functions COS, SIN, and SCS
1062  * so that the input to COS and SIN is always in the range [-PI, PI].
1063  * SCS is replaced by one COS and one SIN instruction.
1064  */
1065 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1066         struct rc_instruction *inst,
1067         void *unused)
1068 {
1069         static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1070         unsigned int temp;
1071         unsigned int constant;
1072
1073         if (inst->U.I.Opcode != RC_OPCODE_COS &&
1074             inst->U.I.Opcode != RC_OPCODE_SIN &&
1075             inst->U.I.Opcode != RC_OPCODE_SCS)
1076                 return 0;
1077
1078         /* Repeat x in the range [-PI, PI]:
1079          *
1080          *   repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1081          */
1082
1083         temp = rc_find_free_temporary(c);
1084         constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1085
1086         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1087                 swizzle_xxxx(inst->U.I.SrcReg[0]),
1088                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1089                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1090         emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1091                 srcreg(RC_FILE_TEMPORARY, temp));
1092         emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1093                 srcreg(RC_FILE_TEMPORARY, temp),
1094                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1095                 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1096
1097         r300_transform_SIN_COS_SCS(c, inst, temp);
1098         return 1;
1099 }
1100
1101 /**
1102  * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1103  * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1104  * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1105  *
1106  * @warning This explicitly changes the form of DDX and DDY!
1107  */
1108
1109 int radeonTransformDeriv(struct radeon_compiler* c,
1110         struct rc_instruction* inst,
1111         void* unused)
1112 {
1113         if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
1114                 return 0;
1115
1116         inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1117         inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1118
1119         return 1;
1120 }
1121
1122 /**
1123  * IF Temp[0].x -\
1124  * KILP         - > KIL -abs(Temp[0].x)
1125  * ENDIF        -/
1126  *
1127  * This needs to be done in its own pass, because it modifies the instructions
1128  * before and after KILP.
1129  */
1130 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1131 {
1132         struct rc_instruction * inst;
1133         for (inst = c->Program.Instructions.Next;
1134                         inst != &c->Program.Instructions; inst = inst->Next) {
1135
1136                 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1137                         continue;
1138
1139                 inst->U.I.Opcode = RC_OPCODE_KIL;
1140
1141                 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1142                                 || inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1143                         inst->U.I.SrcReg[0] = negate(builtin_one);
1144                 } else {
1145
1146                         inst->U.I.SrcReg[0] =
1147                                 negate(absolute(inst->Prev->U.I.SrcReg[0]));
1148                         /* Remove IF */
1149                         rc_remove_instruction(inst->Prev);
1150                         /* Remove ENDIF */
1151                         rc_remove_instruction(inst->Next);
1152                 }
1153         }
1154 }