2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
36 #include "radeon_program_alu.h"
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
/**
 * Insert a new instruction after @p after and set it up as a one-source op.
 *
 * The visible code copies @p base into the new sub-instruction and then
 * overrides Opcode, DstReg and SrcReg[0].
 * NOTE(review): many callers in this file pass 0 for @p base, so the memcpy
 * is presumably guarded by a null check on a line not visible in this
 * excerpt -- confirm. The return statement is not visible either; callers
 * use the result as the newly inserted instruction.
 */
42 static struct rc_instruction *emit1(
43 struct radeon_compiler * c, struct rc_instruction * after,
44 rc_opcode Opcode, struct rc_sub_instruction * base,
45 struct rc_dst_register DstReg, struct rc_src_register SrcReg)
47 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
50 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
53 fpi->U.I.Opcode = Opcode;
54 fpi->U.I.DstReg = DstReg;
55 fpi->U.I.SrcReg[0] = SrcReg;
/**
 * Two-source variant of emit1: insert a new instruction after @p after,
 * copy @p base into it, then override Opcode, DstReg, SrcReg[0] and
 * SrcReg[1].
 * NOTE(review): callers pass 0 for @p base; the guard around the memcpy and
 * the return statement are presumably on lines not visible here -- confirm.
 */
59 static struct rc_instruction *emit2(
60 struct radeon_compiler * c, struct rc_instruction * after,
61 rc_opcode Opcode, struct rc_sub_instruction * base,
62 struct rc_dst_register DstReg,
63 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1)
65 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
68 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
71 fpi->U.I.Opcode = Opcode;
72 fpi->U.I.DstReg = DstReg;
73 fpi->U.I.SrcReg[0] = SrcReg0;
74 fpi->U.I.SrcReg[1] = SrcReg1;
/**
 * Three-source variant of emit1: insert a new instruction after @p after,
 * copy @p base into it, then override Opcode, DstReg and SrcReg[0..2].
 * NOTE(review): callers pass 0 for @p base; the guard around the memcpy and
 * the return statement are presumably on lines not visible here -- confirm.
 */
78 static struct rc_instruction *emit3(
79 struct radeon_compiler * c, struct rc_instruction * after,
80 rc_opcode Opcode, struct rc_sub_instruction * base,
81 struct rc_dst_register DstReg,
82 struct rc_src_register SrcReg0, struct rc_src_register SrcReg1,
83 struct rc_src_register SrcReg2)
85 struct rc_instruction *fpi = rc_insert_new_instruction(c, after);
88 memcpy(&fpi->U.I, base, sizeof(struct rc_sub_instruction));
91 fpi->U.I.Opcode = Opcode;
92 fpi->U.I.DstReg = DstReg;
93 fpi->U.I.SrcReg[0] = SrcReg0;
94 fpi->U.I.SrcReg[1] = SrcReg1;
95 fpi->U.I.SrcReg[2] = SrcReg2;
/**
 * Build a destination register in RC_FILE_TEMPORARY with write mask @p mask.
 * NOTE(review): the assignment of @p index to the register and the return
 * statement are not visible in this excerpt -- presumably dst.Index = index.
 */
99 static struct rc_dst_register dstregtmpmask(int index, int mask)
101 struct rc_dst_register dst = {0, 0, 0};
102 dst.File = RC_FILE_TEMPORARY;
104 dst.WriteMask = mask;
/* Source operand with no register file whose four channels use the 0000 swizzle. */
108 static const struct rc_src_register builtin_zero = {
109 .File = RC_FILE_NONE,
111 .Swizzle = RC_SWIZZLE_0000
/* Source operand with no register file whose four channels use the 1111 swizzle. */
113 static const struct rc_src_register builtin_one = {
114 .File = RC_FILE_NONE,
116 .Swizzle = RC_SWIZZLE_1111
/* Source operand with no register file whose four channels use the HHHH swizzle. */
119 static const struct rc_src_register builtin_half = {
120 .File = RC_FILE_NONE,
122 .Swizzle = RC_SWIZZLE_HHHH
/*
 * Template source operand: no register file, identity XYZW swizzle.
 * srcreg() and srcregswz() start from a copy of this value.
 */
125 static const struct rc_src_register srcreg_undefined = {
126 .File = RC_FILE_NONE,
128 .Swizzle = RC_SWIZZLE_XYZW
/**
 * Build a source operand, starting from a copy of srcreg_undefined.
 * NOTE(review): the lines that apply @p file and @p index and return the
 * result are not visible in this excerpt -- confirm against the full file.
 */
131 static struct rc_src_register srcreg(int file, int index)
133 struct rc_src_register src = srcreg_undefined;
/**
 * Build a source operand, starting from a copy of srcreg_undefined.
 * NOTE(review): the lines that apply @p file, @p index and @p swz and
 * return the result are not visible in this excerpt -- confirm.
 */
139 static struct rc_src_register srcregswz(int file, int index, int swz)
141 struct rc_src_register src = srcreg_undefined;
/**
 * Return a copy of @p reg with all negate bits cleared.
 * NOTE(review): going by the name, an absolute-value flag is presumably
 * also set on a line not visible in this excerpt -- confirm.
 */
148 static struct rc_src_register absolute(struct rc_src_register reg)
150 struct rc_src_register newreg = reg;
152 newreg.Negate = RC_MASK_NONE;
/**
 * Return a copy of @p reg with the negate bit of all four channels toggled
 * (XOR with RC_MASK_XYZW), so negating twice restores the original bits.
 */
156 static struct rc_src_register negate(struct rc_src_register reg)
158 struct rc_src_register newreg = reg;
159 newreg.Negate = newreg.Negate ^ RC_MASK_XYZW;
/**
 * Return a copy of @p reg whose swizzle is the existing swizzle combined
 * with the selectors (@p x, @p y, @p z, @p w) via combine_swizzles4().
 */
163 static struct rc_src_register swizzle(struct rc_src_register reg,
164 rc_swizzle x, rc_swizzle y, rc_swizzle z, rc_swizzle w)
166 struct rc_src_register swizzled = reg;
167 swizzled.Swizzle = combine_swizzles4(reg.Swizzle, x, y, z, w);
/**
 * Apply the same selector to all four channels of @p reg via swizzle().
 * The selector parameter (x) is declared on a line not visible here.
 */
171 static struct rc_src_register swizzle_smear(struct rc_src_register reg,
174 return swizzle(reg, x, x, x, x);
/** Replicate the X selector of @p reg's current swizzle into all channels. */
177 static struct rc_src_register swizzle_xxxx(struct rc_src_register reg)
179 return swizzle_smear(reg, RC_SWIZZLE_X);
/** Replicate the Y selector of @p reg's current swizzle into all channels. */
182 static struct rc_src_register swizzle_yyyy(struct rc_src_register reg)
184 return swizzle_smear(reg, RC_SWIZZLE_Y);
/** Replicate the Z selector of @p reg's current swizzle into all channels. */
187 static struct rc_src_register swizzle_zzzz(struct rc_src_register reg)
189 return swizzle_smear(reg, RC_SWIZZLE_Z);
/** Replicate the W selector of @p reg's current swizzle into all channels. */
192 static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
194 return swizzle_smear(reg, RC_SWIZZLE_W);
/**
 * Decide whether @p inst's destination register may double as a scratch
 * temporary for the instructions that replace it.
 *
 * The visible checks are: the opcode must have a destination register
 * (asserted), the destination is tested for not being in
 * RC_FILE_TEMPORARY, and every source is tested for being a temporary with
 * the same index as the destination.
 * NOTE(review): the return statements are not visible in this excerpt;
 * presumably both tests yield 0 and the fall-through yields 1 -- confirm.
 */
197 static int is_dst_safe_to_reuse(struct rc_instruction *inst)
199 const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
202 assert(info->HasDstReg);
204 if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
207 for (i = 0; i < info->NumSrcRegs; i++) {
208 if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
209 inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
/**
 * Pick a temporary register for intermediate results of a transform.
 *
 * Uses @p inst's own destination index when is_dst_safe_to_reuse() allows
 * it, otherwise a register from rc_find_free_temporary(). The result
 * carries the write mask of @p inst's destination.
 * NOTE(review): the `else` that separates the two assignments is not
 * visible in this excerpt -- confirm.
 */
216 static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
217 struct rc_instruction *inst)
221 if (is_dst_safe_to_reuse(inst))
222 tmp = inst->U.I.DstReg.Index;
224 tmp = rc_find_free_temporary(c);
226 return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
/**
 * Replace ABS with a MOV of source 0 whose negate bits are cleared, then
 * remove the original instruction.
 * NOTE(review): an absolute-value flag is presumably set on the source on a
 * line not visible in this excerpt -- confirm.
 */
229 static void transform_ABS(struct radeon_compiler* c,
230 struct rc_instruction* inst)
232 struct rc_src_register src = inst->U.I.SrcReg[0];
234 src.Negate = RC_MASK_NONE;
235 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, src);
236 rc_remove_instruction(inst);
/**
 * Replace CEIL with two instructions:
 *   FRC tmp, -src0
 *   ADD dst, src0, tmp
 * where tmp comes from try_to_reuse_dst(). The original is then removed.
 */
239 static void transform_CEIL(struct radeon_compiler* c,
240 struct rc_instruction* inst)
243 * ceil(x) = -floor(-x)
245 * After inlining floor:
246 * ceil(x) = -(-x-frac(-x))
248 * After simplification:
249 * ceil(x) = x+frac(-x)
252 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
253 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
254 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
255 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
256 rc_remove_instruction(inst);
/**
 * Replace CLAMP dst, src0, src1, src2 with two instructions:
 *   MIN tmp, src0, src2
 *   MAX dst, tmp, src1
 * where tmp comes from try_to_reuse_dst(). The original is then removed.
 */
259 static void transform_CLAMP(struct radeon_compiler *c,
260 struct rc_instruction *inst)
262 /* CLAMP dst, src, min, max
267 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
268 emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
269 inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
270 emit2(c, inst->Prev, RC_OPCODE_MAX, &inst->U.I, inst->U.I.DstReg,
271 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
272 rc_remove_instruction(inst);
/**
 * Replace DP2 with a DP3 whose Z (and W) inputs are forced to zero.
 *
 * Swizzle selectors are 3 bits per channel, as the shifts by 3*n show.
 * Masking with 63 << (3 * 2) clears the Z and W selectors, which are then
 * set to RC_SWIZZLE_ZERO. The Z/W negate bits are cleared on both operands.
 */
275 static void transform_DP2(struct radeon_compiler* c,
276 struct rc_instruction* inst)
278 struct rc_src_register src0 = inst->U.I.SrcReg[0];
279 struct rc_src_register src1 = inst->U.I.SrcReg[1];
280 src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
281 src0.Swizzle &= ~(63 << (3 * 2));
282 src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
283 src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
284 src1.Swizzle &= ~(63 << (3 * 2));
285 src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
286 emit2(c, inst->Prev, RC_OPCODE_DP3, &inst->U.I, inst->U.I.DstReg, src0, src1);
287 rc_remove_instruction(inst);
/**
 * Replace DPH with a DP4 whose first operand has its W channel forced to
 * one: the W selector is replaced by RC_SWIZZLE_ONE and the W negate bit is
 * cleared. The second operand is passed through unchanged.
 */
290 static void transform_DPH(struct radeon_compiler* c,
291 struct rc_instruction* inst)
293 struct rc_src_register src0 = inst->U.I.SrcReg[0];
294 src0.Negate &= ~RC_MASK_W;
295 src0.Swizzle &= ~(7 << (3 * 3));
296 src0.Swizzle |= RC_SWIZZLE_ONE << (3 * 3);
297 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, inst->U.I.SrcReg[1]);
298 rc_remove_instruction(inst);
302 * [1, src0.y*src1.y, src0.z, src1.w]
303 * So basically MUL with lotsa swizzling.
/**
 * Replace DST with a single MUL of two swizzled operands:
 *   src0 as (1, y, z, 1) times src1 as (1, y, 1, w),
 * which yields (1, src0.y*src1.y, src0.z, src1.w).
 */
305 static void transform_DST(struct radeon_compiler* c,
306 struct rc_instruction* inst)
308 emit2(c, inst->Prev, RC_OPCODE_MUL, &inst->U.I, inst->U.I.DstReg,
309 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_ONE),
310 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_ONE, RC_SWIZZLE_Y, RC_SWIZZLE_ONE, RC_SWIZZLE_W));
311 rc_remove_instruction(inst);
/**
 * Replace FLR with floor(x) = x - frac(x):
 *   FRC tmp, src0
 *   ADD dst, src0, -tmp
 * where tmp comes from try_to_reuse_dst(). The original is then removed.
 */
314 static void transform_FLR(struct radeon_compiler* c,
315 struct rc_instruction* inst)
317 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
318 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
319 emit2(c, inst->Prev, RC_OPCODE_ADD, &inst->U.I, inst->U.I.DstReg,
320 inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
321 rc_remove_instruction(inst);
325 * Definition of LIT (from ARB_fragment_program):
327 * tmp = VectorLoad(op0);
328 * if (tmp.x < 0) tmp.x = 0;
329 * if (tmp.y < 0) tmp.y = 0;
330 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
331 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
334 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
337 * The longest path of computation is the one leading to result.z,
338 * consisting of 5 operations. This implementation of LIT takes
339 * 5 slots, if the subsequent optimization passes are clever enough
340 * to pair instructions correctly.
/**
 * Expand LIT into a sequence of MAX, MIN, LG2, MUL, EX2, CMP and MOV.
 *
 * A scalar immediate (-127.999999) is added to the constant table. If the
 * destination is not a temporary with a full XYZW write mask, a MOV from a
 * fresh temporary to the real destination is inserted after @p inst and
 * @p inst is redirected to write that temporary, so the sequence below can
 * use all four channels as scratch.
 *
 * All steps are emitted before @p inst, which is removed at the end.
 * NOTE(review): the first source of the MAX and the last source of the CMP
 * are on lines not visible in this excerpt -- confirm against the full file.
 */
342 static void transform_LIT(struct radeon_compiler* c,
343 struct rc_instruction* inst)
345 unsigned int constant;
346 unsigned int constant_swizzle;
348 struct rc_src_register srctemp;
350 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, -127.999999, &constant_swizzle);
352 if (inst->U.I.DstReg.WriteMask != RC_MASK_XYZW || inst->U.I.DstReg.File != RC_FILE_TEMPORARY) {
353 struct rc_instruction * inst_mov;
355 inst_mov = emit1(c, inst,
356 RC_OPCODE_MOV, 0, inst->U.I.DstReg,
357 srcreg(RC_FILE_TEMPORARY, rc_find_free_temporary(c)));
359 inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
360 inst->U.I.DstReg.Index = inst_mov->U.I.SrcReg[0].Index;
361 inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
364 temp = inst->U.I.DstReg.Index;
365 srctemp = srcreg(RC_FILE_TEMPORARY, temp);
367 /* tmp.x = max(0.0, Src.x); */
368 /* tmp.y = max(0.0, Src.y); */
369 /* tmp.w = clamp(Src.z, -128+eps, 128-eps); */
370 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
371 dstregtmpmask(temp, RC_MASK_XYW),
373 swizzle(srcreg(RC_FILE_CONSTANT, constant),
374 RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, RC_SWIZZLE_ZERO, constant_swizzle&3));
375 emit2(c, inst->Prev, RC_OPCODE_MIN, 0,
376 dstregtmpmask(temp, RC_MASK_Z),
377 swizzle_wwww(srctemp),
378 negate(srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle)));
380 /* tmp.w = Pow(tmp.y, tmp.w) */
381 emit1(c, inst->Prev, RC_OPCODE_LG2, 0,
382 dstregtmpmask(temp, RC_MASK_W),
383 swizzle_yyyy(srctemp));
384 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
385 dstregtmpmask(temp, RC_MASK_W),
386 swizzle_wwww(srctemp),
387 swizzle_zzzz(srctemp));
388 emit1(c, inst->Prev, RC_OPCODE_EX2, 0,
389 dstregtmpmask(temp, RC_MASK_W),
390 swizzle_wwww(srctemp));
392 /* tmp.z = (tmp.x > 0) ? tmp.w : 0.0 */
393 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I,
394 dstregtmpmask(temp, RC_MASK_Z),
395 negate(swizzle_xxxx(srctemp)),
396 swizzle_wwww(srctemp),
399 /* tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0 */
400 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I,
401 dstregtmpmask(temp, RC_MASK_XYW),
402 swizzle(srctemp, RC_SWIZZLE_ONE, RC_SWIZZLE_X, RC_SWIZZLE_ONE, RC_SWIZZLE_ONE));
404 rc_remove_instruction(inst);
/**
 * Replace LRP with an ADD followed by a MAD:
 *   ADD (src1 - src2)
 *   MAD (src0 * tmp + src2)
 * where tmp is the register from try_to_reuse_dst().
 * NOTE(review): the destination operands of both emits are on lines not
 * visible here; presumably the ADD writes tmp and the MAD writes the
 * original destination -- confirm.
 */
407 static void transform_LRP(struct radeon_compiler* c,
408 struct rc_instruction* inst)
410 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
412 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
414 inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
415 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I,
417 inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
419 rc_remove_instruction(inst);
/**
 * Replace POW with pow(a, b) = exp2(b * log2(a)), using the W channel of a
 * temporary as scratch:
 *   LG2 tmp.w, src0.x
 *   MUL tmp.w, tmp.w, src1.x
 *   EX2 dst, tmp.w
 */
422 static void transform_POW(struct radeon_compiler* c,
423 struct rc_instruction* inst)
425 struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
426 struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
427 tempdst.WriteMask = RC_MASK_W;
428 tempsrc.Swizzle = RC_SWIZZLE_WWWW;
430 emit1(c, inst->Prev, RC_OPCODE_LG2, 0, tempdst, swizzle_xxxx(inst->U.I.SrcReg[0]));
431 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, tempdst, tempsrc, swizzle_xxxx(inst->U.I.SrcReg[1]));
432 emit1(c, inst->Prev, RC_OPCODE_EX2, &inst->U.I, inst->U.I.DstReg, tempsrc);
434 rc_remove_instruction(inst);
437 /* dst = ROUND(src) :
442 * According to the GLSL spec, the implementor can decide which way to round
443 * when the fraction is .5. We round down for .5.
/**
 * Replace ROUND with three instructions using two fresh temporaries, both
 * written with the original destination's write mask:
 *   ADD add, src0, <operand on a line not visible here>
 *   FRC frac, add
 *   ADD dst, add, -frac
 * NOTE(review): the second ADD operand is presumably the 0.5 constant
 * (builtin_half) -- confirm against the full file.
 */
446 static void transform_ROUND(struct radeon_compiler* c,
447 struct rc_instruction* inst)
449 unsigned int mask = inst->U.I.DstReg.WriteMask;
450 unsigned int frac_index, add_index;
451 struct rc_dst_register frac_dst, add_dst;
452 struct rc_src_register frac_src, add_src;
455 add_index = rc_find_free_temporary(c);
456 add_dst = dstregtmpmask(add_index, mask);
457 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, add_dst, inst->U.I.SrcReg[0],
459 add_src = srcreg(RC_FILE_TEMPORARY, add_dst.Index);
462 /* frac = FRC(add) */
463 frac_index = rc_find_free_temporary(c);
464 frac_dst = dstregtmpmask(frac_index, mask);
465 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, frac_dst, add_src);
466 frac_src = srcreg(RC_FILE_TEMPORARY, frac_dst.Index);
468 /* dst = add - frac */
469 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, inst->U.I.DstReg,
470 add_src, negate(frac_src));
471 rc_remove_instruction(inst);
/**
 * Keep the RSQ instruction but rewrite its source in place through
 * absolute(), so the operand is taken as an absolute value.
 */
474 static void transform_RSQ(struct radeon_compiler* c,
475 struct rc_instruction* inst)
477 inst->U.I.SrcReg[0] = absolute(inst->U.I.SrcReg[0]);
/**
 * Replace SEQ (set on equal) with:
 *   ADD tmp, src0, -src1
 *   CMP dst, -|tmp|, 0, 1
 * CMP selects its second operand when the first is negative, so dst is 0
 * when the operands differ and 1 otherwise.
 */
480 static void transform_SEQ(struct radeon_compiler* c,
481 struct rc_instruction* inst)
483 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
485 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
486 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
487 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
489 rc_remove_instruction(inst);
/** Replace SFL with a MOV of the builtin zero operand into the destination. */
492 static void transform_SFL(struct radeon_compiler* c,
493 struct rc_instruction* inst)
495 emit1(c, inst->Prev, RC_OPCODE_MOV, &inst->U.I, inst->U.I.DstReg, builtin_zero);
496 rc_remove_instruction(inst);
/**
 * Replace SGE (set on greater-or-equal) with:
 *   ADD tmp, src0, -src1
 *   CMP dst, tmp, 0, 1
 * so dst is 0 when src0 - src1 is negative and 1 otherwise.
 */
499 static void transform_SGE(struct radeon_compiler* c,
500 struct rc_instruction* inst)
502 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
504 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
505 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
506 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
508 rc_remove_instruction(inst);
/**
 * Replace SGT (set on greater-than) with:
 *   ADD tmp, -src0, src1
 *   CMP dst, tmp, 1, 0
 * so dst is 1 when src1 - src0 is negative and 0 otherwise.
 */
511 static void transform_SGT(struct radeon_compiler* c,
512 struct rc_instruction* inst)
514 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
516 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
517 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
518 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
520 rc_remove_instruction(inst);
/**
 * Replace SLE (set on less-or-equal) with:
 *   ADD tmp, -src0, src1
 *   CMP dst, tmp, 0, 1
 * so dst is 0 when src1 - src0 is negative and 1 otherwise.
 */
523 static void transform_SLE(struct radeon_compiler* c,
524 struct rc_instruction* inst)
526 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
528 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
529 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
530 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
532 rc_remove_instruction(inst);
/**
 * Replace SLT (set on less-than) with:
 *   ADD tmp, src0, -src1
 *   CMP dst, tmp, 1, 0
 * so dst is 1 when src0 - src1 is negative and 0 otherwise.
 */
535 static void transform_SLT(struct radeon_compiler* c,
536 struct rc_instruction* inst)
538 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
540 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
541 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
542 srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
544 rc_remove_instruction(inst);
/**
 * Replace SNE (set on not-equal) with:
 *   ADD tmp, src0, -src1
 *   CMP dst, -|tmp|, 1, 0
 * so dst is 1 when the operands differ and 0 otherwise.
 */
547 static void transform_SNE(struct radeon_compiler* c,
548 struct rc_instruction* inst)
550 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
552 emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
553 emit3(c, inst->Prev, RC_OPCODE_CMP, &inst->U.I, inst->U.I.DstReg,
554 negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
556 rc_remove_instruction(inst);
/**
 * Replace SSG (sign) with two CMP instructions and an ADD:
 * the first CMP writes the register from try_to_reuse_dst(), the second
 * writes a fresh temporary, and the final ADD computes tmp0 - tmp1.
 * NOTE(review): most CMP operands and the ADD destination are on lines not
 * visible in this excerpt -- confirm the exact operands against the full
 * file.
 */
559 static void transform_SSG(struct radeon_compiler* c,
560 struct rc_instruction* inst)
566 * ADD result, tmp0, -tmp1;
568 struct rc_dst_register dst0;
572 dst0 = try_to_reuse_dst(c, inst);
573 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
575 negate(inst->U.I.SrcReg[0]),
580 tmp1 = rc_find_free_temporary(c);
581 emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
582 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
587 /* Either both are zero, or one of them is one and the other is zero. */
588 /* result = tmp0 - tmp1 */
589 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
591 srcreg(RC_FILE_TEMPORARY, dst0.Index),
592 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
594 rc_remove_instruction(inst);
/**
 * Rewrite SUB in place as ADD with the second source negated
 * (a - b = a + (-b)). No new instruction is inserted.
 */
597 static void transform_SUB(struct radeon_compiler* c,
598 struct rc_instruction* inst)
600 inst->U.I.Opcode = RC_OPCODE_ADD;
601 inst->U.I.SrcReg[1] = negate(inst->U.I.SrcReg[1]);
/** Rewrite SWZ in place as MOV; operands are left untouched. */
604 static void transform_SWZ(struct radeon_compiler* c,
605 struct rc_instruction* inst)
607 inst->U.I.Opcode = RC_OPCODE_MOV;
/**
 * Replace XPD (cross product) with a MUL and a MAD on rotated operands:
 *   MUL tmp, src0.zxyw, src1.yzxw
 *   MAD dst, src0.yzxw, src1.zxyw, -tmp
 * where tmp comes from try_to_reuse_dst().
 */
610 static void transform_XPD(struct radeon_compiler* c,
611 struct rc_instruction* inst)
613 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
615 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
616 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
617 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
618 emit3(c, inst->Prev, RC_OPCODE_MAD, &inst->U.I, inst->U.I.DstReg,
619 swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
620 swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
621 negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
623 rc_remove_instruction(inst);
628 * Can be used as a transformation for @ref radeonClauseLocalTransform,
629 * no userData necessary.
631 * Eliminates the following ALU instructions:
632 * ABS, CEIL, CLAMP, DP2, DPH, DST, FLR, LIT, LRP, POW, ROUND, SEQ, SFL, SGE, SGT, SLE, SLT, SNE, SSG, SUB, SWZ, XPD
634 * MOV, ADD, MUL, MAD, FRC, DP3, DP4, MIN, MAX, LG2, EX2, CMP
636 * Transforms RSQ to Radeon's native RSQ by explicitly setting
639 * @note should be applicable to R300 and R500 fragment programs.
/*
 * Dispatch on the instruction's opcode to the matching transform_* helper.
 * Every case visible here returns 1 after calling its helper.
 * NOTE(review): the third parameter, the default case and its return value
 * are on lines not visible in this excerpt -- presumably unhandled opcodes
 * return 0; confirm.
 */
641 int radeonTransformALU(
642 struct radeon_compiler * c,
643 struct rc_instruction* inst,
646 switch(inst->U.I.Opcode) {
647 case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
648 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
649 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
650 case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
651 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
652 case RC_OPCODE_DST: transform_DST(c, inst); return 1;
653 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
654 case RC_OPCODE_LIT: transform_LIT(c, inst); return 1;
655 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
656 case RC_OPCODE_POW: transform_POW(c, inst); return 1;
657 case RC_OPCODE_ROUND: transform_ROUND(c, inst); return 1;
658 case RC_OPCODE_RSQ: transform_RSQ(c, inst); return 1;
659 case RC_OPCODE_SEQ: transform_SEQ(c, inst); return 1;
660 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
661 case RC_OPCODE_SGE: transform_SGE(c, inst); return 1;
662 case RC_OPCODE_SGT: transform_SGT(c, inst); return 1;
663 case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
664 case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
665 case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
666 case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
667 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
668 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
669 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
/**
 * Rewrite ABS in place as MAX(src0, -src0): the opcode becomes MAX and
 * source 1 is a copy of source 0 with all negate bits toggled.
 */
676 static void transform_r300_vertex_ABS(struct radeon_compiler* c,
677 struct rc_instruction* inst)
679 /* Note: r500 can take absolute values, but r300 cannot. */
680 inst->U.I.Opcode = RC_OPCODE_MAX;
681 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[0];
682 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
/**
 * Emulate CMP for the vertex engine with an SLT followed by an LRP, both
 * emitted before @p inst, which is then removed.
 * NOTE(review): the destination operands of both emits are on lines not
 * visible here. The extra closing parenthesis after the LRP emit suggests
 * it is wrapped in a call (presumably transform_LRP, as the comment below
 * mentions) on a line not visible in this excerpt -- confirm.
 */
685 static void transform_r300_vertex_CMP(struct radeon_compiler* c,
686 struct rc_instruction* inst)
688 /* There is no decent CMP available, so let's rig one up.
689 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
690 * The following sequence consumes zero to two temps and two extra slots
691 * (the second temp and the second slot is consumed by transform_LRP),
692 * but should be equivalent:
694 * SLT tmp0, src0, 0.0
695 * LRP dst, tmp0, src1, src2
697 * Yes, I know, I'm a mad scientist. ~ C. & M. */
698 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
700 /* SLT tmp0, src0, 0.0 */
701 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
703 inst->U.I.SrcReg[0], builtin_zero);
705 /* LRP dst, tmp0, src1, src2 */
707 emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
709 srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1], inst->U.I.SrcReg[2]));
711 rc_remove_instruction(inst);
/**
 * Lower DP2 for the vertex engine: run the generic transform_DP2(), which
 * inserts a DP3 before @p inst and removes @p inst, then retarget that
 * replacement to DP4. The successor is saved first because @p inst is gone
 * after the call; the replacement is reached as the successor's Prev.
 */
714 static void transform_r300_vertex_DP2(struct radeon_compiler* c,
715 struct rc_instruction* inst)
717 struct rc_instruction *next_inst = inst->Next;
718 transform_DP2(c, inst);
719 next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
/**
 * Replace DP3 with a DP4 whose W inputs are forced to zero: on both
 * operands the W selector is replaced by RC_SWIZZLE_ZERO and the W negate
 * bit is cleared.
 */
722 static void transform_r300_vertex_DP3(struct radeon_compiler* c,
723 struct rc_instruction* inst)
725 struct rc_src_register src0 = inst->U.I.SrcReg[0];
726 struct rc_src_register src1 = inst->U.I.SrcReg[1];
727 src0.Negate &= ~RC_MASK_W;
728 src0.Swizzle &= ~(7 << (3 * 3));
729 src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
730 src1.Negate &= ~RC_MASK_W;
731 src1.Swizzle &= ~(7 << (3 * 3));
732 src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
733 emit2(c, inst->Prev, RC_OPCODE_DP4, &inst->U.I, inst->U.I.DstReg, src0, src1);
734 rc_remove_instruction(inst);
/**
 * Pre-condition the operand of a vertex LIT instead of replacing it.
 *
 * The source is copied to a temporary with a MOV, the temporary's Y channel
 * is raised to at least a tiny positive immediate with a MAX, and the LIT
 * is then redirected to read the temporary. No rc_remove_instruction() call
 * is visible, so the LIT itself stays in the program.
 * NOTE(review): the MOV's destination operand and the last argument of
 * rc_constants_add_immediate_scalar() are on lines not visible here.
 */
737 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
738 struct rc_instruction* inst)
740 struct rc_dst_register dst = try_to_reuse_dst(c, inst);
741 unsigned constant_swizzle;
742 int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
743 0.0000000000000000001,
747 dst.WriteMask = RC_MASK_XYZW;
748 emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
750 inst->U.I.SrcReg[0]);
752 /* MAX dst.y, src, 0.00...001 */
753 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
754 dstregtmpmask(dst.Index, RC_MASK_Y),
755 srcreg(RC_FILE_TEMPORARY, dst.Index),
756 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
758 inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
/**
 * Replace SEQ for the vertex engine with two SGE instructions and a MUL:
 * the first SGE writes a fresh temporary, and the MUL multiplies that
 * temporary by the register named by the original destination.
 * NOTE(review): several operands (first source of each SGE, destination of
 * the second SGE and of the MUL) are on lines not visible here; presumably
 * the second SGE writes the original destination -- confirm.
 */
761 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
762 struct rc_instruction *inst)
764 /* x = y <==> x >= y && y >= x */
765 int tmp = rc_find_free_temporary(c);
768 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
769 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
771 inst->U.I.SrcReg[1]);
774 emit2(c, inst->Prev, RC_OPCODE_SGE, 0,
777 inst->U.I.SrcReg[0]);
780 emit2(c, inst->Prev, RC_OPCODE_MUL, 0,
782 srcreg(RC_FILE_TEMPORARY, tmp),
783 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
785 rc_remove_instruction(inst);
/**
 * Replace SNE for the vertex engine with two SLT instructions and a MAX:
 * the first SLT writes a fresh temporary, and the MAX combines that
 * temporary with the register named by the original destination.
 * NOTE(review): several operands (first source of each SLT, destination of
 * the second SLT and of the MAX) are on lines not visible here; presumably
 * the second SLT writes the original destination -- confirm.
 */
788 static void transform_r300_vertex_SNE(struct radeon_compiler *c,
789 struct rc_instruction *inst)
791 /* x != y <==> x < y || y < x */
792 int tmp = rc_find_free_temporary(c);
795 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
796 dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask),
798 inst->U.I.SrcReg[1]);
801 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
804 inst->U.I.SrcReg[0]);
806 /* x || y = max(x, y) */
807 emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
809 srcreg(RC_FILE_TEMPORARY, tmp),
810 srcreg(inst->U.I.DstReg.File, inst->U.I.DstReg.Index));
812 rc_remove_instruction(inst);
/**
 * Rewrite SGT in place as SLT with both sources negated. No new
 * instruction is inserted.
 */
815 static void transform_r300_vertex_SGT(struct radeon_compiler* c,
816 struct rc_instruction* inst)
818 /* x > y <==> -x < -y */
819 inst->U.I.Opcode = RC_OPCODE_SLT;
820 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
821 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
/**
 * Rewrite SLE in place as SGE with both sources negated. No new
 * instruction is inserted.
 */
824 static void transform_r300_vertex_SLE(struct radeon_compiler* c,
825 struct rc_instruction* inst)
827 /* x <= y <==> -x >= -y */
828 inst->U.I.Opcode = RC_OPCODE_SGE;
829 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
830 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
/**
 * Replace SSG (sign) for the vertex engine with two SLT instructions and an
 * ADD that computes tmp0 - tmp1. The first SLT uses the register from
 * try_to_reuse_dst(); the second writes a fresh temporary.
 * NOTE(review): dst0 is initialised from try_to_reuse_dst() in its
 * declaration and then assigned from the same call again; the second call
 * looks redundant. Most SLT operands and the ADD destination are on lines
 * not visible in this excerpt -- confirm against the full file.
 */
833 static void transform_r300_vertex_SSG(struct radeon_compiler* c,
834 struct rc_instruction* inst)
840 * ADD result, tmp0, -tmp1;
842 struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
846 dst0 = try_to_reuse_dst(c, inst);
847 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
850 inst->U.I.SrcReg[0]);
853 tmp1 = rc_find_free_temporary(c);
854 emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
855 dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
859 /* Either both are zero, or one of them is one and the other is zero. */
860 /* result = tmp0 - tmp1 */
861 emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
863 srcreg(RC_FILE_TEMPORARY, dst0.Index),
864 negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
866 rc_remove_instruction(inst);
870 * For use with rc_local_transform, this transforms non-native ALU
871 * instructions of the r300 up to r500 vertex engine.
/*
 * Dispatch on the instruction's opcode to the vertex-engine transform for
 * it, falling back to the shared transform_* helpers where no
 * vertex-specific one exists. Every single-line case visible here returns 1.
 * NOTE(review): the third parameter, the case labels and surrounding
 * statements for the SEQ and SNE calls, and the default case are on lines
 * not visible in this excerpt -- confirm.
 */
873 int r300_transform_vertex_alu(
874 struct radeon_compiler * c,
875 struct rc_instruction* inst,
878 switch(inst->U.I.Opcode) {
879 case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
880 case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
881 case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
882 case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
883 case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
884 case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
885 case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
886 case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
887 case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
888 case RC_OPCODE_LRP: transform_LRP(c, inst); return 1;
891 transform_r300_vertex_SEQ(c, inst);
895 case RC_OPCODE_SFL: transform_SFL(c, inst); return 1;
896 case RC_OPCODE_SGT: transform_r300_vertex_SGT(c, inst); return 1;
897 case RC_OPCODE_SLE: transform_r300_vertex_SLE(c, inst); return 1;
900 transform_r300_vertex_SNE(c, inst);
904 case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
905 case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
906 case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
907 case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
/**
 * Register the two vec4 immediates used by the software sin/cos
 * approximation and store their constant indices in constants[0] and
 * constants[1]. @p constants must have room for two entries.
 * NOTE(review): some table entries are on lines not visible in this
 * excerpt, so the full channel layout cannot be confirmed from here.
 */
913 static void sincos_constants(struct radeon_compiler* c, unsigned int *constants)
915 static const float SinCosConsts[2][4] = {
917 1.273239545, /* 4/PI */
918 -0.405284735, /* -4/(PI*PI) */
919 3.141592654, /* PI */
925 0.159154943, /* 1/(2*PI) */
926 6.283185307 /* 2*PI */
931 for(i = 0; i < 2; ++i)
932 constants[i] = rc_constants_add_immediate_vec4(&c->Program.Constants, SinCosConsts[i]);
936 * Approximate sin(x), where x is clamped to (-pi/2, pi/2).
938 * MUL tmp.xy, src, { 4/PI, -4/(PI^2) }
939 * MAD tmp.x, tmp.y, |src|, tmp.x
940 * MAD tmp.y, tmp.x, |tmp.x|, -tmp.x
941 * MAD dest, tmp.y, weight, tmp.x
/*
 * Emit a MUL followed by three MADs before @p inst that approximate the
 * sine of the X channel of @p src and write the result to @p dst.
 * A fresh temporary's X and Y channels hold the intermediate values, and
 * the final MAD blends them using the W channel of constants[0].
 * NOTE(review): the first source of the MUL is on a line not visible in
 * this excerpt -- presumably a swizzle of @p src; confirm.
 */
943 static void sin_approx(
944 struct radeon_compiler* c, struct rc_instruction * inst,
945 struct rc_dst_register dst, struct rc_src_register src, const unsigned int* constants)
947 unsigned int tempreg = rc_find_free_temporary(c);
949 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(tempreg, RC_MASK_XY),
951 srcreg(RC_FILE_CONSTANT, constants[0]));
952 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_X),
953 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
954 absolute(swizzle_xxxx(src)),
955 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
956 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_Y),
957 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
958 absolute(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))),
959 negate(swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg))));
960 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dst,
961 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
962 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[0])),
963 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)));
967 * Translate the trigonometric functions COS, SIN, and SCS
968 * using only the basic instructions
969 * MOV, ADD, MUL, MAD, FRC
/*
 * Lower COS, SIN and SCS to a range reduction (MAD, FRC, MAD) followed by
 * sin_approx(). Instructions with any other opcode take an early exit.
 *
 * COS and SIN reduce into the W channel of a temporary and differ only in
 * the bias operand of the first MAD (X vs. Y channel of constants[1]).
 * The remaining branch (SCS) reduces both phases at once into the X and Y
 * channels, then calls sin_approx() once per output channel with the
 * destination write mask restricted to X and Y respectively.
 * The original instruction is removed afterwards.
 * NOTE(review): the third parameter, the last argument of each sin_approx()
 * call and all return statements are on lines not visible in this excerpt.
 */
971 int r300_transform_trig_simple(struct radeon_compiler* c,
972 struct rc_instruction* inst,
975 unsigned int constants[2];
976 unsigned int tempreg;
978 if (inst->U.I.Opcode != RC_OPCODE_COS &&
979 inst->U.I.Opcode != RC_OPCODE_SIN &&
980 inst->U.I.Opcode != RC_OPCODE_SCS)
983 tempreg = rc_find_free_temporary(c);
985 sincos_constants(c, constants);
987 if (inst->U.I.Opcode == RC_OPCODE_COS) {
988 /* MAD tmp.w, src.x, 1/(2*PI), 0.75 */
989 /* FRC tmp.w, tmp.w */
990 /* MAD tmp.w, tmp.w, 2*PI, -PI */
991 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
992 swizzle_xxxx(inst->U.I.SrcReg[0]),
993 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
994 swizzle_xxxx(srcreg(RC_FILE_CONSTANT, constants[1])));
995 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
996 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
997 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
998 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
999 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1000 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1002 sin_approx(c, inst, inst->U.I.DstReg,
1003 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1005 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1006 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1007 swizzle_xxxx(inst->U.I.SrcReg[0]),
1008 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1009 swizzle_yyyy(srcreg(RC_FILE_CONSTANT, constants[1])));
1010 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_W),
1011 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)));
1012 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_W),
1013 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1014 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1015 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1017 sin_approx(c, inst, inst->U.I.DstReg,
1018 swizzle_wwww(srcreg(RC_FILE_TEMPORARY, tempreg)),
1021 struct rc_dst_register dst;
1023 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1024 swizzle_xxxx(inst->U.I.SrcReg[0]),
1025 swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[1])),
1026 swizzle(srcreg(RC_FILE_CONSTANT, constants[1]), RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_W));
1027 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1028 srcreg(RC_FILE_TEMPORARY, tempreg));
1029 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(tempreg, RC_MASK_XY),
1030 srcreg(RC_FILE_TEMPORARY, tempreg),
1031 swizzle_wwww(srcreg(RC_FILE_CONSTANT, constants[1])),
1032 negate(swizzle_zzzz(srcreg(RC_FILE_CONSTANT, constants[0]))));
1034 dst = inst->U.I.DstReg;
1036 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_X;
1037 sin_approx(c, inst, dst,
1038 swizzle_xxxx(srcreg(RC_FILE_TEMPORARY, tempreg)),
1041 dst.WriteMask = inst->U.I.DstReg.WriteMask & RC_MASK_Y;
1042 sin_approx(c, inst, dst,
1043 swizzle_yyyy(srcreg(RC_FILE_TEMPORARY, tempreg)),
1047 rc_remove_instruction(inst);
/*
 * Re-emit a COS, SIN or SCS instruction so that it reads its argument from
 * the W channel of temporary register srctmp, then remove the original.
 *
 * COS and SIN are re-emitted with the same opcode and destination. SCS is
 * split into a COS writing only X and a SIN writing only Y, each emitted
 * only if the original write mask includes that channel.
 * The srctmp parameter is declared on a line not visible in this excerpt.
 */
1052 static void r300_transform_SIN_COS_SCS(struct radeon_compiler *c,
1053 struct rc_instruction *inst,
1056 if (inst->U.I.Opcode == RC_OPCODE_COS) {
1057 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, inst->U.I.DstReg,
1058 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1059 } else if (inst->U.I.Opcode == RC_OPCODE_SIN) {
1060 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I,
1061 inst->U.I.DstReg, srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1062 } else if (inst->U.I.Opcode == RC_OPCODE_SCS) {
1063 struct rc_dst_register moddst = inst->U.I.DstReg;
1065 if (inst->U.I.DstReg.WriteMask & RC_MASK_X) {
1066 moddst.WriteMask = RC_MASK_X;
1067 emit1(c, inst->Prev, RC_OPCODE_COS, &inst->U.I, moddst,
1068 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1070 if (inst->U.I.DstReg.WriteMask & RC_MASK_Y) {
1071 moddst.WriteMask = RC_MASK_Y;
1072 emit1(c, inst->Prev, RC_OPCODE_SIN, &inst->U.I, moddst,
1073 srcregswz(RC_FILE_TEMPORARY, srctmp, RC_SWIZZLE_WWWW));
1077 rc_remove_instruction(inst);
1082 * Transform the trigonometric functions COS, SIN, and SCS
1083 * to include pre-scaling by 1/(2*PI) and taking the fractional
1084 * part, so that the input to COS and SIN is always in the range [0,1).
1085 * SCS is replaced by one COS and one SIN instruction.
1087 * @warning This transformation implicitly changes the semantics of SIN and COS!
/**
 * Rewrite a COS, SIN or SCS instruction so that its operand is first
 * multiplied by RCP_2PI and then reduced to its fractional part.
 *
 * Two instructions are emitted with insertion point inst->Prev (the
 * expression is re-evaluated for every emit call), both writing temp.w:
 *   MUL temp.w, src0.xxxx, RCP_2PI
 *   FRC temp.w, temp
 * r300_transform_SIN_COS_SCS() is then called with `temp`; it emits the
 * COS/SIN instruction(s) that read temp.w and removes `inst`.
 *
 * NOTE(review): the return statements, the declaration of `temp` and the
 * last parameter are not among the lines visible here. Presumably the
 * function returns 0 when the opcode is not handled and nonzero after a
 * rewrite -- confirm against the full file.
 */
1089 int radeonTransformTrigScale(struct radeon_compiler* c,
1090 struct rc_instruction* inst,
/* 1 / (2 * PI) */
1093 static const float RCP_2PI = 0.15915494309189535;
1095 unsigned int constant;
1096 unsigned int constant_swizzle;
/* Only COS, SIN and SCS are handled by this transformation. */
1098 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1099 inst->U.I.Opcode != RC_OPCODE_SIN &&
1100 inst->U.I.Opcode != RC_OPCODE_SCS)
/* Scratch temporary for the scaled operand, plus an immediate holding
 * RCP_2PI; constant_swizzle is filled in by the call and is used below to
 * select that immediate from the constant register. */
1103 temp = rc_find_free_temporary(c);
1104 constant = rc_constants_add_immediate_scalar(&c->Program.Constants, RCP_2PI, &constant_swizzle);
/* temp.w = src0.x * RCP_2PI */
1106 emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstregtmpmask(temp, RC_MASK_W),
1107 swizzle_xxxx(inst->U.I.SrcReg[0]),
1108 srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
/* temp.w = FRC(temp) */
1109 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1110 srcreg(RC_FILE_TEMPORARY, temp));
/* Emit the final COS/SIN instruction(s) reading temp.w. */
1112 r300_transform_SIN_COS_SCS(c, inst, temp);
1117 * Transform the trigonometric functions COS, SIN, and SCS
1118 * so that the input to COS and SIN is always in the range [-PI, PI].
1119 * SCS is replaced by one COS and one SIN instruction.
/**
 * Rewrite a COS, SIN or SCS instruction so that its operand is
 * range-reduced before the COS/SIN is evaluated.
 *
 * With the vec4 immediate `cons` added to the program constants, three
 * instructions are emitted with insertion point inst->Prev (the expression
 * is re-evaluated for every emit call), all writing temp.w:
 *   MAD temp.w, src0.xxxx, cons.xxxx, cons.yyyy
 *   FRC temp.w, temp
 *   MAD temp.w, temp, cons.zzzz, cons.wwww
 * r300_transform_SIN_COS_SCS() is then called with `temp`; it emits the
 * COS/SIN instruction(s) that read temp.w and removes `inst`.
 *
 * NOTE(review): the return statements, the declaration of `temp` and the
 * last parameter are not among the lines visible here. Presumably the
 * function returns 0 when the opcode is not handled and nonzero after a
 * rewrite -- confirm against the full file.
 */
1121 int r300_transform_trig_scale_vertex(struct radeon_compiler *c,
1122 struct rc_instruction *inst,
/* {1/(2 PI), 0.5, 2 PI, -PI}: scale and bias for the first MAD, then
 * scale and offset for the second MAD. */
1125 static const float cons[4] = {0.15915494309189535, 0.5, 6.28318530717959, -3.14159265358979};
1127 unsigned int constant;
/* Only COS, SIN and SCS are handled by this transformation. */
1129 if (inst->U.I.Opcode != RC_OPCODE_COS &&
1130 inst->U.I.Opcode != RC_OPCODE_SIN &&
1131 inst->U.I.Opcode != RC_OPCODE_SCS)
1134 /* Repeat x in the range [-PI, PI]:
1136 * repeat(x) = frac(x / 2PI + 0.5) * 2PI - PI
1139 temp = rc_find_free_temporary(c);
1140 constant = rc_constants_add_immediate_vec4(&c->Program.Constants, cons);
1142 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1143 swizzle_xxxx(inst->U.I.SrcReg[0]),
1144 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_XXXX),
1145 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_YYYY));
1146 emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstregtmpmask(temp, RC_MASK_W),
1147 srcreg(RC_FILE_TEMPORARY, temp));
1148 emit3(c, inst->Prev, RC_OPCODE_MAD, 0, dstregtmpmask(temp, RC_MASK_W),
1149 srcreg(RC_FILE_TEMPORARY, temp),
1150 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_ZZZZ),
1151 srcregswz(RC_FILE_CONSTANT, constant, RC_SWIZZLE_WWWW));
1153 r300_transform_SIN_COS_SCS(c, inst, temp);
1158 * Rewrite DDX/DDY instructions to properly work with r5xx shaders.
1159 * The r5xx MDH/MDV instruction provides per-quad partial derivatives.
1160 * It takes the form A*B+C. A and C are set by setting src0. B should be -1.
1162 * @warning This explicitly changes the form of DDX and DDY!
/**
 * Adjust a DDX or DDY instruction in place by forcing its second source
 * operand (SrcReg[1]) to -1 in every channel. Only the Swizzle and Negate
 * fields of SrcReg[1] are written; the instruction is not replaced.
 *
 * NOTE(review): the return statements and the last parameter are not among
 * the lines visible here. Presumably the function returns 0 when the opcode
 * is neither DDX nor DDY and nonzero after the adjustment -- confirm against
 * the full file.
 */
1165 int radeonTransformDeriv(struct radeon_compiler* c,
1166 struct rc_instruction* inst,
1169 if (inst->U.I.Opcode != RC_OPCODE_DDX && inst->U.I.Opcode != RC_OPCODE_DDY)
/* src1 = -1: select the constant 1 for all four components, then negate
 * all four components. */
1172 inst->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_1111;
1173 inst->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
1179 * IF Temp[0].x -> IF Temp[0].x
1181 * KILP -> KIL -abs(Temp[0].x)
1188 * KILP - > KIL -abs(Temp[0].x)
1193 * IF Temp[0].x -> IF Temp[0].x
1197 * KILP -> KIL -abs(Temp[0].x)
1203 * KILP -> KIL -none.1111
1205 * This needs to be done in its own pass, because it might modify the
1206 * instructions before and after KILP.
1208 void rc_transform_KILP(struct radeon_compiler * c, void *user)
1210 struct rc_instruction * inst;
1211 for (inst = c->Program.Instructions.Next;
1212 inst != &c->Program.Instructions; inst = inst->Next) {
1213 struct rc_instruction * if_inst;
1216 if (inst->U.I.Opcode != RC_OPCODE_KILP)
1219 for (if_inst = inst->Prev; if_inst != &c->Program.Instructions;
1220 if_inst = if_inst->Prev) {
1222 if (if_inst->U.I.Opcode == RC_OPCODE_IF) {
1228 inst->U.I.Opcode = RC_OPCODE_KIL;
1231 inst->U.I.SrcReg[0] = negate(builtin_one);
1233 /* This should work even if the KILP is inside the ELSE
1234 * block, because -0.0 is considered negative. */
1235 inst->U.I.SrcReg[0] =
1236 negate(absolute(if_inst->U.I.SrcReg[0]));
1238 if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
1239 && inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
1241 /* Optimize the special case:
1248 rc_remove_instruction(inst->Prev);
1250 rc_remove_instruction(inst->Next);