Tizen 2.1 base
[sdk/emulator/qemu.git] / gl / mesa / src / gallium / drivers / r300 / compiler / radeon_optimize.c
1 /*
2  * Copyright (C) 2009 Nicolai Haehnle.
3  * Copyright 2010 Tom Stellard <tstellar@gmail.com>
4  *
5  * All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining
8  * a copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sublicense, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial
17  * portions of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22  * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  */
28
29 #include "radeon_dataflow.h"
30
31 #include "radeon_compiler.h"
32 #include "radeon_compiler_util.h"
33 #include "radeon_list.h"
34 #include "radeon_swizzle.h"
35 #include "radeon_variable.h"
36
37 struct src_clobbered_reads_cb_data {
38         rc_register_file File;
39         unsigned int Index;
40         unsigned int Mask;
41         struct rc_reader_data * ReaderData;
42 };
43
/**
 * Signature of the routines that rewrite one source operand of a reader so
 * that it consumes a presubtract result. presub_helper() invokes it with, in
 * order: the ADD being folded, the reading instruction, and the index of the
 * reader's source operand to rewrite.
 */
typedef void (*rc_presub_replace_fn)(
	struct rc_instruction * inst_add,
	struct rc_instruction * inst_reader,
	unsigned int src_index);
47
48 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
49 {
50         struct rc_src_register combine;
51         combine.File = inner.File;
52         combine.Index = inner.Index;
53         combine.RelAddr = inner.RelAddr;
54         if (outer.Abs) {
55                 combine.Abs = 1;
56                 combine.Negate = outer.Negate;
57         } else {
58                 combine.Abs = inner.Abs;
59                 combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
60                 combine.Negate ^= outer.Negate;
61         }
62         combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
63         return combine;
64 }
65
66 static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
67                                                 struct rc_src_register * src)
68 {
69         rc_register_file file = src->File;
70         struct rc_reader_data * reader_data = data;
71
72         if(!rc_inst_can_use_presub(inst,
73                                 reader_data->Writer->U.I.PreSub.Opcode,
74                                 rc_swizzle_to_writemask(src->Swizzle),
75                                 src,
76                                 &reader_data->Writer->U.I.PreSub.SrcReg[0],
77                                 &reader_data->Writer->U.I.PreSub.SrcReg[1])) {
78                 reader_data->Abort = 1;
79                 return;
80         }
81
82         /* XXX This could probably be handled better. */
83         if (file == RC_FILE_ADDRESS) {
84                 reader_data->Abort = 1;
85                 return;
86         }
87
88         /* These instructions cannot read from the constants file.
89          * see radeonTransformTEX()
90          */
91         if(reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
92                         reader_data->Writer->U.I.SrcReg[0].File != RC_FILE_INPUT &&
93                                 (inst->U.I.Opcode == RC_OPCODE_TEX ||
94                                 inst->U.I.Opcode == RC_OPCODE_TXB ||
95                                 inst->U.I.Opcode == RC_OPCODE_TXP ||
96                                 inst->U.I.Opcode == RC_OPCODE_TXD ||
97                                 inst->U.I.Opcode == RC_OPCODE_TXL ||
98                                 inst->U.I.Opcode == RC_OPCODE_KIL)){
99                 reader_data->Abort = 1;
100                 return;
101         }
102 }
103
104 static void src_clobbered_reads_cb(
105         void * data,
106         struct rc_instruction * inst,
107         struct rc_src_register * src)
108 {
109         struct src_clobbered_reads_cb_data * sc_data = data;
110
111         if (src->File == sc_data->File
112             && src->Index == sc_data->Index
113             && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
114
115                 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
116         }
117
118         if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
119                 sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
120         }
121 }
122
123 static void is_src_clobbered_scan_write(
124         void * data,
125         struct rc_instruction * inst,
126         rc_register_file file,
127         unsigned int index,
128         unsigned int mask)
129 {
130         struct src_clobbered_reads_cb_data sc_data;
131         struct rc_reader_data * reader_data = data;
132         sc_data.File = file;
133         sc_data.Index = index;
134         sc_data.Mask = mask;
135         sc_data.ReaderData = reader_data;
136         rc_for_all_reads_src(reader_data->Writer,
137                                         src_clobbered_reads_cb, &sc_data);
138 }
139
140 static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * inst_mov)
141 {
142         struct rc_reader_data reader_data;
143         unsigned int i;
144
145         if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
146             inst_mov->U.I.WriteALUResult ||
147             inst_mov->U.I.SaturateMode)
148                 return;
149
150         /* Get a list of all the readers of this MOV instruction. */
151         reader_data.ExitOnAbort = 1;
152         rc_get_readers(c, inst_mov, &reader_data,
153                        copy_propagate_scan_read, NULL,
154                        is_src_clobbered_scan_write);
155
156         if (reader_data.Abort || reader_data.ReaderCount == 0)
157                 return;
158
159         /* Propagate the MOV instruction. */
160         for (i = 0; i < reader_data.ReaderCount; i++) {
161                 struct rc_instruction * inst = reader_data.Readers[i].Inst;
162                 *reader_data.Readers[i].U.I.Src = chain_srcregs(*reader_data.Readers[i].U.I.Src, inst_mov->U.I.SrcReg[0]);
163
164                 if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
165                         inst->U.I.PreSub = inst_mov->U.I.PreSub;
166         }
167
168         /* Finally, remove the original MOV instruction */
169         rc_remove_instruction(inst_mov);
170 }
171
172 /**
173  * Check if a source register is actually always the same
174  * swizzle constant.
175  */
176 static int is_src_uniform_constant(struct rc_src_register src,
177                 rc_swizzle * pswz, unsigned int * pnegate)
178 {
179         int have_used = 0;
180
181         if (src.File != RC_FILE_NONE) {
182                 *pswz = 0;
183                 return 0;
184         }
185
186         for(unsigned int chan = 0; chan < 4; ++chan) {
187                 unsigned int swz = GET_SWZ(src.Swizzle, chan);
188                 if (swz < 4) {
189                         *pswz = 0;
190                         return 0;
191                 }
192                 if (swz == RC_SWIZZLE_UNUSED)
193                         continue;
194
195                 if (!have_used) {
196                         *pswz = swz;
197                         *pnegate = GET_BIT(src.Negate, chan);
198                         have_used = 1;
199                 } else {
200                         if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
201                                 *pswz = 0;
202                                 return 0;
203                         }
204                 }
205         }
206
207         return 1;
208 }
209
210 static void constant_folding_mad(struct rc_instruction * inst)
211 {
212         rc_swizzle swz = 0;
213         unsigned int negate= 0;
214
215         if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
216                 if (swz == RC_SWIZZLE_ZERO) {
217                         inst->U.I.Opcode = RC_OPCODE_MUL;
218                         return;
219                 }
220         }
221
222         if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
223                 if (swz == RC_SWIZZLE_ONE) {
224                         inst->U.I.Opcode = RC_OPCODE_ADD;
225                         if (negate)
226                                 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
227                         inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
228                         return;
229                 } else if (swz == RC_SWIZZLE_ZERO) {
230                         inst->U.I.Opcode = RC_OPCODE_MOV;
231                         inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
232                         return;
233                 }
234         }
235
236         if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
237                 if (swz == RC_SWIZZLE_ONE) {
238                         inst->U.I.Opcode = RC_OPCODE_ADD;
239                         if (negate)
240                                 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
241                         inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
242                         return;
243                 } else if (swz == RC_SWIZZLE_ZERO) {
244                         inst->U.I.Opcode = RC_OPCODE_MOV;
245                         inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
246                         return;
247                 }
248         }
249 }
250
251 static void constant_folding_mul(struct rc_instruction * inst)
252 {
253         rc_swizzle swz = 0;
254         unsigned int negate = 0;
255
256         if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
257                 if (swz == RC_SWIZZLE_ONE) {
258                         inst->U.I.Opcode = RC_OPCODE_MOV;
259                         inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
260                         if (negate)
261                                 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
262                         return;
263                 } else if (swz == RC_SWIZZLE_ZERO) {
264                         inst->U.I.Opcode = RC_OPCODE_MOV;
265                         inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
266                         return;
267                 }
268         }
269
270         if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
271                 if (swz == RC_SWIZZLE_ONE) {
272                         inst->U.I.Opcode = RC_OPCODE_MOV;
273                         if (negate)
274                                 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
275                         return;
276                 } else if (swz == RC_SWIZZLE_ZERO) {
277                         inst->U.I.Opcode = RC_OPCODE_MOV;
278                         inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
279                         return;
280                 }
281         }
282 }
283
284 static void constant_folding_add(struct rc_instruction * inst)
285 {
286         rc_swizzle swz = 0;
287         unsigned int negate = 0;
288
289         if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
290                 if (swz == RC_SWIZZLE_ZERO) {
291                         inst->U.I.Opcode = RC_OPCODE_MOV;
292                         inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
293                         return;
294                 }
295         }
296
297         if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
298                 if (swz == RC_SWIZZLE_ZERO) {
299                         inst->U.I.Opcode = RC_OPCODE_MOV;
300                         return;
301                 }
302         }
303 }
304
305 /**
306  * Replace 0.0, 1.0 and 0.5 immediate constants by their
307  * respective swizzles. Simplify instructions like ADD dst, src, 0;
308  */
309 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
310 {
311         const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
312         unsigned int i;
313
314         /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
315         for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
316                 struct rc_constant * constant;
317                 struct rc_src_register newsrc;
318                 int have_real_reference;
319                 unsigned int chan;
320
321                 /* If there are only 0, 0.5, 1, or _ swizzles, mark the source as a constant. */
322                 for (chan = 0; chan < 4; ++chan)
323                         if (GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan) <= 3)
324                                 break;
325                 if (chan == 4) {
326                         inst->U.I.SrcReg[src].File = RC_FILE_NONE;
327                         continue;
328                 }
329
330                 /* Convert immediates to swizzles. */
331                 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
332                     inst->U.I.SrcReg[src].RelAddr ||
333                     inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
334                         continue;
335
336                 constant =
337                         &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
338
339                 if (constant->Type != RC_CONSTANT_IMMEDIATE)
340                         continue;
341
342                 newsrc = inst->U.I.SrcReg[src];
343                 have_real_reference = 0;
344                 for (chan = 0; chan < 4; ++chan) {
345                         unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
346                         unsigned int newswz;
347                         float imm;
348                         float baseimm;
349
350                         if (swz >= 4)
351                                 continue;
352
353                         imm = constant->u.Immediate[swz];
354                         baseimm = imm;
355                         if (imm < 0.0)
356                                 baseimm = -baseimm;
357
358                         if (baseimm == 0.0) {
359                                 newswz = RC_SWIZZLE_ZERO;
360                         } else if (baseimm == 1.0) {
361                                 newswz = RC_SWIZZLE_ONE;
362                         } else if (baseimm == 0.5 && c->has_half_swizzles) {
363                                 newswz = RC_SWIZZLE_HALF;
364                         } else {
365                                 have_real_reference = 1;
366                                 continue;
367                         }
368
369                         SET_SWZ(newsrc.Swizzle, chan, newswz);
370                         if (imm < 0.0 && !newsrc.Abs)
371                                 newsrc.Negate ^= 1 << chan;
372                 }
373
374                 if (!have_real_reference) {
375                         newsrc.File = RC_FILE_NONE;
376                         newsrc.Index = 0;
377                 }
378
379                 /* don't make the swizzle worse */
380                 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc) &&
381                     c->SwizzleCaps->IsNative(inst->U.I.Opcode, inst->U.I.SrcReg[src]))
382                         continue;
383
384                 inst->U.I.SrcReg[src] = newsrc;
385         }
386
387         /* Simplify instructions based on constants */
388         if (inst->U.I.Opcode == RC_OPCODE_MAD)
389                 constant_folding_mad(inst);
390
391         /* note: MAD can simplify to MUL or ADD */
392         if (inst->U.I.Opcode == RC_OPCODE_MUL)
393                 constant_folding_mul(inst);
394         else if (inst->U.I.Opcode == RC_OPCODE_ADD)
395                 constant_folding_add(inst);
396
397         /* In case this instruction has been converted, make sure all of the
398          * registers that are no longer used are empty. */
399         opcode = rc_get_opcode_info(inst->U.I.Opcode);
400         for(i = opcode->NumSrcRegs; i < 3; i++) {
401                 memset(&inst->U.I.SrcReg[i], 0, sizeof(struct rc_src_register));
402         }
403 }
404
405 /**
406  * If src and dst use the same register, this function returns a writemask that
407  * indicates wich components are read by src.  Otherwise zero is returned.
408  */
409 static unsigned int src_reads_dst_mask(struct rc_src_register src,
410                                                 struct rc_dst_register dst)
411 {
412         if (dst.File != src.File || dst.Index != src.Index) {
413                 return 0;
414         }
415         return rc_swizzle_to_writemask(src.Swizzle);
416 }
417
418 /* Return 1 if the source registers has a constant swizzle (e.g. 0, 0.5, 1.0)
419  * in any of its channels.  Return 0 otherwise. */
420 static int src_has_const_swz(struct rc_src_register src) {
421         int chan;
422         for(chan = 0; chan < 4; chan++) {
423                 unsigned int swz = GET_SWZ(src.Swizzle, chan);
424                 if (swz == RC_SWIZZLE_ZERO || swz == RC_SWIZZLE_HALF
425                                                 || swz == RC_SWIZZLE_ONE) {
426                         return 1;
427                 }
428         }
429         return 0;
430 }
431
432 static void presub_scan_read(
433         void * data,
434         struct rc_instruction * inst,
435         struct rc_src_register * src)
436 {
437         struct rc_reader_data * reader_data = data;
438         rc_presubtract_op * presub_opcode = reader_data->CbData;
439
440         if (!rc_inst_can_use_presub(inst, *presub_opcode,
441                         reader_data->Writer->U.I.DstReg.WriteMask,
442                         src,
443                         &reader_data->Writer->U.I.SrcReg[0],
444                         &reader_data->Writer->U.I.SrcReg[1])) {
445                 reader_data->Abort = 1;
446                 return;
447         }
448 }
449
450 static int presub_helper(
451         struct radeon_compiler * c,
452         struct rc_instruction * inst_add,
453         rc_presubtract_op presub_opcode,
454         rc_presub_replace_fn presub_replace)
455 {
456         struct rc_reader_data reader_data;
457         unsigned int i;
458         rc_presubtract_op cb_op = presub_opcode;
459
460         reader_data.CbData = &cb_op;
461         reader_data.ExitOnAbort = 1;
462         rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
463                                                 is_src_clobbered_scan_write);
464
465         if (reader_data.Abort || reader_data.ReaderCount == 0)
466                 return 0;
467
468         for(i = 0; i < reader_data.ReaderCount; i++) {
469                 unsigned int src_index;
470                 struct rc_reader reader = reader_data.Readers[i];
471                 const struct rc_opcode_info * info =
472                                 rc_get_opcode_info(reader.Inst->U.I.Opcode);
473
474                 for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
475                         if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.I.Src)
476                                 presub_replace(inst_add, reader.Inst, src_index);
477                 }
478         }
479         return 1;
480 }
481
482 /* This function assumes that inst_add->U.I.SrcReg[0] and
483  * inst_add->U.I.SrcReg[1] aren't both negative. */
484 static void presub_replace_add(
485         struct rc_instruction * inst_add,
486         struct rc_instruction * inst_reader,
487         unsigned int src_index)
488 {
489         rc_presubtract_op presub_opcode;
490         if (inst_add->U.I.SrcReg[1].Negate || inst_add->U.I.SrcReg[0].Negate)
491                 presub_opcode = RC_PRESUB_SUB;
492         else
493                 presub_opcode = RC_PRESUB_ADD;
494
495         if (inst_add->U.I.SrcReg[1].Negate) {
496                 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
497                 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[0];
498         } else {
499                 inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[0];
500                 inst_reader->U.I.PreSub.SrcReg[1] = inst_add->U.I.SrcReg[1];
501         }
502         inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
503         inst_reader->U.I.PreSub.SrcReg[1].Negate = 0;
504         inst_reader->U.I.PreSub.Opcode = presub_opcode;
505         inst_reader->U.I.SrcReg[src_index] =
506                         chain_srcregs(inst_reader->U.I.SrcReg[src_index],
507                                         inst_reader->U.I.PreSub.SrcReg[0]);
508         inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
509         inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
510 }
511
512 static int is_presub_candidate(
513         struct radeon_compiler * c,
514         struct rc_instruction * inst)
515 {
516         const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
517         unsigned int i;
518         unsigned int is_constant[2] = {0, 0};
519
520         assert(inst->U.I.Opcode == RC_OPCODE_ADD);
521
522         if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE
523                         || inst->U.I.SaturateMode
524                         || inst->U.I.WriteALUResult
525                         || inst->U.I.Omod) {
526                 return 0;
527         }
528
529         /* If both sources use a constant swizzle, then we can't convert it to
530          * a presubtract operation.  In fact for the ADD and SUB presubtract
531          * operations neither source can contain a constant swizzle.  This
532          * specific case is checked in peephole_add_presub_add() when
533          * we make sure the swizzles for both sources are equal, so we
534          * don't need to worry about it here. */
535         for (i = 0; i < 2; i++) {
536                 int chan;
537                 for (chan = 0; chan < 4; chan++) {
538                         rc_swizzle swz =
539                                 get_swz(inst->U.I.SrcReg[i].Swizzle, chan);
540                         if (swz == RC_SWIZZLE_ONE
541                                         || swz == RC_SWIZZLE_ZERO
542                                         || swz == RC_SWIZZLE_HALF) {
543                                 is_constant[i] = 1;
544                         }
545                 }
546         }
547         if (is_constant[0] && is_constant[1])
548                 return 0;
549
550         for(i = 0; i < info->NumSrcRegs; i++) {
551                 struct rc_src_register src = inst->U.I.SrcReg[i];
552                 if (src_reads_dst_mask(src, inst->U.I.DstReg))
553                         return 0;
554
555                 src.File = RC_FILE_PRESUB;
556                 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
557                         return 0;
558         }
559         return 1;
560 }
561
562 static int peephole_add_presub_add(
563         struct radeon_compiler * c,
564         struct rc_instruction * inst_add)
565 {
566         unsigned dstmask = inst_add->U.I.DstReg.WriteMask;
567         unsigned src0_neg = inst_add->U.I.SrcReg[0].Negate & dstmask;
568         unsigned src1_neg = inst_add->U.I.SrcReg[1].Negate & dstmask;
569
570         if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
571                 return 0;
572
573         /* src0 and src1 can't have absolute values */
574         if (inst_add->U.I.SrcReg[0].Abs || inst_add->U.I.SrcReg[1].Abs)
575                 return 0;
576
577         /* presub_replace_add() assumes only one is negative */
578         if (inst_add->U.I.SrcReg[0].Negate && inst_add->U.I.SrcReg[1].Negate)
579                 return 0;
580
581         /* if src0 is negative, at least all bits of dstmask have to be set */
582         if (inst_add->U.I.SrcReg[0].Negate && src0_neg != dstmask)
583                 return 0;
584
585         /* if src1 is negative, at least all bits of dstmask have to be set */
586         if (inst_add->U.I.SrcReg[1].Negate && src1_neg != dstmask)
587                 return 0;
588
589         if (!is_presub_candidate(c, inst_add))
590                 return 0;
591
592         if (presub_helper(c, inst_add, RC_PRESUB_ADD, presub_replace_add)) {
593                 rc_remove_instruction(inst_add);
594                 return 1;
595         }
596         return 0;
597 }
598
599 static void presub_replace_inv(
600         struct rc_instruction * inst_add,
601         struct rc_instruction * inst_reader,
602         unsigned int src_index)
603 {
604         /* We must be careful not to modify inst_add, since it
605          * is possible it will remain part of the program.*/
606         inst_reader->U.I.PreSub.SrcReg[0] = inst_add->U.I.SrcReg[1];
607         inst_reader->U.I.PreSub.SrcReg[0].Negate = 0;
608         inst_reader->U.I.PreSub.Opcode = RC_PRESUB_INV;
609         inst_reader->U.I.SrcReg[src_index] = chain_srcregs(inst_reader->U.I.SrcReg[src_index],
610                                                 inst_reader->U.I.PreSub.SrcReg[0]);
611
612         inst_reader->U.I.SrcReg[src_index].File = RC_FILE_PRESUB;
613         inst_reader->U.I.SrcReg[src_index].Index = RC_PRESUB_INV;
614 }
615
616 /**
617  * PRESUB_INV: ADD TEMP[0], none.1, -TEMP[1]
618  * Use the presubtract 1 - src0 for all readers of TEMP[0].  The first source
619  * of the add instruction must have the constatnt 1 swizzle.  This function
620  * does not check const registers to see if their value is 1.0, so it should
621  * be called after the constant_folding optimization.
622  * @return
623  *      0 if the ADD instruction is still part of the program.
624  *      1 if the ADD instruction is no longer part of the program.
625  */
626 static int peephole_add_presub_inv(
627         struct radeon_compiler * c,
628         struct rc_instruction * inst_add)
629 {
630         unsigned int i, swz;
631
632         if (!is_presub_candidate(c, inst_add))
633                 return 0;
634
635         /* Check if src0 is 1. */
636         /* XXX It would be nice to use is_src_uniform_constant here, but that
637          * function only works if the register's file is RC_FILE_NONE */
638         for(i = 0; i < 4; i++ ) {
639                 swz = GET_SWZ(inst_add->U.I.SrcReg[0].Swizzle, i);
640                 if(((1 << i) & inst_add->U.I.DstReg.WriteMask)
641                                                 && swz != RC_SWIZZLE_ONE) {
642                         return 0;
643                 }
644         }
645
646         /* Check src1. */
647         if ((inst_add->U.I.SrcReg[1].Negate & inst_add->U.I.DstReg.WriteMask) !=
648                                                 inst_add->U.I.DstReg.WriteMask
649                 || inst_add->U.I.SrcReg[1].Abs
650                 || (inst_add->U.I.SrcReg[1].File != RC_FILE_TEMPORARY
651                         && inst_add->U.I.SrcReg[1].File != RC_FILE_CONSTANT)
652                 || src_has_const_swz(inst_add->U.I.SrcReg[1])) {
653
654                 return 0;
655         }
656
657         if (presub_helper(c, inst_add, RC_PRESUB_INV, presub_replace_inv)) {
658                 rc_remove_instruction(inst_add);
659                 return 1;
660         }
661         return 0;
662 }
663
/**
 * State shared with the omod_filter_*_cb() callbacks.
 */
struct peephole_mul_cb_data {
	/* Destination register the callbacks compare reads and writes against. */
	struct rc_dst_register * Writer;
	/* Set to 1 by a callback once a conflicting read or write was seen. */
	unsigned int Clobbered;
};
668
669 static void omod_filter_reader_cb(
670         void * userdata,
671         struct rc_instruction * inst,
672         rc_register_file file,
673         unsigned int index,
674         unsigned int mask)
675 {
676         struct peephole_mul_cb_data * d = userdata;
677         if (rc_src_reads_dst_mask(file, mask, index,
678                 d->Writer->File, d->Writer->Index, d->Writer->WriteMask)) {
679
680                 d->Clobbered = 1;
681         }
682 }
683
684 static void omod_filter_writer_cb(
685         void * userdata,
686         struct rc_instruction * inst,
687         rc_register_file file,
688         unsigned int index,
689         unsigned int mask)
690 {
691         struct peephole_mul_cb_data * d = userdata;
692         if (file == d->Writer->File && index == d->Writer->Index &&
693                                         (mask & d->Writer->WriteMask)) {
694                 d->Clobbered = 1;
695         }
696 }
697
698 static int peephole_mul_omod(
699         struct radeon_compiler * c,
700         struct rc_instruction * inst_mul,
701         struct rc_list * var_list)
702 {
703         unsigned int chan, swz, i;
704         int const_index = -1;
705         int temp_index = -1;
706         float const_value;
707         rc_omod_op omod_op = RC_OMOD_DISABLE;
708         struct rc_list * writer_list;
709         struct rc_variable * var;
710         struct peephole_mul_cb_data cb_data;
711
712         for (i = 0; i < 2; i++) {
713                 unsigned int j;
714                 if (inst_mul->U.I.SrcReg[i].File != RC_FILE_CONSTANT
715                         && inst_mul->U.I.SrcReg[i].File != RC_FILE_TEMPORARY) {
716                         return 0;
717                 }
718                 if (inst_mul->U.I.SrcReg[i].File == RC_FILE_TEMPORARY) {
719                         if (temp_index != -1) {
720                                 /* The instruction has two temp sources */
721                                 return 0;
722                         } else {
723                                 temp_index = i;
724                                 continue;
725                         }
726                 }
727                 /* If we get this far Src[i] must be a constant src */
728                 if (inst_mul->U.I.SrcReg[i].Negate) {
729                         return 0;
730                 }
731                 /* The constant src needs to read from the same swizzle */
732                 swz = RC_SWIZZLE_UNUSED;
733                 chan = 0;
734                 for (j = 0; j < 4; j++) {
735                         unsigned int j_swz =
736                                 GET_SWZ(inst_mul->U.I.SrcReg[i].Swizzle, j);
737                         if (j_swz == RC_SWIZZLE_UNUSED) {
738                                 continue;
739                         }
740                         if (swz == RC_SWIZZLE_UNUSED) {
741                                 swz = j_swz;
742                                 chan = j;
743                         } else if (j_swz != swz) {
744                                 return 0;
745                         }
746                 }
747
748                 if (const_index != -1) {
749                         /* The instruction has two constant sources */
750                         return 0;
751                 } else {
752                         const_index = i;
753                 }
754         }
755
756         if (!rc_src_reg_is_immediate(c, inst_mul->U.I.SrcReg[const_index].File,
757                                 inst_mul->U.I.SrcReg[const_index].Index)) {
758                 return 0;
759         }
760         const_value = rc_get_constant_value(c,
761                         inst_mul->U.I.SrcReg[const_index].Index,
762                         inst_mul->U.I.SrcReg[const_index].Swizzle,
763                         inst_mul->U.I.SrcReg[const_index].Negate,
764                         chan);
765
766         if (const_value == 2.0f) {
767                 omod_op = RC_OMOD_MUL_2;
768         } else if (const_value == 4.0f) {
769                 omod_op = RC_OMOD_MUL_4;
770         } else if (const_value == 8.0f) {
771                 omod_op = RC_OMOD_MUL_8;
772         } else if (const_value == (1.0f / 2.0f)) {
773                 omod_op = RC_OMOD_DIV_2;
774         } else if (const_value == (1.0f / 4.0f)) {
775                 omod_op = RC_OMOD_DIV_4;
776         } else if (const_value == (1.0f / 8.0f)) {
777                 omod_op = RC_OMOD_DIV_8;
778         } else {
779                 return 0;
780         }
781
782         writer_list = rc_variable_list_get_writers_one_reader(var_list,
783                 RC_INSTRUCTION_NORMAL, &inst_mul->U.I.SrcReg[temp_index]);
784
785         if (!writer_list) {
786                 return 0;
787         }
788
789         cb_data.Clobbered = 0;
790         cb_data.Writer = &inst_mul->U.I.DstReg;
791         for (var = writer_list->Item; var; var = var->Friend) {
792                 struct rc_instruction * inst;
793                 const struct rc_opcode_info * info = rc_get_opcode_info(
794                                 var->Inst->U.I.Opcode);
795                 if (info->HasTexture) {
796                         return 0;
797                 }
798                 if (var->Inst->U.I.SaturateMode != RC_SATURATE_NONE) {
799                         return 0;
800                 }
801                 for (inst = inst_mul->Prev; inst != var->Inst;
802                                                         inst = inst->Prev) {
803                         rc_for_all_reads_mask(inst, omod_filter_reader_cb,
804                                                                 &cb_data);
805                         rc_for_all_writes_mask(inst, omod_filter_writer_cb,
806                                                                 &cb_data);
807                         if (cb_data.Clobbered) {
808                                 break;
809                         }
810                 }
811         }
812
813         if (cb_data.Clobbered) {
814                 return 0;
815         }
816
817         /* Rewrite the instructions */
818         for (var = writer_list->Item; var; var = var->Friend) {
819                 struct rc_variable * writer = writer_list->Item;
820                 unsigned conversion_swizzle = rc_make_conversion_swizzle(
821                                         writer->Inst->U.I.DstReg.WriteMask,
822                                         inst_mul->U.I.DstReg.WriteMask);
823                 writer->Inst->U.I.Omod = omod_op;
824                 writer->Inst->U.I.DstReg.File = inst_mul->U.I.DstReg.File;
825                 writer->Inst->U.I.DstReg.Index = inst_mul->U.I.DstReg.Index;
826                 rc_normal_rewrite_writemask(writer->Inst, conversion_swizzle);
827                 writer->Inst->U.I.SaturateMode = inst_mul->U.I.SaturateMode;
828         }
829
830         rc_remove_instruction(inst_mul);
831
832         return 1;
833 }
834
835 /**
836  * @return
837  *      0 if inst is still part of the program.
838  *      1 if inst is no longer part of the program.
839  */
840 static int peephole(struct radeon_compiler * c, struct rc_instruction * inst)
841 {
842         switch(inst->U.I.Opcode){
843         case RC_OPCODE_ADD:
844                 if (c->has_presub) {
845                         if(peephole_add_presub_inv(c, inst))
846                                 return 1;
847                         if(peephole_add_presub_add(c, inst))
848                                 return 1;
849                 }
850                 break;
851         default:
852                 break;
853         }
854         return 0;
855 }
856
857 void rc_optimize(struct radeon_compiler * c, void *user)
858 {
859         struct rc_instruction * inst = c->Program.Instructions.Next;
860         struct rc_list * var_list;
861         while(inst != &c->Program.Instructions) {
862                 struct rc_instruction * cur = inst;
863                 inst = inst->Next;
864
865                 constant_folding(c, cur);
866
867                 if(peephole(c, cur))
868                         continue;
869
870                 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
871                         copy_propagate(c, cur);
872                         /* cur may no longer be part of the program */
873                 }
874         }
875
876         if (!c->has_omod) {
877                 return;
878         }
879
880         inst = c->Program.Instructions.Next;
881         while(inst != &c->Program.Instructions) {
882                 struct rc_instruction * cur = inst;
883                 inst = inst->Next;
884                 if (cur->U.I.Opcode == RC_OPCODE_MUL) {
885                         var_list = rc_get_variables(c);
886                         peephole_mul_omod(c, cur, var_list);
887                 }
888         }
889 }