1 ; PowerPC optimized zoom for Goom
2 ; © 2001-2003 Guillaume Borios
3 ; This library is free software; you can redistribute it and/or
4 ; modify it under the terms of the GNU Library General Public
5 ; License as published by the Free Software Foundation; either
6 ; version 2 of the License, or (at your option) any later version.
8 ; This library is distributed in the hope that it will be useful,
9 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
10 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 ; Library General Public License for more details.
13 ; You should have received a copy of the GNU Library General Public
14 ; License along with this library; if not, write to the
15 ; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 ; Boston, MA 02111-1307, USA.
19 ; 21 Dec 2003 : Use of altivec is now determined with a parameter
21 ; Section definition : We use a read only section
24 ; name of the function to call by C program : ppc_zoom_generic (exported below as _ppc_zoom_generic)
25 ; We declare this label as a global to extend its scope outside this file
26 .globl _ppc_zoom_generic
30 ; This routine dynamically computes and applies a zoom filter
33 ; r3 <=> unsigned int sizeX (in pixels)
34 ; r4 <=> unsigned int sizeY (in pixels)
35 ; r5 <=> unsigned int * frompixmap
36 ; r6 <=> unsigned int * topixmap
37 ; r7 <=> unsigned int * brutS
38 ; r8 <=> unsigned int * brutD
39 ; r9 <=> unsigned int buffratio
40 ; r10 <=> int [16][16] precalccoeffs
43 ; r5 <=> frompixmap - 1 byte needed for preincremental fetch (replaces r5)
44 ; r6 <=> topixmap - 1 byte needed for preincremental fetch (replaces r6)
45 ; r3 <=> ax = x max in 16th of pixels (replaces old r3)
46 ; r4 <=> ay = y max in 16th of pixels (replaces old r4)
47 ; r20 <=> row size in bytes
48 ; r12 <=> 0xFF00FF (mask for parallel 32 bits pixs computing)
49 ; r30 <=> brutS - 1 byte needed for preincremental fetch (replaces r7)
50 ; r31 <=> brutD - 1 byte needed for preincremental fetch (replaces r8)
53 ; r1 is the Stack Pointer (SP) => Do not use
54 ; r13..r31 are non-volatiles => Do not use without saving them first and restoring them before returning
58 ; Saves the used non-volatile registers in the Mach-O stack's Red-Zone
; NOTE(review): the save/restore instructions, the branch targets (L1, L3, ...),
; the range test and part of the register setup are not visible in this excerpt.
; The comments below describe only the instructions that are shown.
62 li r18,0 ; Default value if out of range : 0 (Black)
65 mullw r2,r3,r4 ; r2 <- r3 * r4 : number of pixels to compute
72 mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
88 ; computes dynamically the position to fetch
101 ; if px>ax or py>ay goto outofrange
102 ; computes the attenuation coeffs and the original point address
; r2 and r29 presumably hold the position to fetch in 16th of pixels : their low
; 4 bits build the byte offset into the coefficient table (16 x 16 words, 4 bytes
; each), their upper bits (>> 4) give the whole-pixel position.
103 rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 6) & 0x000003C0 (r10=(r2%16)*4*16)
105 rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C) (r10=(r29%16)*4 | r10)
107 srawi r29,r29,4 ; pos computing : r29 <- r29 >> 4 (drops the 4 low bits)
109 srawi r2,r2,4 ; pos computing : r2 <- r2 >> 4 (drops the 4 low bits)
110 mullw r29, r29,r19 ; pos computing : r29 <- r29 * r19 (r19 presumably = row width in pixels, set outside this excerpt)
113 ; Channels notation : 00112233 (AARRVVBB)
115 add r2,r2,r29 ; pos computing : r2 <- r2 + r29 (offset counted in pixels)
116 lwzx r10,r11,r10 ; Loads coefs : 32-bit word at r11 + r10 (r11 presumably = coefficient table base, set outside this excerpt)
117 slwi r2,r2,2 ; pos computing : pixel offset -> byte offset (4 bytes per pixel)
118 add r2,r2,r5 ; pos computing : r2 <- r5 + byte offset = address of col1
119 rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
120 lwz r25,0(r2) ; Loads col1 -> r25
121 lwz r26,4(r2) ; Loads col2 -> r26 (the word right after col1)
122 rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
123 rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
124 add r2,r2,r20 ; Adds one line (r20 = row size in bytes) for future load of col3 and col4
125 and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
126 rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
127 andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00 (andi. also updates CR0)
128 mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
131 ; computes final pixel color
132 and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
133 lwz r27,0(r2) ; Loads col3 -> r27
134 mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
135 mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
136 andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00 (andi. also updates CR0)
137 mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
138 lwz r28,4(r2) ; Loads col4 -> r28 (the word right after col3)
139 add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
140 and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
141 add r25,r25,r29 ; Adds col1 & col2 channel 2
142 mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
143 andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00 (andi. also updates CR0)
144 mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
146 add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
147 and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
148 mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
149 add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
151 andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00 (andi. also updates CR0)
152 add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
154 mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
155 srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
157 add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
158 rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
159 stwu r7,4(r6) ; Stores the computed pixel at r6 + 4 and updates r6 to that address
160 bdnz L1 ; Iterate again if needed (decrements CTR, branches while it is not zero)
161 b L3 ;goto end ; If not, returns from the function
176 ; Restore saved registers and return
189 ; Saves the used non-volatile registers in the Mach-O stack's Red-Zone
; NOTE(review): second variant of the zoom loop. Its entry label, the branch
; targets (L100, L300, ...), the save/restore instructions, the range test and
; the cache-control instructions are not visible in this excerpt. The comments
; below describe only the instructions that are shown.
193 li r18,0 ; Default value if out of range : 0 (Black)
196 mullw r2,r3,r4 ; r2 <- r3 * r4 : number of pixels to compute
203 mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
215 ;*********************
224 ; Optimization to ensure the destination buffer
225 ; won't be loaded into the data cache
226 rlwinm. r0,r6,0,27,31 ; r0 <- r6 & 0x0000001F, CR0 updated : tests whether the address in r6 is a multiple of 32 bytes
232 ; computes dynamically the position to fetch
254 ; if px>ax or py>ay goto outofrange
255 ; computes the attenuation coeffs and the original point address
; r2 and r29 presumably hold the position to fetch in 16th of pixels : their low
; 4 bits build the byte offset into the coefficient table (16 x 16 words, 4 bytes
; each), their upper bits (>> 4) give the whole-pixel position.
256 rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 6) & 0x000003C0 (r10=(r2%16)*4*16)
258 rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C) (r10=(r29%16)*4 | r10)
260 srawi r29,r29,4 ; pos computing : r29 <- r29 >> 4 (drops the 4 low bits)
262 srawi r2,r2,4 ; pos computing : r2 <- r2 >> 4 (drops the 4 low bits)
263 mullw r29, r29,r19 ; pos computing : r29 <- r29 * r19 (r19 presumably = row width in pixels, set outside this excerpt)
266 ; Channels notation : 00112233 (AARRVVBB)
268 add r2,r2,r29 ; pos computing : r2 <- r2 + r29 (offset counted in pixels)
269 lwzx r10,r11,r10 ; Loads coefs : 32-bit word at r11 + r10 (r11 presumably = coefficient table base, set outside this excerpt)
270 slwi r2,r2,2 ; pos computing : pixel offset -> byte offset (4 bytes per pixel)
271 add r2,r2,r5 ; pos computing : r2 <- r5 + byte offset = address of col1
272 rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
273 lwz r25,0(r2) ; Loads col1 -> r25
274 lwz r26,4(r2) ; Loads col2 -> r26 (the word right after col1)
275 rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
276 rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
277 add r2,r2,r20 ; Adds one line (r20 = row size in bytes) for future load of col3 and col4
278 and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
279 rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
281 rlwinm r25,r25,0,16,23 ; Masks col1 channel 2 : 0x0000XX00 (rlwinm form : CR0 is left untouched)
282 ;andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
283 mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
286 ; computes final pixel color
287 and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
288 lwz r27,0(r2) ; Loads col3 -> r27
289 mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
290 mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
291 rlwinm r29,r26,0,16,23 ; Masks col2 channel 2 : 0x0000XX00
292 ;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
293 mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
294 lwz r28,4(r2) ; Loads col4 -> r28 (the word right after col3)
295 add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
296 and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
297 add r25,r25,r29 ; Adds col1 & col2 channel 2
298 mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
299 rlwinm r29,r27,0,16,23 ; Masks col3 channel 2 : 0x0000XX00
300 ;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
301 mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
303 add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
304 and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
305 mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
306 add r25,r25,r29 ; Adds col 3 to (col1 + col2) channel 2
308 rlwinm r28,r28,0,16,23 ; Masks col4 channel 2 : 0x0000XX00
309 ;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
310 add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
312 mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
313 srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
315 add r25,r25,r28 ; Adds col 4 to (col1 + col2 + col3) channel 2
316 rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
317 stw r7,0(r6) ; Stores the computed pixel at r6 (this store does not update r6)
318 bdnz L100 ; Iterate again if needed (decrements CTR, branches while it is not zero)
319 b L300 ;goto end ; If not, returns from the function
334 ; Restore saved registers and return