1 ; PowerPC optimized zoom for Goom
2 ; © 2001-2003 Guillaume Borios
3 ; This Source Code is released under the terms of the General Public License
6 ; 21 Dec 2003 : Use of altivec is now determined with a parameter
8 ; Section definition : We use a read only section
11 ; Name of the function called from the C program : the symbol exported below is _ppc_zoom_generic
; (i.e. ppc_zoom_generic from C, assuming the usual Mach-O leading-underscore convention)
12 ; We declare this label as a global to extend its scope outside this file
13 .globl _ppc_zoom_generic
17 ; This routine dynamically computes and applies a zoom filter
; Arguments on entry :
20 ; r3 <=> unsigned int sizeX (in pixels)
21 ; r4 <=> unsigned int sizeY (in pixels)
22 ; r5 <=> unsigned int * frompixmap
23 ; r6 <=> unsigned int * topixmap
24 ; r7 <=> unsigned int * brutS
25 ; r8 <=> unsigned int * brutD
26 ; r9 <=> unsigned int buffratio
27 ; r10 <=> int [16][16] precalccoeffs
; Register roles once the setup has run (as listed by the original author) :
30 ; r5 <=> frompixmap - 1 byte needed for preincremental fetch (replaces r5)
31 ; r6 <=> topixmap - 1 byte needed for preincremental fetch (replaces r6)
; NOTE(review): the generic loop stores with "stwu r7,4(r6)", which pre-increments by 4 bytes,
; so the bias applied to these pointers is presumably one 32-bit word rather than 1 byte - confirm against the setup code
32 ; r3 <=> ax = x max in 16th of pixels (replaces old r3)
33 ; r4 <=> ay = y max in 16th of pixels (replaces old r4)
34 ; r20 <=> row size in bytes
35 ; r12 <=> 0xFF00FF (mask selecting channels 1 & 3 so both are processed by a single 32-bit multiply)
36 ; r30 <=> brutS - 1 byte needed for preincremental fetch (replaces r7)
37 ; r31 <=> brutD - 1 byte needed for preincremental fetch (replaces r8)
40 ; r1 is the Stack Pointer (SP) => Do not use
41 ; r13..r31 are non-volatile => the ones used here must be saved first and restored before returning
45 ; Saves the used non-volatile registers in the red zone of the Mach-O stack
49 li r18,0 ; Default value if out of range : 0 (Black)
52 mullw r2,r3,r4 ; r2 <- sizeX * sizeY = number of pixels to compute
59 mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
75 ; Dynamically computes the position to fetch
88 ; if px>ax or py>ay goto outofrange
89 ; Computes the attenuation coeffs and the original point address
; r2 and r29 hold the two coordinates in 16ths of a pixel : their low 4 bits select the entry of the
; 16x16 coefficient table, the remaining bits give the whole-pixel position.
; r29 is the coordinate multiplied by r19 below, so it is presumably py and r2 px - confirm against the setup code
90 rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 6) & 0x000003C0 (r10 = (r2%16)*4*16 : byte offset of the table row)
92 rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C) (r10 = (r29%16)*4 | r10 : adds the column offset)
94 srawi r29,r29,4 ; r29 <- r29 >> 4 : drops the 4 fractional bits, keeps the whole-pixel coordinate
96 srawi r2,r2,4 ; r2 <- r2 >> 4 : drops the 4 fractional bits, keeps the whole-pixel coordinate
97 mullw r29, r29,r19 ; r29 <- r29 * r19 (r19 is presumably the row length in pixels - set outside the lines shown, confirm)
100 ; Channels notation : 00112233 (AARRVVBB, V presumably standing for green)
102 add r2,r2,r29 ; r2 <- pixel index of the first pixel of the 2x2 source block
103 lwzx r10,r11,r10 ; Loads coefs : r10 <- word at r11 + r10 (r11 presumably holds the precalccoeffs base - confirm)
104 slwi r2,r2,2 ; r2 <- r2 * 4 : pixel index -> byte offset (4 bytes per pixel)
105 add r2,r2,r5 ; r2 <- r5 + byte offset = address of col1
106 rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
107 lwz r25,0(r2) ; Loads col1 -> r25
108 lwz r26,4(r2) ; Loads col2 -> r26 (next pixel on the same line)
109 rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
110 rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
111 add r2,r2,r20 ; Adds one line for future load of col3 and col4
112 and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
113 rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
114 andi. r25,r25,0xFF00 ; Masks col1 channel 2 : 0x0000XX00
115 mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
118 ; Computes the final pixel color : weighted sum of the four source pixels,
; channels 1 & 3 accumulated together in r8/r7, channel 2 accumulated in r25
119 and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
120 lwz r27,0(r2) ; Loads col3 -> r27
121 mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
122 mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
123 andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
124 mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
125 lwz r28,4(r2) ; Loads col4 -> r28
126 add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
127 and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
128 add r25,r25,r29 ; Adds col1 & col2 channel 2
129 mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
130 andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
131 mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
133 add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
134 and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
135 mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
136 add r25,r25,r29 ; Adds col3 to (col1 + col2) channel 2
138 andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
139 add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
141 mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
142 srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
144 add r25,r25,r28 ; Adds col4 to (col1 + col2 + col3) channel 2
145 rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
146 stwu r7,4(r6) ; Stores the computed pixel at r6 + 4 and advances r6 by 4 bytes
147 bdnz L1 ; Decrements ctr and iterates again while it is not zero
148 b L3 ;goto end ; If not, returns from the function
163 ; Restore saved registers and return
176 ; Saves the used non-volatile registers in the red zone of the Mach-O stack
180 li r18,0 ; Default value if out of range : 0 (Black)
183 mullw r2,r3,r4 ; r2 <- r3 * r4 = number of pixels to compute
190 mtspr ctr,r2 ; Init the loop count (one loop per pixel computed)
202 ;*********************
211 ; Optimization to ensure the destination buffer
212 ; won't be loaded into the data cache
213 rlwinm. r0,r6,0,27,31 ; r0 <- r6 & 0x1F ; record form : CR0 EQ is set when r6 is 32-byte aligned
219 ; Dynamically computes the position to fetch
241 ; if px>ax or py>ay goto outofrange
242 ; Computes the attenuation coeffs and the original point address
; r2 and r29 hold the two coordinates in 16ths of a pixel : their low 4 bits select the entry of the
; 16x16 coefficient table, the remaining bits give the whole-pixel position.
; r29 is the coordinate multiplied by r19 below, so it is presumably py and r2 px - confirm against the setup code
243 rlwinm r10,r2,6,28-6,31-6 ; r10 <- (r2 << 6) & 0x000003C0 (r10 = (r2%16)*4*16 : byte offset of the table row)
245 rlwimi r10, r29, 2, 28-2, 31-2 ; r10 <- ((r29 << 2) & 0x0000003C) | (r10 & ~0x0000003C) (r10 = (r29%16)*4 | r10 : adds the column offset)
247 srawi r29,r29,4 ; r29 <- r29 >> 4 : drops the 4 fractional bits, keeps the whole-pixel coordinate
249 srawi r2,r2,4 ; r2 <- r2 >> 4 : drops the 4 fractional bits, keeps the whole-pixel coordinate
250 mullw r29, r29,r19 ; r29 <- r29 * r19 (r19 is presumably the row length in pixels - set outside the lines shown, confirm)
253 ; Channels notation : 00112233 (AARRVVBB, V presumably standing for green)
255 add r2,r2,r29 ; r2 <- pixel index of the first pixel of the 2x2 source block
256 lwzx r10,r11,r10 ; Loads coefs : r10 <- word at r11 + r10 (r11 presumably holds the precalccoeffs base - confirm)
257 slwi r2,r2,2 ; r2 <- r2 * 4 : pixel index -> byte offset (4 bytes per pixel)
258 add r2,r2,r5 ; r2 <- r5 + byte offset = address of col1
259 rlwinm r21,r10,0,24,31 ; Isolates coef1 (??????11 -> 00000011)
260 lwz r25,0(r2) ; Loads col1 -> r25
261 lwz r26,4(r2) ; Loads col2 -> r26 (next pixel on the same line)
262 rlwinm r22,r10,24,24,31 ; Isolates coef2 (????22?? -> 00000022)
263 rlwinm r23,r10,16,24,31 ; Isolates coef3 (??33???? -> 00000033)
264 add r2,r2,r20 ; Adds one line for future load of col3 and col4
265 and r8, r25,r12 ; Masks col1 channels 1 & 3 : 0x00XX00XX
266 rlwinm r24,r10,8,24,31 ; Isolates coef4 (44?????? -> 00000044)
268 rlwinm r25,r25,0,16,23 ; Masks col1 channel 2 : 0x0000XX00
269 ;andi. r25,r25,0xFF00 ; Previous form kept for reference (andi. also updates CR0, the rlwinm above does not)
270 mullw r8, r8, r21 ; Applies coef1 on col1 channels 1 & 3
273 ; Computes the final pixel color : weighted sum of the four source pixels,
; channels 1 & 3 accumulated together in r8/r7, channel 2 accumulated in r25
274 and r10,r26,r12 ; Masks col2 channels 1 & 3 : 0x00XX00XX
275 lwz r27,0(r2) ; Loads col3 -> r27
276 mullw r10,r10,r22 ; Applies coef2 on col2 channels 1 & 3
277 mullw r25,r25,r21 ; Applies coef1 on col1 channel 2
278 rlwinm r29,r26,0,16,23 ; Masks col2 channel 2 : 0x0000XX00
279 ;andi. r29,r26,0xFF00 ; Masks col2 channel 2 : 0x0000XX00
280 mullw r29,r29,r22 ; Applies coef2 on col2 channel 2
281 lwz r28,4(r2) ; Loads col4 -> r28
282 add r8 ,r8 ,r10 ; Adds col1 & col2 channels 1 & 3
283 and r10,r27,r12 ; Masks col3 channels 1 & 3 : 0x00XX00XX
284 add r25,r25,r29 ; Adds col1 & col2 channel 2
285 mullw r10,r10,r23 ; Applies coef3 on col3 channels 1 & 3
286 rlwinm r29,r27,0,16,23 ; Masks col3 channel 2 : 0x0000XX00
287 ;andi. r29,r27,0xFF00 ; Masks col3 channel 2 : 0x0000XX00
288 mullw r29,r29,r23 ; Applies coef3 on col3 channel 2
290 add r7 ,r8 ,r10 ; Adds col3 to (col1 + col2) channels 1 & 3
291 and r10,r28,r12 ; Masks col4 channels 1 & 3 : 0x00XX00XX
292 mullw r10,r10,r24 ; Applies coef4 on col4 channels 1 & 3
293 add r25,r25,r29 ; Adds col3 to (col1 + col2) channel 2
295 rlwinm r28,r28,0,16,23 ; Masks col4 channel 2 : 0x0000XX00
296 ;andi. r28,r28,0xFF00 ; Masks col4 channel 2 : 0x0000XX00
297 add r7 ,r7 ,r10 ; Adds col4 to (col1 + col2 + col3) channels 1 & 3
299 mullw r28,r28,r24 ; Applies coef4 on col4 channel 2
300 srawi r7, r7, 8 ; (sum of channels 1 & 3) >> 8
302 add r25,r25,r28 ; Adds col4 to (col1 + col2 + col3) channel 2
303 rlwimi r7, r25, 24, 16, 23 ; (((sum of channels 2) >> 8 ) & 0x0000FF00) | ((sum of channels 1 and 3) & 0xFFFF00FF)
304 stw r7,0(r6) ; Stores the computed pixel at r6 (r6 is not advanced by this instruction ; NOTE(review): confirm where the destination pointer is incremented)
305 bdnz L100 ; Decrements ctr and iterates again while it is not zero
306 b L300 ;goto end ; If not, returns from the function
321 ; Restore saved registers and return