Initialize
[sdk/emulator/qemu.git] / tizen / distrib / ffmpeg / libswscale / bfin / internal_bfin.S
1 /*
2  * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3  *                    April 20, 2007
4  *
5  * Blackfin video color space converter operations
6  * convert I420 YV12 to RGB in various formats
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25
26 /*
27 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29
30
31 The following calculation is used for the conversion:
32
33   r = clipz((y-oy)*cy  + crv*(v-128))
34   g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35   b = clipz((y-oy)*cy  + cbu*(u-128))
36
37 y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38
39
40 New factorization to eliminate the truncation error which was
41 occurring due to the byteop3p.
42
43
44 1) Use byteop16m to subtract quad bytes. We use it in U8 mode,
45    so the offsets need to be renormalized to 8 bits.
46
47 2) Scale operands up by a factor of 4 not 8 because Blackfin
48    multiplies include a shift.
49
50 3) Compute into the accumulators cy*yx0, cy*yx1.
51
52 4) Compute each of the linear equations:
53      r = clipz((y - oy) * cy  + crv * (v - 128))
54
55      g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
56
57      b = clipz((y - oy) * cy  + cbu * (u - 128))
58
59    Reuse of the accumulators requires that we actually multiply
60    twice once with addition and the second time with a subtraction.
61
62    Because of this we need to compute the equations in the order R B
63    then G saving the writes for B in the case of 24/32 bit color
64    formats.
65
66    API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67                       int dW, uint32_t *coeffs);
68
69        A          B
70        ---        ---
71        i2 = cb    i3 = cr
72        i1 = coeff i0 = y
73
74 Where coeffs have the following layout in memory.
75
76 uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask;
77
78 coeffs is a pointer to oy.
79
80 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81 replication is used to simplify the internal algorithms for the dual Mac
82 architecture of BlackFin.
83
84 All routines are exported with _ff_bfin_ as a symbol prefix.
85
86 Rough performance gain compared against -O3:
87
88 2779809/1484290 187.28%
89
90 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
91 c/pel for the optimized implementations. Not sure why there is such a
92 huge variation in the reference code on Blackfin; I guess it must have
93 to do with the memory system.
94 */
95
/*
 * Section selection for the routines below.
 * mL3 is the ordinary .text section. mL1 is .l1.text when building with
 * __FDPIC__ and CONFIG_SRAM (presumably on-chip L1 instruction memory --
 * confirm against the linker script), and falls back to mL3 otherwise.
 * MEM is the section passed to DEFUN by the yuv2rgb line converters.
 */
96 #define mL3 .text
97 #if defined(__FDPIC__) && CONFIG_SRAM
98 #define mL1 .l1.text
99 #else
100 #define mL1 mL3
101 #endif
102 #define MEM mL1
103 /* DEFUN(fname, where, interface): switch to section `where`, export _ff_bfin_<fname> as a function symbol, align, and emit the symbol name; the invocation supplies the trailing ':' that turns it into a label. `interface` is documentation only and does not appear in the expansion. */
104 #define DEFUN(fname,where,interface) \
105         .section where;              \
106         .global _ff_bfin_ ## fname;  \
107         .type _ff_bfin_ ## fname, STT_FUNC; \
108         .align 8;                    \
109         _ff_bfin_ ## fname
110 /* DEFUN_END(fname): record the size of _ff_bfin_<fname> as the distance from its label to the current location. */
111 #define DEFUN_END(fname) \
112         .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname
113
114
115 .text
116
117 #define COEFF_LEN        11*4   /* size in bytes of the coefficient table: 11 32-bit words (oy ... gmask) */
118 #define COEFF_REL_CY_OFF 4*4    /* byte post-modify applied to I1 on the first gmask load; with the circular wrap the next load is cy */
119 /* Byte offsets from FP (after "link 0") of the stack-passed arguments out, dW and coeffs of the yuv2rgb*_line routines. */
120 #define ARG_OUT   20
121 #define ARG_W     24
122 #define ARG_COEFF 28
123
/*
 * _ff_bfin_yuv2rgb565_line: convert one line of planar YUV to packed 16-bit
 * pixels (5/6/5 bit fields selected by the rmask/bmask/gmask coefficients).
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack frame (ARG_OUT, ARG_W, ARG_COEFF).
 *
 * The hardware loop runs dW>>2 times. Each pass consumes 4 Y bytes, 2 U bytes
 * and 2 V bytes and stores two 32-bit words through p1.
 *
 * coeffs is walked with I1 as a circular buffer (B1 = coeffs, L1 = COEFF_LEN),
 * so the 11 coefficient words are re-read in order for every pixel pair.
 * L1 is cleared again before returning.
 *
 * The Y/U/V loads for the next pass are issued at the end of the current one,
 * so the routine reads one 4Y/2U/2V group past the last group it converts.
 *
 * NOTE(review): dW < 4 yields a loop count of 0; confirm how lsetup behaves
 * with a zero count before calling with such widths.
 */
124 DEFUN(yuv2rgb565_line,MEM,
125    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
126         link 0;
127         [--sp] = (r7:4);         // save r7..r4 (restored before return)
128         p1 = [fp+ARG_OUT];       // p1 = out
129         r3 = [fp+ARG_W];         // r3 = dW
130
131         i0 = r0;                 // i0 -> Y
132         i2 = r1;                 // i2 -> U
133         i3 = r2;                 // i3 -> V
134
135         r0 = [fp+ARG_COEFF];
136         i1 = r0;                 // i1 -> coeffs
137         b1 = i1;                 // circular-buffer base for i1
138         l1 = COEFF_LEN;          // circular-buffer length: i1 wraps after the 11 coefficient words
139         m0 = COEFF_REL_CY_OFF;   // extra post-modify used on the first gmask load
140         p0 = r3;
141
142         r0   = [i0++];         // 4Y
143         r1.l = w[i2++];        // 2u
144         r1.h = w[i3++];        // 2v
145         p0 = p0>>2;              // loop count = dW/4
146
147         lsetup (.L0565, .L1565) lc0 = p0;
148
149         /*
150            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
151            r0 -- used to load 4ys
152            r1 -- used to load 2us,2vs
153            r4 -- y3,y2
154            r5 -- y1,y0
155            r6 -- u1,u0
156            r7 -- v1,v0
157         */
158                                                               r2=[i1++]; // oy
159 .L0565:
160         /*
161         rrrrrrrr gggggggg bbbbbbbb
162          5432109876543210
163                     bbbbb >>3
164               gggggggg    <<3
165          rrrrrrrr         <<8
166          rrrrrggggggbbbbb
167         */
168         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
169         (r7,r6) = byteop16m (r1:0, r3:2) (r);
170         r5 = r5 << 2 (v);                                                // y1,y0
171         r4 = r4 << 2 (v);                                                // y3,y2
172         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
173         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
174         /* Y' = y*cy */
175         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
176
177         /* R = Y+ crv*(Cr-128) */
178         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
179                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
180         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
181         r2 = r2 >> 3 (v);
182         r3 = r2 & r5;
183
184         /* B = Y+ cbu*(Cb-128) */
185         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
186                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
187         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
188         r2 = r2 << 8 (v);
189         r2 = r2 & r5;
190         r3 = r3 | r2;
191
192         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
193                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
194         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
195         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then i1 skips (with wrap) to cy
196         r2 = r2 << 3 (v);
197         r2 = r2 & r5;
198         r3 = r3 | r2;
199         [p1++]=r3                                          || r1=[i1++]; // cy
200
201         /* Y' = y*cy */
202         /* second pixel pair: same sequence using y3,y2 (r4) and the high halves of r6/r7 */
203         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
204
205         /* R = Y+ crv*(Cr-128) */
206         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
207                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
208         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
209         r2 = r2 >> 3 (v);
210         r3 = r2 & r5;
211
212         /* B = Y+ cbu*(Cb-128) */
213         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
214                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
215         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
216         r2 = r2 << 8 (v);
217         r2 = r2 & r5;
218         r3 = r3 | r2;
219
220         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
221                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
222         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
223         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 4Y (input for the next pass)
224         r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
225         r2 = r2 & r5;
226         r3 = r3 | r2;
227         [p1++]=r3                                          || r1.h = w[i3++];        // 2v
228 .L1565:                                                       r2=[i1++]; // oy
229
230         l1 = 0;                  // switch circular addressing on i1 back off
231
232         (r7:4) = [sp++];
233         unlink;
234         rts;
235 DEFUN_END(yuv2rgb565_line)
236
/*
 * _ff_bfin_yuv2rgb555_line: convert one line of planar YUV to packed 16-bit
 * pixels with 5/5/5 bit fields (selected by the rmask/bmask/gmask
 * coefficients). Same structure as the 565 routine; only the shift amounts
 * applied before masking differ (>>3, <<7, <<2).
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack frame (ARG_OUT, ARG_W, ARG_COEFF).
 *
 * The hardware loop runs dW>>2 times. Each pass consumes 4 Y bytes, 2 U bytes
 * and 2 V bytes and stores two 32-bit words through p1.
 *
 * coeffs is walked with I1 as a circular buffer (B1 = coeffs, L1 = COEFF_LEN);
 * L1 is cleared again before returning.
 *
 * The Y/U/V loads for the next pass are issued at the end of the current one,
 * so the routine reads one 4Y/2U/2V group past the last group it converts.
 *
 * NOTE(review): dW < 4 yields a loop count of 0; confirm how lsetup behaves
 * with a zero count before calling with such widths.
 */
237 DEFUN(yuv2rgb555_line,MEM,
238    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
239         link 0;
240         [--sp] = (r7:4);         // save r7..r4 (restored before return)
241         p1 = [fp+ARG_OUT];       // p1 = out
242         r3 = [fp+ARG_W];         // r3 = dW
243
244         i0 = r0;                 // i0 -> Y
245         i2 = r1;                 // i2 -> U
246         i3 = r2;                 // i3 -> V
247
248         r0 = [fp+ARG_COEFF];
249         i1 = r0;                 // i1 -> coeffs
250         b1 = i1;                 // circular-buffer base for i1
251         l1 = COEFF_LEN;          // circular-buffer length: i1 wraps after the 11 coefficient words
252         m0 = COEFF_REL_CY_OFF;   // extra post-modify used on the first gmask load
253         p0 = r3;
254
255         r0   = [i0++];         // 4Y
256         r1.l = w[i2++];        // 2u
257         r1.h = w[i3++];        // 2v
258         p0 = p0>>2;              // loop count = dW/4
259
260         lsetup (.L0555, .L1555) lc0 = p0;
261
262         /*
263            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
264            r0 -- used to load 4ys
265            r1 -- used to load 2us,2vs
266            r4 -- y3,y2
267            r5 -- y1,y0
268            r6 -- u1,u0
269            r7 -- v1,v0
270         */
271                                                               r2=[i1++]; // oy
272 .L0555:
273         /*
274         rrrrrrrr gggggggg bbbbbbbb
275          5432109876543210
276                     bbbbb >>3
277                gggggggg   <<2
278           rrrrrrrr        <<7
279          xrrrrrgggggbbbbb
280         */
281
282         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
283         (r7,r6) = byteop16m (r1:0, r3:2) (r);
284         r5 = r5 << 2 (v);                                                // y1,y0
285         r4 = r4 << 2 (v);                                                // y3,y2
286         r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
287         r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
288         /* Y' = y*cy */
289         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
290
291         /* R = Y+ crv*(Cr-128) */
292         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
293                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
294         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
295         r2 = r2 >> 3 (v);
296         r3 = r2 & r5;
297
298         /* B = Y+ cbu*(Cb-128) */
299         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
300                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
301         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
302         r2 = r2 << 7 (v);
303         r2 = r2 & r5;
304         r3 = r3 | r2;
305
306         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
307                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
308         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
309         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, then i1 skips (with wrap) to cy
310         r2 = r2 << 2 (v);
311         r2 = r2 & r5;
312         r3 = r3 | r2;
313         [p1++]=r3                                          || r1=[i1++]; // cy
314
315         /* Y' = y*cy */
316         /* second pixel pair: same sequence using y3,y2 (r4) and the high halves of r6/r7 */
317         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
318
319         /* R = Y+ crv*(Cr-128) */
320         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
321                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
322         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
323         r2 = r2 >> 3 (v);
324         r3 = r2 & r5;
325
326         /* B = Y+ cbu*(Cb-128) */
327         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
328                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
329         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
330         r2 = r2 << 7 (v);
331         r2 = r2 & r5;
332         r3 = r3 | r2;
333
334         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
335                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
336         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
337         r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y (input for the next pass)
338         r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
339         r2 = r2 & r5;
340         r3 = r3 | r2;
341         [p1++]=r3                                          || r1.h=w[i3++]; // 2v
342
343 .L1555:                                                       r2=[i1++]; // oy
344
345         l1 = 0;                  // switch circular addressing on i1 back off
346
347         (r7:4) = [sp++];
348         unlink;
349         rts;
350 DEFUN_END(yuv2rgb555_line)
351
/*
 * _ff_bfin_yuv2rgb24_line: convert one line of planar YUV to 3 bytes per pixel.
 *
 * Y, U and V arrive in r0, r1 and r2; out, dW and coeffs are read from the
 * stack frame (ARG_OUT, ARG_W, ARG_COEFF).
 *
 * Output is written with byte stores through two pointers: p1 starts at out
 * and p2 at out+3. For each pixel pair, p1 receives the three bytes of one
 * pixel and p2 the three bytes of the next; both then skip 3 bytes to step
 * over the pixel the other pointer wrote. The bytes are stored in the order
 * they are labelled below: R, then G, then B (B is kept in r3 until G is out).
 *
 * The hardware loop runs dW>>2 times; each pass consumes 4 Y, 2 U and 2 V
 * bytes and writes 12 output bytes.
 *
 * coeffs is walked with I1 as a circular buffer (B1 = coeffs, L1 = COEFF_LEN);
 * L1 is cleared again before returning. The rmask/bmask/gmask loads into r5
 * only step I1 past those words; r5 is not read afterwards in this routine.
 *
 * The Y/U/V loads for the next pass are issued at the end of the current one,
 * so the routine reads one 4Y/2U/2V group past the last group it converts.
 *
 * NOTE(review): dW < 4 yields a loop count of 0; confirm how lsetup behaves
 * with a zero count before calling with such widths.
 */
352 DEFUN(yuv2rgb24_line,MEM,
353    (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
354         link 0;
355         [--sp] = (r7:4);         // save r7..r4 (restored before return)
356         p1 = [fp+ARG_OUT];       // p1 = out
357         r3 = [fp+ARG_W];         // r3 = dW
358         p2 = p1;
359         p2 += 3;                 // p2 = out + 3: one 3-byte pixel ahead of p1
360
361         i0 = r0;                 // i0 -> Y
362         i2 = r1;                 // i2 -> U
363         i3 = r2;                 // i3 -> V
364
365         r0 = [fp+ARG_COEFF]; // coeff buffer
366         i1 = r0;
367         b1 = i1;                 // circular-buffer base for i1
368         l1 = COEFF_LEN;          // circular-buffer length: i1 wraps after the 11 coefficient words
369         m0 = COEFF_REL_CY_OFF;   // extra post-modify used on the first gmask load
370         p0 = r3;
371
372         r0   = [i0++];         // 4Y
373         r1.l = w[i2++];        // 2u
374         r1.h = w[i3++];        // 2v
375         p0 = p0>>2;              // loop count = dW/4
376
377         lsetup (.L0888, .L1888) lc0 = p0;
378
379         /*
380            uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
381            r0 -- used to load 4ys
382            r1 -- used to load 2us,2vs
383            r4 -- y3,y2
384            r5 -- y1,y0
385            r6 -- u1,u0
386            r7 -- v1,v0
387         */
388                                                               r2=[i1++]; // oy
389 .L0888:
390         (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
391         (r7,r6) = byteop16m (r1:0, r3:2) (r);
392         r5 = r5 << 2 (v);               // y1,y0
393         r4 = r4 << 2 (v);               // y3,y2
394         r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
395         r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy
396
397         /* Y' = y*cy */
398         a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv
399
400         /* R = Y+ crv*(Cr-128) */
401         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
402                 a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
403         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
404         r2=r2>>16 || B[p1++]=r2;        // store low byte via p1 while shifting the other pixel's byte down
405                      B[p2++]=r2;
406
407         /* B = Y+ cbu*(Cb-128) */
408         r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
409                 a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
410         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
411
412         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
413                 a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
414         r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
415         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero
416
417         r2=r2>>16 || B[p1++]=r2;
418                      B[p2++]=r2;
419
420         r3=r3>>16 || B[p1++]=r3;
421                      B[p2++]=r3                            || r1=[i1++]; // cy
422
423         p1+=3;                   // step over the pixel written through p2
424         p2+=3;                   // step over the next pixel written through p1
425         /* Y' = y*cy */
426         a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv
427
428         /* R = Y+ crv*(Cr-128) */
429         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
430                 a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
431         r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
432         r2=r2>>16 || B[p1++]=r2;
433         B[p2++]=r2;
434
435         /* B = Y+ cbu*(Cb-128) */
436         r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
437                 a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
438         r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
439
440         /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
441                 a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
442         r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
443         r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
444         r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
445                      B[p2++]=r2 || r1.l = w[i2++]; // 2u
446         r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
447                      B[p2++]=r3 || r2=[i1++];      // oy
448
449         p1+=3;
450 .L1888: p2+=3;
451
452         l1 = 0;                  // switch circular addressing on i1 back off
453
454         (r7:4) = [sp++];
455         unlink;
456         rts;
457 DEFUN_END(yuv2rgb24_line)
458
459
460
/* Byte offsets from FP (after "link 0") of the stack-passed arguments of uyvytoyv12/yuyvtoyv12: vdst, width, height, lumStride, chromStride, srcStride. */
461 #define ARG_vdst        20
462 #define ARG_width       24
463 #define ARG_height      28
464 #define ARG_lumStride   32
465 #define ARG_chromStride 36
466 #define ARG_srcStride   40
467
/*
 * _ff_bfin_uyvytoyv12: split packed UYVY into separate Y, U and V planes,
 * two source rows at a time.
 *
 * src, ydst and udst arrive in r0, r1 and r2; vdst, width, height, lumStride,
 * chromStride and srcStride are read from the stack frame (ARG_*).
 *
 * The outer loop runs height>>1 times (one pass per row pair), the inner loop
 * width>>2 times (4 pixels = 8 source bytes per row per pass). Each inner pass
 * stores 4 luma bytes for the top row (p0) and 4 for the bottom row (p1), plus
 * 2 U bytes (i2) and 2 V bytes (i3) derived with byteop1p from both rows
 * (a byte-wise average per the Blackfin ISA reference -- confirm).
 *
 * The code assumes width is a multiple of 4. Source words are loaded one group
 * ahead of their use, so 8 bytes past the end of each row are read.
 *
 * Fix: the luma pointers used to be advanced by lumStride after each row pair
 * even though the inner loop had already advanced them by width, which is only
 * correct when lumStride == width. They are now advanced by
 * 2*lumStride - width, matching how the source (m0) and chroma (m1) pointers
 * are already corrected.
 */
468 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
469                          long width, long height,
470                          long lumStride, long chromStride, long srcStride)):
471         link 0;
472         [--sp] = (r7:4,p5:4);
473
474         p0 = r1;       // Y top even
475
476         i2 = r2; // *u
477         r2 = [fp + ARG_vdst];
478         i3 = r2; // *v
479
480         r1 = [fp + ARG_srcStride];
481         r2 = r0 + r1;
482         i0 = r0;  // uyvy_T even
483         i1 = r2;  // uyvy_B odd
484
485         p2 = [fp + ARG_lumStride];
486         p1 = p0 + p2;  // Y bot odd
487
488         p5 = [fp + ARG_width];
489         p4 = [fp + ARG_height];
490         r0 = p5;
            p2 = p2 << 1;
            p2 -= p5;      // luma row-pair step: 2*lumStride - width (p0/p1 already moved by width)
491         p4 = p4 >> 1;
492         p5 = p5 >> 2;
493
494         r2 = r0 << 1;
495         r1 = r1 << 1;
496         r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
497         r1 += -8;  // i0,i1 is pre read need to correct
498         m0 = r1;
499
500         r2 = [fp + ARG_chromStride];
501         r0 = r0 >> 1;
502         r2 = r2 - r0;  // chromStride - width/2
503         m1 = r2;
504
505         /*   I0,I1 - src input line pointers
506          *   p0,p1 - luma output line pointers
507          *   I2    - dstU
508          *   I3    - dstV
509          */
510
511         lsetup (0f, 1f) lc1 = p4;   // H/2
512 0:        r0 = [i0++] || r2 = [i1++];
513           r1 = [i0++] || r3 = [i1++];
514           r4 = byteop1p(r1:0, r3:2);
515           r5 = byteop1p(r1:0, r3:2) (r);
516           lsetup (2f, 3f) lc0 = p5; // W/4
517 2:          r0 = r0 >> 8(v);
518             r1 = r1 >> 8(v);
519             r2 = r2 >> 8(v);
520             r3 = r3 >> 8(v);
521             r0 = bytepack(r0, r1);
522             r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
523             r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
524             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
525             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
526             r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
527 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv
528
529           i0 += m0;
530           i1 += m0;
531           i2 += m1;
532           i3 += m1;
533           p0 = p0 + p2;  // next top luma row (p2 = 2*lumStride - width)
534 1:        p1 = p1 + p2;
535
536         (r7:4,p5:4) = [sp++];
537         unlink;
538         rts;
539 DEFUN_END(uyvytoyv12)
540
/*
 * _ff_bfin_yuyvtoyv12: split packed YUYV into separate Y, U and V planes,
 * two source rows at a time.
 *
 * src, ydst and udst arrive in r0, r1 and r2; vdst, width, height, lumStride,
 * chromStride and srcStride are read from the stack frame (ARG_*).
 *
 * The outer loop runs height>>1 times (one pass per row pair), the inner loop
 * width>>2 times (4 pixels = 8 source bytes per row per pass). Each inner pass
 * stores 4 luma bytes for the top row (p0) and 4 for the bottom row (p1), plus
 * 2 U bytes (i2) and 2 V bytes (i3) derived with byteop1p from both rows
 * (a byte-wise average per the Blackfin ISA reference -- confirm).
 *
 * The code assumes width is a multiple of 4. Source words are loaded one group
 * ahead of their use, so 8 bytes past the end of each row are read.
 *
 * Fix: the luma pointers used to be advanced by lumStride after each row pair
 * even though the inner loop had already advanced them by width, which is only
 * correct when lumStride == width. They are now advanced by
 * 2*lumStride - width, matching how the source (m0) and chroma (m1) pointers
 * are already corrected.
 */
541 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
542                          long width, long height,
543                          long lumStride, long chromStride, long srcStride)):
544         link 0;
545         [--sp] = (r7:4,p5:4);
546
547         p0 = r1;       // Y top even
548
549         i2 = r2; // *u
550         r2 = [fp + ARG_vdst];
551         i3 = r2; // *v
552
553         r1 = [fp + ARG_srcStride];
554         r2 = r0 + r1;
555
556         i0 = r0;  // yuyv top (even) row
557         i1 = r2;  // yuyv bottom (odd) row
558
559         p2 = [fp + ARG_lumStride];
560         p1 = p0 + p2;  // Y bot odd
561
562         p5 = [fp + ARG_width];
563         p4 = [fp + ARG_height];
564         r0 = p5;
            p2 = p2 << 1;
            p2 -= p5;      // luma row-pair step: 2*lumStride - width (p0/p1 already moved by width)
565         p4 = p4 >> 1;
566         p5 = p5 >> 2;
567
568         r2 = r0 << 1;
569         r1 = r1 << 1;
570         r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
571         r1 += -8;  // i0,i1 is pre read need to correct
572         m0 = r1;
573
574         r2 = [fp + ARG_chromStride];
575         r0 = r0 >> 1;
576         r2 = r2 - r0;  // chromStride - width/2
577         m1 = r2;
578
579         /*   I0,I1 - src input line pointers
580          *   p0,p1 - luma output line pointers
581          *   I2    - dstU
582          *   I3    - dstV
583          */
584
585         lsetup (0f, 1f) lc1 = p4;   // H/2
586 0:        r0 = [i0++] || r2 = [i1++];
587           r1 = [i0++] || r3 = [i1++];
588           r4 = bytepack(r0, r1);
589           r5 = bytepack(r2, r3);
590           lsetup (2f, 3f) lc0 = p5; // W/4
591 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
592             r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
593             r2 = r2 >> 8(v);
594             r3 = r3 >> 8(v);
595             r4 = byteop1p(r1:0, r3:2);
596             r5 = byteop1p(r1:0, r3:2) (r);
597             r6 = pack(r5.l, r4.l);
598             r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
599             r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
600             r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
601 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv
602
603           i0 += m0;
604           i1 += m0;
605           i2 += m1;
606           i3 += m1;
607           p0 = p0 + p2;  // next top luma row (p2 = 2*lumStride - width)
608 1:        p1 = p1 + p2;
609
610         (r7:4,p5:4) = [sp++];
611         unlink;
612         rts;
613 DEFUN_END(yuyvtoyv12)