From c61d6ae39e5039dcb27bf95334a86520b562bbc5 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Jos=C3=A9=20Fonseca?=
Date: Fri, 7 Sep 2007 10:52:01 +0100
Subject: [PATCH] MMX code path for YV12 copy, taken from the xserver glucose-2 branch.

---
 pixman/pixman-mmx.c  | 1102 ++++++++++++++++++++++++++++++++++++++++++++++++++
 pixman/pixman-mmx.h  |   28 ++
 pixman/pixman-pict.c |   30 +-
 3 files changed, 1159 insertions(+), 1 deletion(-)

diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 8c7be6d..4fb2579 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -30,6 +30,9 @@
  */
 #include <config.h>
 
+#include <stdlib.h> /* malloc, free */
+#include <limits.h> /* SHRT_MAX */
+
 #ifdef USE_MMX
 
 #if defined(__amd64__) || defined(__x86_64__)
@@ -917,6 +920,214 @@ void fbComposeSetupMMX(void)
     }
 }
 
+/*
+ * Weighted average of one 8-bit channel from four neighbouring pixels (tl/tr
+ * on the upper line, bl/br on the lower one).  Callers in this file always
+ * pass distx + idistx == 256 and disty + idisty == 256, so the sum is at most
+ * 255 << 16 and the shift brings it back to 0..255.
+ */
+static __inline__ uint8_t
+interpolate_bilinear (int     distx,
+                      int     idistx,
+                      int     disty,
+                      int     idisty,
+                      uint8_t tl,
+                      uint8_t tr,
+                      uint8_t bl,
+                      uint8_t br)
+{
+    return ((tl * idistx + tr * distx) * idisty +
+            (bl * idistx + br * distx) * disty) >> 16;
+}
+
+/*
+ * Bilinear blend of four 4-byte pixels, one byte at a time.  l00/l01 are the
+ * lines holding the upper left/right pixels, l10/l11 the lower left/right
+ * ones; x00..x11 are the byte offsets of those pixels within their lines.
+ */
+static __inline__ uint32_t
+interpolate_bilinear_8888 (int     distx,
+                           int     idistx,
+                           int     disty,
+                           int     idisty,
+                           uint8_t *l00,
+                           uint8_t *l01,
+                           uint8_t *l10,
+                           uint8_t *l11,
+                           int     x00,
+                           int     x01,
+                           int     x10,
+                           int     x11)
+{
+    uint8_t buffer[4];
+
+    buffer[0] = interpolate_bilinear (distx, idistx, disty, idisty,
+                                      l00[x00], l01[x01],
+                                      l10[x10], l11[x11]);
+
+    buffer[1] = interpolate_bilinear (distx, idistx, disty, idisty,
+                                      l00[x00 + 1], l01[x01 + 1],
+                                      l10[x10 + 1], l11[x11 + 1]);
+
+    buffer[2] = interpolate_bilinear (distx, idistx, disty, idisty,
+                                      l00[x00 + 2], l01[x01 + 2],
+                                      l10[x10 + 2], l11[x11 + 2]);
+
+    buffer[3] = interpolate_bilinear (distx, idistx, disty, idisty,
+                                      l00[x00 + 3], l01[x01 + 3],
+                                      l10[x10 + 3], l11[x11 + 3]);
+
+    return *((uint32_t *) buffer);
+}
+
+/*
+ * Blend the pixel at byte offset x0 of l0 / x1 of l1 with its right-hand
+ * neighbour 4 bytes further on.  Note that the neighbour is read even when
+ * its weight (distx) is 0, so offset + 4 must stay inside the line.
+ */
+static __inline__ uint32_t
+fetch_bilinear2_8888 (int     distx,
+                      int     idistx,
+                      int     disty,
+                      int     idisty,
+                      uint8_t *l0,
+                      uint8_t *l1,
+                      int     x0,
+                      int     x1)
+{
+    return interpolate_bilinear_8888 (distx,
+                                      idistx,
+                                      disty,
+                                      idisty,
+                                      l0,
+                                      l0,
+                                      l1,
+                                      l1,
+                                      x0,
+                                      x0 + 4,
+                                      x1,
+                                      x1 + 4);
+}
+
+/* Same as fetch_bilinear2_8888 with one byte offset shared by both lines. */
+static __inline__ uint32_t
+fetch_bilinear_8888 (int     distx,
+                     int     idistx,
+                     int     disty,
+                     int     idisty,
+                     uint8_t *l0,
+                     uint8_t *l1,
+                     int     x)
+{
+    return fetch_bilinear2_8888 (distx, idistx, disty, idisty, l0, l1, x, x);
+}
+
+/* Eight zero bytes, used by set_scale_steps as the all-zero source line. */
+static uint32_t _zero32x2[2] = { 0x0, 0x0 };
+static uint8_t *_zero8x8 = (uint8_t *) _zero32x2;
+
+/*
+ * Pick the two source lines (*s0 upper, *s1 lower) and the x start/step
+ * pairs for sampling at row `line'; lastLine is the index of the last row.
+ * With PIXMAN_REPEAT_PAD rows beyond either edge reuse the edge row.  With
+ * any other repeat type only line == -1 and line == lastLine are drawn, the
+ * missing neighbour being replaced by the all-zero line.
+ * Returns `width', or 0 when the row lies entirely outside the source.
+ */
+static __inline__ int
+set_scale_steps (uint32_t        *src,
+                 int             srcStride,
+                 int             xStart,
+                 int             xStep,
+                 int             width,
+                 int             line,
+                 int             lastLine,
+                 pixman_repeat_t repeatType,
+                 uint8_t         **s0,
+                 uint8_t         **s1,
+                 int             *x0,
+                 int             *x0Step,
+                 int             *x1,
+                 int             *x1Step)
+{
+    if (line < 0)
+    {
+        if (repeatType == PIXMAN_REPEAT_PAD)
+        {
+            *s0 = (uint8_t *) src;
+            *s1 = (uint8_t *) src;
+
+            *x0     = xStart;
+            *x0Step = xStep;
+            *x1     = xStart;
+            *x1Step = xStep;
+        }
+        else
+        {
+            if (line == -1)
+            {
+                *s0 = _zero8x8;
+
+                *x0     = 0;
+                *x0Step = 0;
+
+                *s1 = (uint8_t *) src;
+
+                *x1     = xStart;
+                *x1Step = xStep;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+    }
+    else if (line >= lastLine)
+    {
+        if (repeatType == PIXMAN_REPEAT_PAD)
+        {
+            *s0 = (uint8_t *) (src + srcStride * lastLine);
+            *s1 = (uint8_t *) (src + srcStride * lastLine);
+
+            *x0     = xStart;
+            *x0Step = xStep;
+            *x1     = xStart;
+            *x1Step = xStep;
+        }
+        else
+        {
+            if (line == lastLine)
+            {
+                *s0 = (uint8_t *) (src + srcStride * line);
+
+                *x0     = xStart;
+                *x0Step = xStep;
+
+                *s1 = _zero8x8;
+
+                *x1     = 0;
+                *x1Step = 0;
+            }
+            else
+            {
+                return 0;
+            }
+        }
+    }
+    else
+    {
+        *s0 = (uint8_t *) (src + srcStride * line);
+        *s1 = (uint8_t *) (src + srcStride * (line + 1));
+
+        *x0     = xStart;
+        *x0Step = xStep;
+        *x1     = xStart;
+        *x1Step = xStep;
+    }
+
+    return width;
+}
+
 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
 
 
@@ -2970,6 +3181,897 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t op,
     _mm_empty();
 }
 
+/*
+ * Cache of two converted 32-bit scanlines.  y[i] is the source row held in
+ * line[i]; lock[i] marks slot i as in use for the destination row currently
+ * being produced.  heap is non-NULL only when the lines were malloc'ed.
+ */
+typedef struct _ScanlineBuf {
+    pixman_bool_t lock[2];
+    int           y[2];
+    uint8_t       *line[2];
+    int           height;
+    uint8_t       *heap;
+} ScanlineBuf;
+
+/*
+ * Set up slb with two lines of `length' bytes each, carved out of the
+ * caller's `buffer' (`size' bytes) when it is large enough and out of a heap
+ * block otherwise.  Returns FALSE, leaving slb->heap NULL and every other
+ * field unset, when that allocation fails; slb must not be used then.
+ */
+static pixman_bool_t
+init_scanline_buffer (ScanlineBuf *slb,
+                      uint8_t     *buffer,
+                      int         size,
+                      int         length,
+                      int         height)
+{
+    int i, s;
+
+    s = length << 1;
+
+    if (size < s)
+    {
+        slb->heap = malloc (s);
+        if (!slb->heap)
+            return FALSE;
+
+        buffer = slb->heap;
+    }
+    else
+    {
+        slb->heap = NULL;
+    }
+
+    for (i = 0; i < 2; i++)
+    {
+        slb->lock[i] = FALSE;
+        slb->y[i]    = SHRT_MAX;
+        slb->line[i] = buffer;
+
+        buffer += length;
+    }
+
+    slb->height = height;
+
+    return TRUE;
+}
+
+/* Release the heap block, if any, obtained by init_scanline_buffer. */
+static void
+fini_scanline_buffer (ScanlineBuf *slb)
+{
+    if (slb->heap)
+        free (slb->heap);
+}
+
+/* Unlock both slots; the rows they hold stay cached for later lookups. */
+static __inline__ void
+release_scanlines (ScanlineBuf *slb)
+{
+    int i;
+
+    for (i = 0; i < 2; i++)
+        slb->lock[i] = FALSE;
+}
+
+/* Clamp y to the valid row range 0 .. slb->height - 1. */
+static __inline__ int
+_y_to_scanline (ScanlineBuf *slb,
+                int         y)
+{
+    return (y < 0) ? 0 : (y >= slb->height) ? slb->height - 1 : y;
+}
+
+/*
+ * Return the cached line holding row y (after clamping) and lock its slot,
+ * or NULL when that row is not in the cache.
+ */
+static __inline__ uint8_t *
+get_scanline (ScanlineBuf *slb,
+              int         y)
+{
+    int i;
+
+    y = _y_to_scanline (slb, y);
+
+    for (i = 0; i < 2; i++)
+    {
+        if (slb->y[i] == y)
+        {
+            slb->lock[i] = TRUE;
+            return slb->line[i];
+        }
+    }
+
+    return NULL;
+}
+
+/* 64-bit constants referenced as memory operands by mmx_loadyv12. */
+typedef struct {
+    ullong subYw;
+    ullong U_green;
+    ullong U_blue;
+    ullong V_red;
+    ullong V_green;
+    ullong Y_coeff;
+    ullong mmx0080;
+    ullong mmx00ff;
+} YUVData;
+
+/* Each constant is one byte or 16-bit pattern repeated across the word. */
+static const YUVData yuv = {
+    .subYw   = 0x1010101010101010ULL,
+    .U_green = 0xf377f377f377f377ULL,
+    .U_blue  = 0x408d408d408d408dULL,
+    .V_red   = 0x3313331333133313ULL,
+    .V_green = 0xe5fce5fce5fce5fcULL,
+    .Y_coeff = 0x2543254325432543ULL,
+    .mmx0080 = 0x0080008000800080ULL,
+    .mmx00ff = 0x00ff00ff00ff00ffULL
+};
+
+/*
+ * Convert 8 luma samples at py and 4 U / 4 V samples at pu / pv to RGB.
+ * The result stays in MMX registers (mm0 = blue, mm1 = red, mm2 = green,
+ * 8 bytes each) and is meant to be consumed by mmx_pack8888 right away.
+ * NOTE(review): the asm lists no clobbers, so the compiler is not told that
+ * mm0-mm7 are modified -- confirm this is acceptable for the toolchain.
+ */
+static __inline__ void
+mmx_loadyv12 (uint8_t *py,
+              uint8_t *pu,
+              uint8_t *pv)
+{
+    __asm__ __volatile__ (
+        "movq      %0,    %%mm6\n" /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        "pxor      %%mm4, %%mm4\n" /* mm4 = 0 */
+        "psubusb   %1,    %%mm6\n" /* Y -= 16 */
+        "movd      %2,    %%mm0\n" /* mm0 = 00 00 00 00 U3 U2 U1 U0 */
+        "movq      %%mm6, %%mm7\n" /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
+        "pand      %3,    %%mm6\n" /* mm6 = Y6 Y4 Y2 Y0 */
+        "psrlw     %4,    %%mm7\n" /* mm7 = Y7 Y5 Y3 Y1 */
+        "movd      %5,    %%mm1\n" /* mm1 = 00 00 00 00 V3 V2 V1 V0 */
+        "psllw     %6,    %%mm6\n" /* promote precision */
+        "pmulhw    %7,    %%mm6\n" /* mm6 = luma_rgb even */
+        "psllw     %8,    %%mm7\n" /* promote precision */
+        "punpcklbw %%mm4, %%mm0\n" /* mm0 = U3 U2 U1 U0 */
+        "psubsw    %9,    %%mm0\n" /* U -= 128 */
+        "punpcklbw %%mm4, %%mm1\n" /* mm1 = V3 V2 V1 V0 */
+        "pmulhw    %10,   %%mm7\n" /* mm7 = luma_rgb odd */
+        "psllw     %11,   %%mm0\n" /* promote precision */
+        "psubsw    %12,   %%mm1\n" /* V -= 128 */
+        "movq      %%mm0, %%mm2\n" /* mm2 = U3 U2 U1 U0 */
+        "psllw     %13,   %%mm1\n" /* promote precision */
+        "movq      %%mm1, %%mm4\n" /* mm4 = V3 V2 V1 V0 */
+        "pmulhw    %14,   %%mm0\n" /* mm0 = chroma_b */
+        "pmulhw    %15,   %%mm1\n" /* mm1 = chroma_r */
+        "movq      %%mm0, %%mm3\n" /* mm3 = chroma_b */
+        "paddsw    %%mm6, %%mm0\n" /* mm0 = B6 B4 B2 B0 */
+        "paddsw    %%mm7, %%mm3\n" /* mm3 = B7 B5 B3 B1 */
+        "packuswb  %%mm0, %%mm0\n" /* saturate to 0-255 */
+        "pmulhw    %16,   %%mm2\n" /* mm2 = U * U_green */
+        "packuswb  %%mm3, %%mm3\n" /* saturate to 0-255 */
+        "punpcklbw %%mm3, %%mm0\n" /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
+        "pmulhw    %17,   %%mm4\n" /* mm4 = V * V_green */
+        "paddsw    %%mm4, %%mm2\n" /* mm2 = chroma_g */
+        "movq      %%mm2, %%mm5\n" /* mm5 = chroma_g */
+        "movq      %%mm1, %%mm4\n" /* mm4 = chroma_r */
+        "paddsw    %%mm6, %%mm2\n" /* mm2 = G6 G4 G2 G0 */
+        "packuswb  %%mm2, %%mm2\n" /* saturate to 0-255 */
+        "paddsw    %%mm6, %%mm1\n" /* mm1 = R6 R4 R2 R0 */
+        "packuswb  %%mm1, %%mm1\n" /* saturate to 0-255 */
+        "paddsw    %%mm7, %%mm4\n" /* mm4 = R7 R5 R3 R1 */
+        "packuswb  %%mm4, %%mm4\n" /* saturate to 0-255 */
+        "paddsw    %%mm7, %%mm5\n" /* mm5 = G7 G5 G3 G1 */
+        "packuswb  %%mm5, %%mm5\n" /* saturate to 0-255 */
+        "punpcklbw %%mm4, %%mm1\n" /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
+        "punpcklbw %%mm5, %%mm2\n" /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
+        : /* no outputs */
+        : "m" (*py), "m" (yuv.subYw), "m" (*pu), "m" (yuv.mmx00ff),
+          "i" (8), "m" (*pv), "i" (3), "m" (yuv.Y_coeff),
+          "i" (3), "m" (yuv.mmx0080), "m" (yuv.Y_coeff), "i" (3),
+          "m" (yuv.mmx0080), "i" (3), "m" (yuv.U_blue), "m" (yuv.V_red),
+          "m" (yuv.U_green), "m" (yuv.V_green));
+}
+
+/*
+ * Interleave the blue/green/red bytes left in mm0/mm2/mm1 by mmx_loadyv12
+ * with an all-ones alpha byte and store the 8 resulting pixels (32 bytes)
+ * at image.  Alpha is 0xff to match what loadyuv produces.
+ */
+static __inline__ void
+mmx_pack8888 (uint8_t *image)
+{
+    __asm__ __volatile__ (
+        "pcmpeqb   %%mm3, %%mm3\n" /* mm3 = all ones, i.e. alpha 0xff */
+        "movq      %%mm0, %%mm6\n"
+        "punpcklbw %%mm2, %%mm6\n"
+        "movq      %%mm1, %%mm7\n"
+        "punpcklbw %%mm3, %%mm7\n"
+        "movq      %%mm0, %%mm4\n"
+        "punpcklwd %%mm7, %%mm6\n"
+        "movq      %%mm1, %%mm5\n"
+        "movq      %%mm6, (%0)\n"
+        "movq      %%mm0, %%mm6\n"
+        "punpcklbw %%mm2, %%mm6\n"
+        "punpckhwd %%mm7, %%mm6\n"
+        "movq      %%mm6, 8(%0)\n"
+        "punpckhbw %%mm2, %%mm4\n"
+        "punpckhbw %%mm3, %%mm5\n"
+        "punpcklwd %%mm5, %%mm4\n"
+        "movq      %%mm4, 16(%0)\n"
+        "movq      %%mm0, %%mm4\n"
+        "punpckhbw %%mm2, %%mm4\n"
+        "punpckhwd %%mm5, %%mm4\n"
+        "movq      %%mm4, 24(%0)\n"
+        : /* no outputs */
+        : "r" (image) );
+}
+
+/*
+ * Scalar conversion of one Y/U/V sample triple to 0xffRRGGBB, using 16.16
+ * fixed-point coefficients and clamping each channel to 0..255.
+ */
+static __inline__ uint32_t
+loadyuv (uint8_t *py,
+         uint8_t *pu,
+         uint8_t *pv)
+{
+    int16_t y, u, v;
+    int32_t r, g, b;
+
+    y = *py - 16;
+    u = *pu - 128;
+    v = *pv - 128;
+
+    /* R = 1.164(Y - 16) + 1.596(V - 128) */
+    r = 0x012b27 * y + 0x019a2e * v;
+    /* G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128) */
+    g = 0x012b27 * y - 0x00d0f2 * v - 0x00647e * u;
+    /* B = 1.164(Y - 16) + 2.018(U - 128) */
+    b = 0x012b27 * y + 0x0206a2 * u;
+
+    return 0xff000000 |
+        (r >= 0 ? r < 0x1000000 ? r & 0xff0000 : 0xff0000 : 0) |
+        (g >= 0 ? g < 0x1000000 ? (g >> 8) & 0x00ff00 : 0x00ff00 : 0) |
+        (b >= 0 ? b < 0x1000000 ? (b >> 16) & 0x0000ff : 0x0000ff : 0);
+}
+
+/*
+ * Convert row y (clamped to the image) of a YV12 source into the first
+ * unlocked slot of slb, lock the slot and return its line.  The caller must
+ * guarantee that a slot is free.  The x argument is not used.
+ */
+static __inline__ uint8_t *
+loadyv12_scanline (ScanlineBuf *slb,
+                   int         y,
+                   uint8_t     *srcY,
+                   int         yStride,
+                   uint8_t     *srcU,
+                   uint8_t     *srcV,
+                   int         uvStride,
+                   int         x,
+                   int         width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int i, w;
+
+    y = _y_to_scanline (slb, y);
+
+    for (i = 0; slb->lock[i]; i++);
+
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = srcY + yStride  * (y >> 0);
+    pu = srcU + uvStride * (y >> 1);
+    pv = srcV + uvStride * (y >> 1);
+
+    pd = slb->line[i];
+
+    w = width;
+
+    while (w && (unsigned long) py & 7)
+    {
+        *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+        pd += 4;
+        py += 1;
+
+        if (w & 1)
+        {
+            pu += 1;
+            pv += 1;
+        }
+
+        w--;
+    }
+
+    while (w >= 8)
+    {
+        mmx_loadyv12 (py, pu, pv);
+        mmx_pack8888 (pd);
+
+        py += 8;
+        pu += 4;
+        pv += 4;
+        pd += 32;
+
+        w -= 8;
+    }
+
+    while (w)
+    {
+        *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+        pd += 4;
+        py += 1;
+
+        if (w & 1)
+        {
+            pu += 1;
+            pv += 1;
+        }
+
+        w--;
+    }
+
+    return slb->line[i];
+}
+
+/*
+ * YUY2 counterpart of loadyv12_scanline: convert row y (clamped) into the
+ * first unlocked slot of slb, lock it and return its line.  x is not used.
+ */
+static __inline__ uint8_t *
+loadyuy2_scanline (ScanlineBuf *slb,
+                   int         y,
+                   uint8_t     *src,
+                   int         stride,
+                   int         x,
+                   int         width)
+{
+    uint8_t *py, *pu, *pv, *pd;
+    int i, w;
+
+    y = _y_to_scanline (slb, y);
+
+    for (i = 0; slb->lock[i]; i++);
+
+    slb->y[i]    = y;
+    slb->lock[i] = TRUE;
+
+    py = src + stride * (y >> 0);
+    pu = py + 1;
+    pv = py + 3;
+
+    pd = slb->line[i];
+
+    w = width;
+
+    while (w)
+    {
+        *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+        pd += 4;
+        py += 2;
+
+        if (w & 1)
+        {
+            pu += 4;
+            pv += 4;
+        }
+
+        w--;
+    }
+
+    return slb->line[i];
+}
+
+/* TODO: MMX code for bilinear interpolation */
+/*
+ * Convert a YV12 source to a 32-bit destination.  Without a transform (or
+ * with a plain vertical flip) rows are converted straight into dst.  Any
+ * other transform is treated as scale + translate: converted scanlines are
+ * cached in a ScanlineBuf and sampled with nearest or bilinear filtering,
+ * replicating the edge pixels.  op, pMask, xMask and yMask are unused.
+ */
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t          xSrc,
+                             int16_t          ySrc,
+                             int16_t          xMask,
+                             int16_t          yMask,
+                             int16_t          xDst,
+                             int16_t          yDst,
+                             uint16_t         width,
+                             uint16_t         height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t *dst, *srcY, *srcU, *srcV;
+    uint32_t *srcBits = pSrc->bits.bits;
+    int srcStride, uvStride;
+    uint32_t *dstBits = pDst->bits.bits;
+    int dstStride;
+    int offset, w;
+    uint8_t *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+
+    srcY = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride;
+
+    if (srcStride < 0)
+    {
+        offset = ((-srcStride) >> 1) * ((pSrc->bits.height - 1) >> 1) -
+            srcStride;
+        srcV = (uint8_t *) (srcBits + offset);
+        offset += ((-srcStride) >> 1) * ((pSrc->bits.height) >> 1);
+        srcU = (uint8_t *) (srcBits + offset);
+    }
+    else
+    {
+        offset = srcStride * pSrc->bits.height;
+
+        srcV = (uint8_t *) (srcBits + offset);
+        srcU = (uint8_t *) (srcBits + offset + (offset >> 2));
+    }
+
+    srcStride *= sizeof (uint32_t);
+    uvStride = srcStride >> 1;
+
+    if (transform)
+    {
+        /* transformation is a Y coordinate flip, this is achieved by
+           moving start offsets for each plane and changing sign of stride */
+        if (transform->matrix[0][0] == (1 << 16) &&
+            transform->matrix[1][1] == -(1 << 16) &&
+            transform->matrix[0][2] == 0 &&
+            transform->matrix[1][2] == (pSrc->bits.height << 16))
+        {
+            srcY = srcY + ((pSrc->bits.height >> 0) - 1) * srcStride;
+            srcU = srcU + ((pSrc->bits.height >> 1) - 1) * uvStride;
+            srcV = srcV + ((pSrc->bits.height >> 1) - 1) * uvStride;
+
+            srcStride = -srcStride;
+            uvStride  = -uvStride;
+
+            transform = 0;
+        }
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+        ScanlineBuf slb;
+        uint8_t _scanline_buf[8192];
+        uint8_t *ps0, *ps1;
+        int x, x0, y, line, xStep, yStep;
+        int distx, idistx, disty, idisty;
+        int srcEnd = pSrc->bits.width << 16;
+        int srcEndIndex = (pSrc->bits.width - 1) << 16;
+        int srcLast = pSrc->bits.width - 1; /* index of the last column */
+
+        xStep = transform->matrix[0][0];
+        yStep = transform->matrix[1][1];
+
+        x0 = transform->matrix[0][2] + xStep * xSrc;
+        y  = transform->matrix[1][2] + yStep * ySrc;
+
+        if (!init_scanline_buffer (&slb,
+                                   _scanline_buf, sizeof (_scanline_buf),
+                                   pSrc->bits.width << 2,
+                                   pSrc->bits.height))
+            return; /* allocation failed; no MMX state to clear yet */
+
+        while (height--)
+        {
+            disty  = (y >> 8) & 0xff;
+            idisty = 256 - disty;
+            line   = y >> 16;
+
+            ps0 = get_scanline (&slb, line);
+            ps1 = get_scanline (&slb, line + 1);
+
+            if (!ps0)
+                ps0 = loadyv12_scanline (&slb, line,
+                                         srcY, srcStride, srcU, srcV, uvStride,
+                                         0, pSrc->bits.width);
+
+            if (!ps1)
+                ps1 = loadyv12_scanline (&slb, line + 1,
+                                         srcY, srcStride, srcU, srcV, uvStride,
+                                         0, pSrc->bits.width);
+
+            pd = dst;
+
+            x = x0;
+            w = width;
+
+            if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+            {
+                while (w && x < 0)
+                {
+                    *(uint32_t *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+                                                            ps0, ps1, 0);
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w && x < srcEndIndex)
+                {
+                    distx  = (x >> 8) & 0xff;
+                    idistx = 256 - distx;
+
+                    *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+                                                            disty, idisty,
+                                                            ps0, ps1,
+                                                            (x >> 14) & ~3);
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w)
+                {
+                    /* at or past the last column: replicate it rather than
+                       reading beyond the end of the scanlines */
+                    *(uint32_t *) pd =
+                        interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                   ps0, ps0, ps1, ps1,
+                                                   srcLast << 2, srcLast << 2,
+                                                   srcLast << 2, srcLast << 2);
+
+                    pd += 4;
+                    w  -= 1;
+                }
+            }
+            else
+            {
+                while (w && x < 0)
+                {
+                    *(uint32_t *) pd = *(uint32_t *) ps0;
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w && x < srcEnd)
+                {
+                    *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w)
+                {
+                    /* past the last column: replicate it */
+                    *(uint32_t *) pd = ((uint32_t *) ps0)[srcLast];
+
+                    pd += 4;
+                    w  -= 1;
+                }
+            }
+
+            y   += yStep;
+            dst += dstStride;
+
+            release_scanlines (&slb);
+        }
+
+        fini_scanline_buffer (&slb);
+    }
+    else
+    {
+        uint8_t *py, *pu, *pv;
+
+        srcY += srcStride * (ySrc >> 0) + (xSrc >> 0);
+        srcU += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+        srcV += uvStride  * (ySrc >> 1) + (xSrc >> 1);
+
+        while (height)
+        {
+            py = srcY;
+            pu = srcU;
+            pv = srcV;
+            pd = dst;
+
+            w = width;
+
+            while (w && (unsigned long) py & 7)
+            {
+                *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+                pd += 4;
+                py += 1;
+
+                if (w & 1)
+                {
+                    pu += 1;
+                    pv += 1;
+                }
+
+                w--;
+            }
+
+            while (w >= 8)
+            {
+                mmx_loadyv12 (py, pu, pv);
+                mmx_pack8888 (pd);
+
+                py += 8;
+                pu += 4;
+                pv += 4;
+                pd += 32;
+
+                w -= 8;
+            }
+
+            while (w)
+            {
+                *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+                pd += 4;
+                py += 1;
+
+                if (w & 1)
+                {
+                    pu += 1;
+                    pv += 1;
+                }
+
+                w--;
+            }
+
+            dst  += dstStride;
+            srcY += srcStride;
+
+            if (height & 1)
+            {
+                srcU += uvStride;
+                srcV += uvStride;
+            }
+
+            height--;
+        }
+    }
+
+    _mm_empty ();
+}
+
+/* TODO: MMX code for yuy2 */
+/*
+ * Convert a YUY2 source (bytes Y0 U Y1 V per pixel pair) to a 32-bit
+ * destination; same structure as fbCompositeSrc_yv12x8888mmx, all scalar.
+ * NOTE(review): the untransformed path offsets src by xSrc bytes although
+ * luma advances 2 bytes per pixel, and steps chroma on the parity of the
+ * remaining width -- confirm callers pass xSrc == 0 and an even width.
+ */
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t          xSrc,
+                             int16_t          ySrc,
+                             int16_t          xMask,
+                             int16_t          yMask,
+                             int16_t          xDst,
+                             int16_t          yDst,
+                             uint16_t         width,
+                             uint16_t         height)
+{
+    pixman_transform_t *transform = pSrc->common.transform;
+    uint8_t *dst, *src;
+    uint32_t *srcBits = pSrc->bits.bits;
+    int srcStride;
+    uint32_t *dstBits = pDst->bits.bits;
+    int dstStride;
+    int w;
+    uint8_t *pd;
+
+    dst = (uint8_t *) dstBits;
+    dstStride = pDst->bits.rowstride * sizeof (uint32_t);
+
+    src = (uint8_t *) srcBits;
+    srcStride = pSrc->bits.rowstride * sizeof (uint32_t);
+
+    if (transform)
+    {
+        /* transformation is a Y coordinate flip, this is achieved by
+           moving start offsets for each plane and changing sign of stride */
+        if (transform->matrix[0][0] == (1 << 16) &&
+            transform->matrix[1][1] == -(1 << 16) &&
+            transform->matrix[0][2] == 0 &&
+            transform->matrix[1][2] == (pSrc->bits.height << 16))
+        {
+            src = src + (pSrc->bits.height - 1) * srcStride;
+
+            srcStride = -srcStride;
+
+            transform = 0;
+        }
+    }
+
+    dst += dstStride * yDst + (xDst << 2);
+
+    if (transform)
+    {
+        ScanlineBuf slb;
+        uint8_t _scanline_buf[8192];
+        uint8_t *ps0, *ps1;
+        int x, x0, y, line, xStep, yStep;
+        int distx, idistx, disty, idisty;
+        int srcEnd = pSrc->bits.width << 16;
+        int srcEndIndex = (pSrc->bits.width - 1) << 16;
+        int srcLast = pSrc->bits.width - 1; /* index of the last column */
+
+        xStep = transform->matrix[0][0];
+        yStep = transform->matrix[1][1];
+
+        x0 = transform->matrix[0][2] + xStep * xSrc;
+        y  = transform->matrix[1][2] + yStep * ySrc;
+
+        if (!init_scanline_buffer (&slb,
+                                   _scanline_buf, sizeof (_scanline_buf),
+                                   pSrc->bits.width << 2,
+                                   pSrc->bits.height))
+            return; /* allocation failed; nothing has been drawn */
+
+        while (height--)
+        {
+            disty  = (y >> 8) & 0xff;
+            idisty = 256 - disty;
+            line   = y >> 16;
+
+            ps0 = get_scanline (&slb, line);
+            ps1 = get_scanline (&slb, line + 1);
+
+            if (!ps0)
+                ps0 = loadyuy2_scanline (&slb, line,
+                                         src, srcStride,
+                                         0, pSrc->bits.width);
+
+            if (!ps1)
+                ps1 = loadyuy2_scanline (&slb, line + 1,
+                                         src, srcStride,
+                                         0, pSrc->bits.width);
+
+            pd = dst;
+
+            x = x0;
+            w = width;
+
+            if (pSrc->common.filter == PIXMAN_FILTER_BILINEAR)
+            {
+                while (w && x < 0)
+                {
+                    *(uint32_t *) pd = fetch_bilinear_8888 (0, 256, disty, idisty,
+                                                            ps0, ps1, 0);
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w && x < srcEndIndex)
+                {
+                    distx  = (x >> 8) & 0xff;
+                    idistx = 256 - distx;
+
+                    *(uint32_t *) pd = fetch_bilinear_8888 (distx, idistx,
+                                                            disty, idisty,
+                                                            ps0, ps1,
+                                                            (x >> 14) & ~3);
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w)
+                {
+                    /* at or past the last column: replicate it rather than
+                       reading beyond the end of the scanlines */
+                    *(uint32_t *) pd =
+                        interpolate_bilinear_8888 (0, 256, disty, idisty,
+                                                   ps0, ps0, ps1, ps1,
+                                                   srcLast << 2, srcLast << 2,
+                                                   srcLast << 2, srcLast << 2);
+
+                    pd += 4;
+                    w  -= 1;
+                }
+            }
+            else
+            {
+                while (w && x < 0)
+                {
+                    *(uint32_t *) pd = *(uint32_t *) ps0;
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w && x < srcEnd)
+                {
+                    *(uint32_t *) pd = ((uint32_t *) ps0)[x >> 16];
+
+                    x  += xStep;
+                    pd += 4;
+                    w  -= 1;
+                }
+
+                while (w)
+                {
+                    /* past the last column: replicate it */
+                    *(uint32_t *) pd = ((uint32_t *) ps0)[srcLast];
+
+                    pd += 4;
+                    w  -= 1;
+                }
+            }
+
+            y   += yStep;
+            dst += dstStride;
+
+            release_scanlines (&slb);
+        }
+
+        fini_scanline_buffer (&slb);
+    }
+    else
+    {
+        uint8_t *py, *pu, *pv;
+
+        src += srcStride * (ySrc >> 0) + xSrc;
+
+        while (height)
+        {
+            py = src;
+            pu = src + 1;
+            pv = src + 3;
+            pd = dst;
+
+            w = width;
+
+            while (w)
+            {
+                *((uint32_t *) pd) = loadyuv (py, pu, pv);
+
+                pd += 4;
+                py += 2;
+
+                if (w & 1)
+                {
+                    pu += 4;
+                    pv += 4;
+                }
+
+                w--;
+            }
+
+            dst += dstStride;
+            src += srcStride;
+
+            height--;
+        }
+    }
+}
 
 
 #endif /* USE_MMX */
diff --git a/pixman/pixman-mmx.h b/pixman/pixman-mmx.h
index a74d4ba..a8e27e4 100644
--- a/pixman/pixman-mmx.h
+++ b/pixman/pixman-mmx.h
@@ -312,4 +312,32 @@ fbCompositeOver_x888x8x8888mmx (pixman_op_t      op,
 				uint16_t         width,
 				uint16_t         height);
 
+/*
+ * YV12 source -> 32-bit destination; chosen by pixman_image_composite for
+ * a8r8g8b8/x8r8g8b8 destinations when MMX is available.
+ */
+void
+fbCompositeSrc_yv12x8888mmx (pixman_op_t      op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t xSrc,   int16_t ySrc,
+                             int16_t xMask,  int16_t yMask,
+                             int16_t xDst,   int16_t yDst,
+                             uint16_t width, uint16_t height);
+
+/*
+ * YUY2 source -> 32-bit destination; chosen by pixman_image_composite for
+ * a8r8g8b8/x8r8g8b8 destinations when MMX is available.
+ */
+void
+fbCompositeSrc_yuy2x8888mmx (pixman_op_t      op,
+                             pixman_image_t * pSrc,
+                             pixman_image_t * pMask,
+                             pixman_image_t * pDst,
+                             int16_t xSrc,   int16_t ySrc,
+                             int16_t xMask,  int16_t yMask,
+                             int16_t xDst,   int16_t yDst,
+                             uint16_t width, uint16_t height);
+
 #endif /* USE_MMX */
diff --git a/pixman/pixman-pict.c b/pixman/pixman-pict.c
index c7d73fc..d2bc3d3 100644
--- a/pixman/pixman-pict.c
+++ b/pixman/pixman-pict.c
@@ -1437,7 +1437,35 @@ pixman_image_composite (pixman_op_t      op,
 	maskTransform = FALSE;
     }
 
-    if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
+    /* YUV is only used internally for XVideo.  bits.format is only
+     * meaningful for BITS images, so check the type before reading it. */
+    if (pSrc->type == BITS &&
+        (pSrc->bits.format == PIXMAN_yv12 || pSrc->bits.format == PIXMAN_yuy2))
+    {
+#ifdef USE_MMX
+        /* non rotating transformation */
+        if (!pSrc->common.transform ||
+            (pSrc->common.transform->matrix[0][1] == 0 &&
+             pSrc->common.transform->matrix[1][0] == 0 &&
+             pSrc->common.transform->matrix[2][0] == 0 &&
+             pSrc->common.transform->matrix[2][1] == 0 &&
+             pSrc->common.transform->matrix[2][2] == 1 << 16))
+        {
+            switch (pDst->bits.format) {
+            case PIXMAN_a8r8g8b8:
+            case PIXMAN_x8r8g8b8:
+                if (pixman_have_mmx())
+                    func = (pSrc->bits.format == PIXMAN_yv12) ?
+                           fbCompositeSrc_yv12x8888mmx :
+                           fbCompositeSrc_yuy2x8888mmx;
+                break;
+            default:
+                break;
+            }
+        }
+#endif
+    }
+    else if ((pSrc->type == BITS || can_get_solid (pSrc)) && (!pMask || pMask->type == BITS)
         && !srcTransform && !maskTransform
         && !maskAlphaMap && !srcAlphaMap && !dstAlphaMap
         && (pSrc->common.filter != PIXMAN_FILTER_CONVOLUTION)
-- 
2.7.4