gst/videoscale/vs_lanczos.c

   1 /*
   2  * Image Scaling Functions
   3  * Copyright (c) 2011 David A. Schleef <ds@schleef.org>
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  19  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  23  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  24  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  25  * POSSIBILITY OF SUCH DAMAGE.
  26  */
  27 /*
  28  *
  29  * Modified Lanczos scaling algorithm
  30  * ==================================
  31  *
  32  * This algorithm was developed by the author.  The primary goals of
  33  * the algorithm are high-quality video downscaling for medium scale
  34  * factors (in the range of 1.3x to 5.0x) using methods that can be
  35  * converted to SIMD code.  Concerns with existing algorithms were
  36  * mainly related to either over-soft filtering (Lanczos) or aliasing
  37  * (bilinear or any other method with inadequate sampling).
  38  *
  39  * The problems with bilinear scaling are apparent when downscaling
  40  * more than a factor of 2.  For example, when downscaling by a factor
  41  * of 3, only two-thirds of the input pixels contribute to the output
  42  * pixels.  This is only considering scaling in one direction; after
  43  * scaling both vertically and horizontally in a 2-D image, fewer than
  44  * half of the input pixels contribute to the output, so it should not
  45  * be surprising that the output is suboptimal.
  46  *
  47  * The problems with Lanczos scaling are more subtle.  From a theoretical
  48  * perspective, Lanczos is an optimal algorithm for resampling equally-
  49  * spaced values.  This theoretical perspective is based on analysis
  50  * done in frequency space, thus, Lanczos works very well for audio
  51  * resampling, since the ear hears primarily in frequency space.  The
  52  * human visual system is sensitive primarily in the spatial domain,
  53  * therefore any resampling algorithm should take this into account.
  54  * This difference is immediately clear in the size of resampling
  55  * window or envelope that is chosen for resampling: for audio, an
  56  * envelope of a=64 is typical, in image scaling, the envelope is
  57  * usually a=2 or a=3.
  58  *
  59  * One result of the HVS being sensitive in the spatial domain (and
  60  * also probably due to oversampling capabilities of the retina and
  61  * visual cortex) is that it is less sensitive to the exact magnitude
  62  * of high-frequency visual signals than to the appropriate amount of
  63  * energy in the nearby frequency band.  A Lanczos kernel with a=2
  64  * or a=3 strongly decreases the amount of energy in the high frequency
  65  * bands.  The energy in this area can be increased by increasing a,
  66  * which brings in energy from different areas of the image (bad for
  67  * reasons mentioned above), or by oversampling the input data.  We
  68  * have chosen two methods for doing the latter.  Firstly, there is
  69  * a sharpness parameter, which increases the cutoff frequency of the
  70  * filter, aliasing higher frequency noise into the passband.  And
  71  * secondly, there is the sharpen parameter, which increases the
  72  * contribution of high-frequency (but in-band) components.
  73  *
  74  * An alternate explanation of the usefulness of a sharpening filter
  75  * is that many natural images have a roughly 1/f spectrum.  In order
  76  * for a downsampled image to look more "natural" when high frequencies
  77  * are removed, the frequencies in the pass band near the cutoff
  78  * frequency are amplified, causing the spectrum to be more roughly
  79  * 1/f.  I said "roughly", not "literally".
  80  *
  81  * This alternate explanation is useful for understanding the author's
  82  * secondary motivation for developing this algorithm, namely, as a
  83  * method of video compression.  Several recent techniques (such as
  84  * HTTP Live Streaming and SVC) use image scaling as a method to get
  85  * increased compression out of nominally non-scalable codecs such as
  86  * H.264.  For optimal quality, it is thusly important to consider
  87  * the scaler and encoder as a combined unit.  Tuning of the sharpness
  88  * and sharpen parameters was performed using the Toro encoder tuner,
  89  * where scaled and encoded video was compared to unscaled and encoded
  90  * video.  This tuning suggested values that were very close to the
  91  * values chosen by manual inspection of scaled images and video.
  92  *
  93  * The optimal values of sharpen and sharpness were slightly different
  94  * depending whether the comparison was still images or video.  Video
  95  * comparisons were more sensitive to aliasing, since the aliasing
  96  * artifacts tended to move or "crawl" around the video.  The default
  97  * values are for video; image scaling may prefer higher values.
  98  *
  99  * A number of related techniques were rejected for various reasons.
 100  * An early technique of selecting the sharpness factor locally based
  101  * on edge detection (in order to use higher sharpness values without
 102  * the corresponding aliasing on edges) worked very well for still
 103  * images, but caused too much "crawling" on textures in video.  Also,
 104  * this method is slow, as it does not parallelize well.
 105  *
 106  * Non-separable techniques were rejected because the fastest would
 107  * have been at least 4x slower.
 108  *
 109  * It is infrequently appreciated that image scaling should ideally be
 110  * done in linear light space.  Converting to linear light space has
 111  * a similar effect to a sharpening filter.  This approach was not
 112  * taken because the added benefit is minor compared to the additional
  113  * computational cost.  Moreover, the benefit is decreased by increasing
 114  * the strength of the sharpening filter.
 115  *
 116  */
 117 #include <string.h>
 118
 119 #include "vs_scanline.h"
 120 #include "vs_image.h"
 121
 122 #include "gstvideoscaleorc.h"
 123 #include <gst/gst.h>
 124 #include <math.h>
 125
/* True when x lies outside the closed interval [a, b]. */
#define NEED_CLAMP(x,a,b) ((x) < (a) || (x) > (b))

/* Round x up to the next multiple of 2, 4 or 8 respectively. */
#define ROUND_UP_2(x)  (((x)+1)&~1)
#define ROUND_UP_4(x)  (((x)+3)&~3)
#define ROUND_UP_8(x)  (((x)+7)&~7)

/* Address of source row i.  Expands to code that reads a variable named
 * `scale`, which must be in scope at the point of use. */
#define SRC_LINE(i) (scale->src->pixels + scale->src->stride * (i))

/* Address of row i of scale->tmpdata, viewed as rows of dest->width
 * elements of the named type.  The _AYUV variants use rows of
 * 4 * dest->width elements.  All of them read a variable named `scale`. */
#define TMP_LINE_S16(i) ((gint16 *)scale->tmpdata + (i)*(scale->dest->width))
#define TMP_LINE_S32(i) ((gint32 *)scale->tmpdata + (i)*(scale->dest->width))
#define TMP_LINE_FLOAT(i) ((float *)scale->tmpdata + (i)*(scale->dest->width))
#define TMP_LINE_DOUBLE(i) ((double *)scale->tmpdata + (i)*(scale->dest->width))
#define TMP_LINE_S16_AYUV(i) ((gint16 *)scale->tmpdata + (i)*4*(scale->dest->width))
#define TMP_LINE_S32_AYUV(i) ((gint32 *)scale->tmpdata + (i)*4*(scale->dest->width))
#define TMP_LINE_FLOAT_AYUV(i) ((float *)scale->tmpdata + (i)*4*(scale->dest->width))
#define TMP_LINE_DOUBLE_AYUV(i) ((double *)scale->tmpdata + (i)*4*(scale->dest->width))

/* Advance pointer a by b bytes, whatever its pointed-to type; yields void *. */
#define PTR_OFFSET(a,b) ((void *)((char *)(a) + (b)))
 144
/* Type of the horizontal resampling kernel stored in
 * Scale.horiz_resample_func.  The kernels generated by the RESAMPLE_HORIZ*
 * macros use this parameter order with concretely typed pointers: `dest`
 * receives `n` outputs, output i being built from source elements starting
 * at offsets[i] and weighted by the i-th group of taps. */
typedef void (*HorizResampleFunc) (void *dest, const gint32 * offsets,
    const void *taps, const void *src, int n_taps, int shift, int n);
 147
/* Resampling description for one axis, as filled in by
 * scale1d_calculate_taps() and released by scale1d_cleanup(). */
typedef struct _Scale1D Scale1D;
struct _Scale1D
{
  /* NOTE(review): not written by scale1d_calculate_taps(); purpose is not
   * evident from this part of the file. */
  int n;
  /* scale / 2 - 0.5: source position of destination element 0. */
  double offset;
  /* src_size / dest_size. */
  double scale;

  /* Argument scale for sinc(): sharpness, divided by `scale` when
   * downscaling (scale > 1). */
  double fx;
  /* Argument scale for envelope(): fx / a. */
  double ex;
  /* ceil (a / fx): half-width of the filter window in source elements. */
  int dx;

  /* Number of taps stored per destination element. */
  int n_taps;
  /* One entry per destination element: index of the first source element
   * its taps apply to.  Allocated with g_malloc(). */
  gint32 *offsets;
  /* n_taps coefficients per destination element.  The element type (double,
   * float, gint32 or gint16) depends on which scale1d_calculate_taps*()
   * variant produced it.  Allocated with g_malloc(). */
  void *taps;
};
 163
/* State shared by the two passes of one image-scaling operation. */
typedef struct _Scale Scale;
struct _Scale
{
  /* Destination and source images. */
  const VSImage *dest;
  const VSImage *src;

  double sharpness;
  /* When TRUE, the vertical pass uses the dithering kernel instead of the
   * rounding one (see vs_scale_lanczos_Y_int16()). */
  gboolean dither;

  /* Buffer of horizontally resampled rows, addressed through the TMP_LINE_*
   * macros. */
  void *tmpdata;

  /* Kernel used for the horizontal pass. */
  HorizResampleFunc horiz_resample_func;

  /* Per-axis taps and offsets. */
  Scale1D x_scale1d;
  Scale1D y_scale1d;
};
 180
/* Forward declarations of the per-precision implementations that
 * vs_image_scale_lanczos_Y() and vs_image_scale_lanczos_AYUV() dispatch to
 * according to their `submethod` argument. */
static void
vs_image_scale_lanczos_Y_int16 (const VSImage * dest, const VSImage * src,
    uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
    double sharpen);
static void vs_image_scale_lanczos_Y_int32 (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
static void vs_image_scale_lanczos_Y_float (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
static void vs_image_scale_lanczos_Y_double (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
static void
vs_image_scale_lanczos_AYUV_int16 (const VSImage * dest, const VSImage * src,
    uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
    double sharpen);
static void vs_image_scale_lanczos_AYUV_int32 (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
static void vs_image_scale_lanczos_AYUV_float (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
static void vs_image_scale_lanczos_AYUV_double (const VSImage * dest,
    const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
    double a, double sharpen);
 207
 208 static double
 209 sinc (double x)
 210 {
 211   if (x == 0)
 212     return 1;
 213   return sin (G_PI * x) / (G_PI * x);
 214 }
 215
 216 static double
 217 envelope (double x)
 218 {
 219   if (x <= -1 || x >= 1)
 220     return 0;
 221   return sinc (x);
 222 }
 223
 224 static int
 225 scale1d_get_n_taps (int src_size, int dest_size, double a, double sharpness)
 226 {
 227   double scale;
 228   double fx;
 229   int dx;
 230
 231   scale = src_size / (double) dest_size;
 232   if (scale > 1.0) {
 233     fx = (1.0 / scale) * sharpness;
 234   } else {
 235     fx = (1.0) * sharpness;
 236   }
 237   dx = ceil (a / fx);
 238
 239   return 2 * dx;
 240 }
 241
 242 static void
 243 scale1d_cleanup (Scale1D * scale)
 244 {
 245   g_free (scale->taps);
 246   g_free (scale->offsets);
 247 }
 248
 249 /*
 250  * Calculates a set of taps for each destination element in double
 251  * format.  Each set of taps sums to 1.0.
 252  *
 253  */
 254 static void
 255 scale1d_calculate_taps (Scale1D * scale, int src_size, int dest_size,
 256     int n_taps, double a, double sharpness, double sharpen)
 257 {
 258   int j;
 259   double *tap_array;
 260   gint32 *offsets;
 261   double scale_offset;
 262   double scale_increment;
 263   int dx;
 264   double fx;
 265   double ex;
 266
 267   scale->scale = src_size / (double) dest_size;
 268   scale->offset = scale->scale / 2 - 0.5;
 269
 270   if (scale->scale > 1.0) {
 271     scale->fx = (1.0 / scale->scale) * sharpness;
 272   } else {
 273     scale->fx = (1.0) * sharpness;
 274   }
 275   scale->ex = scale->fx / a;
 276   scale->dx = ceil (a / scale->fx);
 277
 278   g_assert (n_taps >= 2 * scale->dx);
 279   scale->n_taps = n_taps;
 280
 281   scale->taps = g_malloc (sizeof (double) * scale->n_taps * dest_size);
 282   scale->offsets = g_malloc (sizeof (gint32) * dest_size);
 283   tap_array = scale->taps;
 284   offsets = scale->offsets;
 285
 286   scale_offset = scale->offset;
 287   scale_increment = scale->scale;
 288   dx = scale->dx;
 289   fx = scale->fx;
 290   ex = scale->ex;
 291
 292   for (j = 0; j < dest_size; j++) {
 293     double x;
 294     int xi;
 295     int l;
 296     double weight;
 297     double *taps;
 298
 299     x = scale_offset + scale_increment * j;
 300     x = CLAMP (x, 0, src_size);
 301     xi = ceil (x) - dx;
 302
 303     offsets[j] = xi;
 304     weight = 0;
 305     taps = tap_array + j * n_taps;
 306
 307     for (l = 0; l < n_taps; l++) {
 308       int xl = xi + l;
 309       taps[l] = sinc ((x - xl) * fx) * envelope ((x - xl) * ex);
 310       taps[l] -= sharpen * envelope ((x - xl) * ex);
 311       weight += taps[l];
 312     }
 313     g_assert (envelope ((x - (xi - 1)) * ex) == 0);
 314     g_assert (envelope ((x - (xi + n_taps)) * ex) == 0);
 315     for (l = 0; l < n_taps; l++) {
 316       taps[l] /= weight;
 317     }
 318
 319     if (xi < 0) {
 320       int shift = -xi;
 321
 322       for (l = 0; l < shift; l++) {
 323         taps[shift] += taps[l];
 324       }
 325       for (l = 0; l < n_taps - shift; l++) {
 326         taps[l] = taps[shift + l];
 327       }
 328       for (; l < n_taps; l++) {
 329         taps[l] = 0;
 330       }
 331       offsets[j] += shift;
 332     }
 333
 334     if (xi > src_size - n_taps) {
 335       int shift = xi - (src_size - n_taps);
 336
 337       for (l = 0; l < shift; l++) {
 338         taps[n_taps - shift - 1] += taps[n_taps - shift + l];
 339       }
 340       for (l = 0; l < n_taps - shift; l++) {
 341         taps[n_taps - 1 - l] = taps[n_taps - 1 - shift - l];
 342       }
 343       for (l = 0; l < shift; l++) {
 344         taps[l] = 0;
 345       }
 346       offsets[j] -= shift;
 347     }
 348   }
 349 }
 350
 351 /*
 352  * Calculates a set of taps for each destination element in float
 353  * format.  Each set of taps sums to 1.0.
 354  */
 355 static void
 356 scale1d_calculate_taps_float (Scale1D * scale, int src_size, int dest_size,
 357     int n_taps, double a, double sharpness, double sharpen)
 358 {
 359   double *taps_d;
 360   float *taps_f;
 361   int j;
 362
 363   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
 364       sharpen);
 365
 366   taps_d = scale->taps;
 367   taps_f = g_malloc (sizeof (float) * scale->n_taps * dest_size);
 368
 369   for (j = 0; j < dest_size * n_taps; j++) {
 370     taps_f[j] = taps_d[j];
 371   }
 372
 373   g_free (taps_d);
 374   scale->taps = taps_f;
 375 }
 376
 377 /*
 378  * Calculates a set of taps for each destination element in gint32
 379  * format.  Each set of taps sums to (very nearly) (1<<shift).  A
 380  * typical value for shift is 10 to 15, so that applying the taps to
 381  * uint8 values and summing will fit in a (signed) int32.
 382  */
 383 static void
 384 scale1d_calculate_taps_int32 (Scale1D * scale, int src_size, int dest_size,
 385     int n_taps, double a, double sharpness, double sharpen, int shift)
 386 {
 387   double *taps_d;
 388   gint32 *taps_i;
 389   int i;
 390   int j;
 391   double multiplier;
 392
 393   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
 394       sharpen);
 395
 396   taps_d = scale->taps;
 397   taps_i = g_malloc (sizeof (gint32) * scale->n_taps * dest_size);
 398
 399   multiplier = (1 << shift);
 400
 401   for (j = 0; j < dest_size; j++) {
 402     for (i = 0; i < n_taps; i++) {
 403       taps_i[j * n_taps + i] =
 404           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
 405     }
 406   }
 407
 408   g_free (taps_d);
 409   scale->taps = taps_i;
 410 }
 411
 412 /*
 413  * Calculates a set of taps for each destination element in gint16
 414  * format.  Each set of taps sums to (1<<shift).  A typical value
 415  * for shift is 7, so that applying the taps to uint8 values and
 416  * summing will fit in a (signed) int16.
 417  */
 418 static void
 419 scale1d_calculate_taps_int16 (Scale1D * scale, int src_size, int dest_size,
 420     int n_taps, double a, double sharpness, double sharpen, int shift)
 421 {
 422   double *taps_d;
 423   gint16 *taps_i;
 424   int i;
 425   int j;
 426   double multiplier;
 427
 428   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
 429       sharpen);
 430
 431   taps_d = scale->taps;
 432   taps_i = g_malloc (sizeof (gint16) * scale->n_taps * dest_size);
 433
 434   multiplier = (1 << shift);
 435
 436   /* Various methods for converting floating point taps to integer.
 437    * The dB values are the SSIM value between scaling an image via
 438    * the floating point pathway vs. the integer pathway using the
 439    * given code to generate the taps.  Only one image was tested,
 440    * scaling from 1920x1080 to 640x360.  Several variations of the
 441    * methods were also tested, with nothing appearing useful.  */
 442 #if 0
 443   /* Standard round to integer.  This causes bad DC errors. */
 444   /* 44.588 dB */
 445   for (j = 0; j < dest_size; j++) {
 446     for (i = 0; i < n_taps; i++) {
 447       taps_i[j * n_taps + i] =
 448           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
 449     }
 450   }
 451 #endif
 452 #if 0
 453   /* Dithering via error propogation.  Works pretty well, but
 454    * really we want to propogate errors across rows, which would
 455    * mean having several sets of tap arrays.  Possible, but more work,
 456    * and it may not even be better. */
 457   /* 57.0961 dB */
 458   {
 459     double err = 0;
 460     for (j = 0; j < dest_size; j++) {
 461       for (i = 0; i < n_taps; i++) {
 462         err += taps_d[j * n_taps + i] * multiplier;
 463         taps_i[j * n_taps + i] = floor (err);
 464         err -= floor (err);
 465       }
 466     }
 467   }
 468 #endif
 469 #if 1
 470   /* Round to integer, but with an adjustable bias that we use to
 471    * eliminate the DC error.  This search method is a bit crude, and
 472    * could perhaps be improved somewhat. */
 473   /* 60.4851 dB */
 474   for (j = 0; j < dest_size; j++) {
 475     int k;
 476     for (k = 0; k < 100; k++) {
 477       int sum = 0;
 478       double offset;
 479
 480       offset = k * 0.01;
 481       for (i = 0; i < n_taps; i++) {
 482         taps_i[j * n_taps + i] =
 483             floor (offset + taps_d[j * n_taps + i] * multiplier);
 484         sum += taps_i[j * n_taps + i];
 485       }
 486
 487       if (sum >= (1 << shift))
 488         break;
 489     }
 490   }
 491 #endif
 492 #if 0
 493   /* Round to integer, but adjust the multiplier.  The search method is
 494    * wrong a lot, but was sufficient enough to calculate dB error. */
 495   /* 58.6517 dB */
 496   for (j = 0; j < dest_size; j++) {
 497     int k;
 498     int sum = 0;
 499     for (k = 0; k < 200; k++) {
 500       sum = 0;
 501
 502       multiplier = (1 << shift) - 1.0 + k * 0.01;
 503       for (i = 0; i < n_taps; i++) {
 504         taps_i[j * n_taps + i] =
 505             floor (0.5 + taps_d[j * n_taps + i] * multiplier);
 506         sum += taps_i[j * n_taps + i];
 507       }
 508
 509       if (sum >= (1 << shift))
 510         break;
 511     }
 512     if (sum != (1 << shift)) {
 513       GST_ERROR ("%g %d", multiplier, sum);
 514     }
 515   }
 516 #endif
 517 #if 0
 518   /* Round to integer, but subtract the error from the largest tap */
 519   /* 58.3677 dB */
 520   for (j = 0; j < dest_size; j++) {
 521     int err = -multiplier;
 522     for (i = 0; i < n_taps; i++) {
 523       taps_i[j * n_taps + i] =
 524           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
 525       err += taps_i[j * n_taps + i];
 526     }
 527     if (taps_i[j * n_taps + (n_taps / 2 - 1)] >
 528         taps_i[j * n_taps + (n_taps / 2)]) {
 529       taps_i[j * n_taps + (n_taps / 2 - 1)] -= err;
 530     } else {
 531       taps_i[j * n_taps + (n_taps / 2)] -= err;
 532     }
 533   }
 534 #endif
 535
 536   g_free (taps_d);
 537   scale->taps = taps_i;
 538 }
 539
 540
 541 void
 542 vs_image_scale_lanczos_Y (const VSImage * dest, const VSImage * src,
 543     uint8_t * tmpbuf, double sharpness, gboolean dither, int submethod,
 544     double a, double sharpen)
 545 {
 546   switch (submethod) {
 547     case 0:
 548     default:
 549       vs_image_scale_lanczos_Y_int16 (dest, src, tmpbuf, sharpness, dither, a,
 550           sharpen);
 551       break;
 552     case 1:
 553       vs_image_scale_lanczos_Y_int32 (dest, src, tmpbuf, sharpness, dither, a,
 554           sharpen);
 555       break;
 556     case 2:
 557       vs_image_scale_lanczos_Y_float (dest, src, tmpbuf, sharpness, dither, a,
 558           sharpen);
 559       break;
 560     case 3:
 561       vs_image_scale_lanczos_Y_double (dest, src, tmpbuf, sharpness, dither, a,
 562           sharpen);
 563       break;
 564   }
 565 }
 566
 567 void
 568 vs_image_scale_lanczos_AYUV (const VSImage * dest, const VSImage * src,
 569     uint8_t * tmpbuf, double sharpness, gboolean dither, int submethod,
 570     double a, double sharpen)
 571 {
 572   switch (submethod) {
 573     case 0:
 574     default:
 575       vs_image_scale_lanczos_AYUV_int16 (dest, src, tmpbuf, sharpness, dither,
 576           a, sharpen);
 577       break;
 578     case 1:
 579       vs_image_scale_lanczos_AYUV_int32 (dest, src, tmpbuf, sharpness, dither,
 580           a, sharpen);
 581       break;
 582     case 2:
 583       vs_image_scale_lanczos_AYUV_float (dest, src, tmpbuf, sharpness, dither,
 584           a, sharpen);
 585       break;
 586     case 3:
 587       vs_image_scale_lanczos_AYUV_double (dest, src, tmpbuf, sharpness, dither,
 588           a, sharpen);
 589       break;
 590   }
 591 }
 592
 593
 594
 595 #define RESAMPLE_HORIZ_FLOAT(function, dest_type, tap_type, src_type, _n_taps) \
 596 static void \
 597 function (dest_type *dest, const gint32 *offsets, \
 598     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
 599 { \
 600   int i; \
 601   int k; \
 602   dest_type sum; \
 603   const src_type *srcline; \
 604   const tap_type *tapsline; \
 605   for (i = 0; i < n; i++) { \
 606     srcline = src + offsets[i]; \
 607     tapsline = taps + i * _n_taps; \
 608     sum = 0; \
 609     for (k = 0; k < _n_taps; k++) { \
 610       sum += srcline[k] * tapsline[k]; \
 611     } \
 612     dest[i] = sum; \
 613   } \
 614 }
 615
 616 #define RESAMPLE_HORIZ(function, dest_type, tap_type, src_type, _n_taps, _shift) \
 617 static void \
 618 function (dest_type *dest, const gint32 *offsets, \
 619     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
 620 { \
 621   int i; \
 622   int k; \
 623   dest_type sum; \
 624   const src_type *srcline; \
 625   const tap_type *tapsline; \
 626   int offset; \
 627   if (_shift > 0) offset = (1<<_shift)>>1; \
 628   else offset = 0; \
 629   for (i = 0; i < n; i++) { \
 630     srcline = src + offsets[i]; \
 631     tapsline = taps + i * _n_taps; \
 632     sum = 0; \
 633     for (k = 0; k < _n_taps; k++) { \
 634       sum += srcline[k] * tapsline[k]; \
 635     } \
 636     dest[i] = (sum + offset) >> _shift; \
 637   } \
 638 }
 639
 640 #define RESAMPLE_HORIZ_AYUV_FLOAT(function, dest_type, tap_type, src_type, _n_taps) \
 641 static void \
 642 function (dest_type *dest, const gint32 *offsets, \
 643     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
 644 { \
 645   int i; \
 646   int k; \
 647   dest_type sum1; \
 648   dest_type sum2; \
 649   dest_type sum3; \
 650   dest_type sum4; \
 651   const src_type *srcline; \
 652   const tap_type *tapsline; \
 653   for (i = 0; i < n; i++) { \
 654     srcline = src + 4*offsets[i]; \
 655     tapsline = taps + i * _n_taps; \
 656     sum1 = 0; \
 657     sum2 = 0; \
 658     sum3 = 0; \
 659     sum4 = 0; \
 660     for (k = 0; k < _n_taps; k++) { \
 661       sum1 += srcline[k*4+0] * tapsline[k]; \
 662       sum2 += srcline[k*4+1] * tapsline[k]; \
 663       sum3 += srcline[k*4+2] * tapsline[k]; \
 664       sum4 += srcline[k*4+3] * tapsline[k]; \
 665     } \
 666     dest[i*4+0] = sum1; \
 667     dest[i*4+1] = sum2; \
 668     dest[i*4+2] = sum3; \
 669     dest[i*4+3] = sum4; \
 670   } \
 671 }
 672
 673 #define RESAMPLE_HORIZ_AYUV(function, dest_type, tap_type, src_type, _n_taps, _shift) \
 674 static void \
 675 function (dest_type *dest, const gint32 *offsets, \
 676     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
 677 { \
 678   int i; \
 679   int k; \
 680   dest_type sum1; \
 681   dest_type sum2; \
 682   dest_type sum3; \
 683   dest_type sum4; \
 684   const src_type *srcline; \
 685   const tap_type *tapsline; \
 686   int offset; \
 687   if (_shift > 0) offset = (1<<_shift)>>1; \
 688   else offset = 0; \
 689   for (i = 0; i < n; i++) { \
 690     srcline = src + 4*offsets[i]; \
 691     tapsline = taps + i * _n_taps; \
 692     sum1 = 0; \
 693     sum2 = 0; \
 694     sum3 = 0; \
 695     sum4 = 0; \
 696     for (k = 0; k < _n_taps; k++) { \
 697       sum1 += srcline[k*4+0] * tapsline[k]; \
 698       sum2 += srcline[k*4+1] * tapsline[k]; \
 699       sum3 += srcline[k*4+2] * tapsline[k]; \
 700       sum4 += srcline[k*4+3] * tapsline[k]; \
 701     } \
 702     dest[i*4+0] = (sum1 + offset) >> _shift; \
 703     dest[i*4+1] = (sum2 + offset) >> _shift; \
 704     dest[i*4+2] = (sum3 + offset) >> _shift; \
 705     dest[i*4+3] = (sum4 + offset) >> _shift; \
 706   } \
 707 }
 708
/* *INDENT-OFF* */
/* Horizontal kernels.  The *_generic variants pass the identifiers n_taps
 * (and, for the integer kernels, shift) into the macros, so the generated
 * functions use their run-time arguments.  The tapsN_shift0 variants below
 * pass constants instead, giving a fixed tap count and a shift of 0; their
 * n_taps and shift arguments are then unused. */
RESAMPLE_HORIZ_FLOAT (resample_horiz_double_u8_generic, double, double,
    guint8, n_taps)
RESAMPLE_HORIZ_FLOAT (resample_horiz_float_u8_generic, float, float,
    guint8, n_taps)
RESAMPLE_HORIZ_AYUV_FLOAT (resample_horiz_double_ayuv_generic, double, double,
    guint8, n_taps)
RESAMPLE_HORIZ_AYUV_FLOAT (resample_horiz_float_ayuv_generic, float, float,
    guint8, n_taps)

RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_generic, gint32, gint32,
    guint8, n_taps, shift)
RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_generic, gint16, gint16,
    guint8, n_taps, shift)
RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_generic, gint32, gint32,
    guint8, n_taps, shift)
RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_generic, gint16, gint16,
    guint8, n_taps, shift)

/* Candidates for orcification */
RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps16_shift0, gint32, gint32,
    guint8, 16, 0)
RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps12_shift0, gint32, gint32,
    guint8, 12, 0)
RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps8_shift0, gint32, gint32,
    guint8, 8, 0)
RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps4_shift0, gint32, gint32,
    guint8, 4, 0)
RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps16_shift0, gint16, gint16,
    guint8, 16, 0)
RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps12_shift0, gint16, gint16,
    guint8, 12, 0)
RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps8_shift0, gint16, gint16,
    guint8, 8, 0)
RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps4_shift0, gint16, gint16,
    guint8, 4, 0)

RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps16_shift0, gint32, gint32,
    guint8, 16, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps12_shift0, gint32, gint32,
    guint8, 12, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps8_shift0, gint32, gint32,
    guint8, 8, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps4_shift0, gint32, gint32,
    guint8, 4, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps16_shift0, gint16, gint16,
    guint8, 16, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps12_shift0, gint16, gint16,
    guint8, 12, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps8_shift0, gint16, gint16,
    guint8, 8, 0)
RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps4_shift0, gint16, gint16,
    guint8, 4, 0)
/* *INDENT-ON* */
 763
 764 #define RESAMPLE_VERT(function, tap_type, src_type, _n_taps, _shift) \
 765 static void \
 766 function (guint8 *dest, \
 767     const tap_type *taps, const src_type *src, int stride, int n_taps, \
 768     int shift, int n) \
 769 { \
 770   int i; \
 771   int l; \
 772   gint32 sum_y; \
 773   gint32 offset = (1<<_shift) >> 1; \
 774   for (i = 0; i < n; i++) { \
 775     sum_y = 0; \
 776     for (l = 0; l < n_taps; l++) { \
 777       const src_type *line = PTR_OFFSET(src, stride * l); \
 778       sum_y += line[i] * taps[l]; \
 779     } \
 780     dest[i] = CLAMP ((sum_y + offset) >> _shift, 0, 255); \
 781   } \
 782 }
 783
 784 #define RESAMPLE_VERT_DITHER(function, tap_type, src_type, _n_taps, _shift) \
 785 static void \
 786 function (guint8 *dest, \
 787     const tap_type *taps, const src_type *src, int stride, int n_taps, \
 788     int shift, int n) \
 789 { \
 790   int i; \
 791   int l; \
 792   gint32 sum_y; \
 793   gint32 err_y = 0; \
 794   gint32 mask = (1<<_shift) - 1; \
 795   for (i = 0; i < n; i++) { \
 796     sum_y = 0; \
 797     for (l = 0; l < n_taps; l++) { \
 798       const src_type *line = PTR_OFFSET(src, stride * l); \
 799       sum_y += line[i] * taps[l]; \
 800     } \
 801     err_y += sum_y; \
 802     dest[i] = CLAMP (err_y >> _shift, 0, 255); \
 803     err_y &= mask; \
 804   } \
 805 }
 806
 807 /* *INDENT-OFF* */
 808 RESAMPLE_VERT (resample_vert_int32_generic, gint32, gint32, n_taps, shift)
 809 RESAMPLE_VERT_DITHER (resample_vert_dither_int32_generic, gint32, gint32,
 810     n_taps, shift)
 811 RESAMPLE_VERT (resample_vert_int16_generic, gint16, gint16, n_taps, shift);
 812 RESAMPLE_VERT_DITHER (resample_vert_dither_int16_generic, gint16, gint16,
 813     n_taps, shift)
 814 /* *INDENT-ON* */
 815
 816 #define RESAMPLE_VERT_FLOAT(function, tap_type, src_type, _n_taps, _shift) \
 817 static void \
 818 function (guint8 *dest, \
 819     const tap_type *taps, const src_type *src, int stride, int n_taps, \
 820     int shift, int n) \
 821 { \
 822   int i; \
 823   int l; \
 824   src_type sum_y; \
 825   for (i = 0; i < n; i++) { \
 826     sum_y = 0; \
 827     for (l = 0; l < n_taps; l++) { \
 828       const src_type *line = PTR_OFFSET(src, stride * l); \
 829       sum_y += line[i] * taps[l]; \
 830     } \
 831     dest[i] = CLAMP (floor(0.5 + sum_y), 0, 255); \
 832   } \
 833 }
 834
 835 #define RESAMPLE_VERT_FLOAT_DITHER(function, tap_type, src_type, _n_taps, _shift) \
 836 static void \
 837 function (guint8 *dest, \
 838     const tap_type *taps, const src_type *src, int stride, int n_taps, \
 839     int shift, int n) \
 840 { \
 841   int i; \
 842   int l; \
 843   src_type sum_y; \
 844   src_type err_y = 0; \
 845   for (i = 0; i < n; i++) { \
 846     sum_y = 0; \
 847     for (l = 0; l < n_taps; l++) { \
 848       const src_type *line = PTR_OFFSET(src, stride * l); \
 849       sum_y += line[i] * taps[l]; \
 850     } \
 851     err_y += sum_y; \
 852     dest[i] = CLAMP (floor (err_y), 0, 255); \
 853     err_y -= floor (err_y); \
 854   } \
 855 }
 856
/* *INDENT-OFF* */
/* Floating point vertical kernels, with and without dithering, for double
 * and float intermediate rows. */
RESAMPLE_VERT_FLOAT (resample_vert_double_generic, double, double, n_taps,
    shift)
RESAMPLE_VERT_FLOAT_DITHER (resample_vert_dither_double_generic, double, double,
    n_taps, shift)

RESAMPLE_VERT_FLOAT (resample_vert_float_generic, float, float, n_taps, shift)
RESAMPLE_VERT_FLOAT_DITHER (resample_vert_dither_float_generic, float, float,
    n_taps, shift)
/* *INDENT-ON* */
 867
/*
 * Shift amounts used by the 16-bit integer scalers:
 *  - S16_SHIFT1 is passed to scale1d_calculate_taps_int16() for the
 *    horizontal taps, S16_SHIFT2 for the vertical taps;
 *  - S16_MIDSHIFT is the shift argument given to the horizontal resampler;
 *  - S16_POSTSHIFT is the shift argument given to the vertical resampler,
 *    i.e. the part of SHIFT1 + SHIFT2 not already covered by MIDSHIFT.
 * NOTE(review): presumably these are the fixed-point fraction bits of the
 * tap values -- confirm against scale1d_calculate_taps_int16().
 */
#define S16_SHIFT1 7
#define S16_SHIFT2 7
#define S16_MIDSHIFT 0
#define S16_POSTSHIFT (S16_SHIFT1+S16_SHIFT2-S16_MIDSHIFT)
 872
 873 static void
 874 vs_scale_lanczos_Y_int16 (Scale * scale)
 875 {
 876   int j;
 877   int yi;
 878   int tmp_yi;
 879
 880   tmp_yi = 0;
 881
 882   for (j = 0; j < scale->dest->height; j++) {
 883     guint8 *destline;
 884     gint16 *taps;
 885
 886     destline = scale->dest->pixels + scale->dest->stride * j;
 887
 888     yi = scale->y_scale1d.offsets[j];
 889
 890     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
 891       scale->horiz_resample_func (TMP_LINE_S16 (tmp_yi),
 892           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
 893           scale->x_scale1d.n_taps, S16_MIDSHIFT, scale->dest->width);
 894       tmp_yi++;
 895     }
 896
 897     taps = (gint16 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
 898     if (scale->dither) {
 899       resample_vert_dither_int16_generic (destline,
 900           taps, TMP_LINE_S16 (scale->y_scale1d.offsets[j]),
 901           sizeof (gint16) * scale->dest->width, scale->y_scale1d.n_taps,
 902           S16_POSTSHIFT, scale->dest->width);
 903     } else {
 904       resample_vert_int16_generic (destline,
 905           taps, TMP_LINE_S16 (scale->y_scale1d.offsets[j]),
 906           sizeof (gint16) * scale->dest->width, scale->y_scale1d.n_taps,
 907           S16_POSTSHIFT, scale->dest->width);
 908     }
 909   }
 910 }
 911
 912 void
 913 vs_image_scale_lanczos_Y_int16 (const VSImage * dest, const VSImage * src,
 914     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
 915     double sharpen)
 916 {
 917   Scale s = { 0 };
 918   Scale *scale = &s;
 919   int n_taps;
 920
 921   scale->dest = dest;
 922   scale->src = src;
 923
 924   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
 925   n_taps = ROUND_UP_4 (n_taps);
 926   scale1d_calculate_taps_int16 (&scale->x_scale1d,
 927       src->width, dest->width, n_taps, a, sharpness, sharpen, S16_SHIFT1);
 928
 929   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
 930   scale1d_calculate_taps_int16 (&scale->y_scale1d,
 931       src->height, dest->height, n_taps, a, sharpness, sharpen, S16_SHIFT2);
 932
 933   scale->dither = dither;
 934
 935   switch (scale->x_scale1d.n_taps) {
 936     case 4:
 937       scale->horiz_resample_func =
 938           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps4_shift0;
 939       break;
 940     case 8:
 941       scale->horiz_resample_func =
 942           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps8_shift0;
 943       break;
 944     case 12:
 945       scale->horiz_resample_func =
 946           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps12_shift0;
 947       break;
 948     case 16:
 949       scale->horiz_resample_func =
 950           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps16_shift0;
 951       break;
 952     default:
 953       scale->horiz_resample_func =
 954           (HorizResampleFunc) resample_horiz_int16_int16_u8_generic;
 955       break;
 956   }
 957
 958   scale->tmpdata =
 959       g_malloc (sizeof (gint16) * scale->dest->width * scale->src->height);
 960
 961   vs_scale_lanczos_Y_int16 (scale);
 962
 963   scale1d_cleanup (&scale->x_scale1d);
 964   scale1d_cleanup (&scale->y_scale1d);
 965   g_free (scale->tmpdata);
 966 }
 967
 968
/*
 * Shift amounts used by the 32-bit integer scalers:
 *  - S32_SHIFT1 is passed to scale1d_calculate_taps_int32() for the
 *    horizontal taps, S32_SHIFT2 for the vertical taps;
 *  - S32_MIDSHIFT is the shift argument given to the horizontal resampler;
 *  - S32_POSTSHIFT is the shift argument given to the vertical resampler,
 *    i.e. the part of SHIFT1 + SHIFT2 not already covered by MIDSHIFT.
 * NOTE(review): presumably these are the fixed-point fraction bits of the
 * tap values -- confirm against scale1d_calculate_taps_int32().
 */
#define S32_SHIFT1 11
#define S32_SHIFT2 11
#define S32_MIDSHIFT 0
#define S32_POSTSHIFT (S32_SHIFT1+S32_SHIFT2-S32_MIDSHIFT)
 973
 974 static void
 975 vs_scale_lanczos_Y_int32 (Scale * scale)
 976 {
 977   int j;
 978   int yi;
 979   int tmp_yi;
 980
 981   tmp_yi = 0;
 982
 983   for (j = 0; j < scale->dest->height; j++) {
 984     guint8 *destline;
 985     gint32 *taps;
 986
 987     destline = scale->dest->pixels + scale->dest->stride * j;
 988
 989     yi = scale->y_scale1d.offsets[j];
 990
 991     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
 992       scale->horiz_resample_func (TMP_LINE_S32 (tmp_yi),
 993           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
 994           scale->x_scale1d.n_taps, S32_MIDSHIFT, scale->dest->width);
 995       tmp_yi++;
 996     }
 997
 998     taps = (gint32 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
 999     if (scale->dither) {
1000       resample_vert_dither_int32_generic (destline,
1001           taps, TMP_LINE_S32 (scale->y_scale1d.offsets[j]),
1002           sizeof (gint32) * scale->dest->width,
1003           scale->y_scale1d.n_taps, S32_POSTSHIFT, scale->dest->width);
1004     } else {
1005       resample_vert_int32_generic (destline,
1006           taps, TMP_LINE_S32 (scale->y_scale1d.offsets[j]),
1007           sizeof (gint32) * scale->dest->width,
1008           scale->y_scale1d.n_taps, S32_POSTSHIFT, scale->dest->width);
1009     }
1010   }
1011 }
1012
1013 void
1014 vs_image_scale_lanczos_Y_int32 (const VSImage * dest, const VSImage * src,
1015     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1016     double sharpen)
1017 {
1018   Scale s = { 0 };
1019   Scale *scale = &s;
1020   int n_taps;
1021
1022   scale->dest = dest;
1023   scale->src = src;
1024
1025   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1026   n_taps = ROUND_UP_4 (n_taps);
1027   scale1d_calculate_taps_int32 (&scale->x_scale1d,
1028       src->width, dest->width, n_taps, a, sharpness, sharpen, S32_SHIFT1);
1029
1030   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1031   scale1d_calculate_taps_int32 (&scale->y_scale1d,
1032       src->height, dest->height, n_taps, a, sharpness, sharpen, S32_SHIFT2);
1033
1034   scale->dither = dither;
1035
1036   switch (scale->x_scale1d.n_taps) {
1037     case 4:
1038       scale->horiz_resample_func =
1039           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps4_shift0;
1040       break;
1041     case 8:
1042       scale->horiz_resample_func =
1043           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps8_shift0;
1044       break;
1045     case 12:
1046       scale->horiz_resample_func =
1047           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps12_shift0;
1048       break;
1049     case 16:
1050       scale->horiz_resample_func =
1051           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps16_shift0;
1052       break;
1053     default:
1054       scale->horiz_resample_func =
1055           (HorizResampleFunc) resample_horiz_int32_int32_u8_generic;
1056       break;
1057   }
1058
1059   scale->tmpdata =
1060       g_malloc (sizeof (int32_t) * scale->dest->width * scale->src->height);
1061
1062   vs_scale_lanczos_Y_int32 (scale);
1063
1064   scale1d_cleanup (&scale->x_scale1d);
1065   scale1d_cleanup (&scale->y_scale1d);
1066   g_free (scale->tmpdata);
1067 }
1068
1069 static void
1070 vs_scale_lanczos_Y_double (Scale * scale)
1071 {
1072   int j;
1073   int yi;
1074   int tmp_yi;
1075
1076   tmp_yi = 0;
1077
1078   for (j = 0; j < scale->dest->height; j++) {
1079     guint8 *destline;
1080     double *taps;
1081
1082     destline = scale->dest->pixels + scale->dest->stride * j;
1083
1084     yi = scale->y_scale1d.offsets[j];
1085
1086     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1087       scale->horiz_resample_func (TMP_LINE_DOUBLE (tmp_yi),
1088           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1089           scale->x_scale1d.n_taps, 0, scale->dest->width);
1090       tmp_yi++;
1091     }
1092
1093     taps = (double *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1094     if (scale->dither) {
1095       resample_vert_dither_double_generic (destline,
1096           taps, TMP_LINE_DOUBLE (scale->y_scale1d.offsets[j]),
1097           sizeof (double) * scale->dest->width,
1098           scale->y_scale1d.n_taps, 0, scale->dest->width);
1099     } else {
1100       resample_vert_double_generic (destline,
1101           taps, TMP_LINE_DOUBLE (scale->y_scale1d.offsets[j]),
1102           sizeof (double) * scale->dest->width,
1103           scale->y_scale1d.n_taps, 0, scale->dest->width);
1104     }
1105   }
1106 }
1107
1108 void
1109 vs_image_scale_lanczos_Y_double (const VSImage * dest, const VSImage * src,
1110     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1111     double sharpen)
1112 {
1113   Scale s = { 0 };
1114   Scale *scale = &s;
1115   int n_taps;
1116
1117   scale->dest = dest;
1118   scale->src = src;
1119
1120   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1121   scale1d_calculate_taps (&scale->x_scale1d,
1122       src->width, dest->width, n_taps, a, sharpness, sharpen);
1123
1124   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1125   scale1d_calculate_taps (&scale->y_scale1d,
1126       src->height, dest->height, n_taps, a, sharpness, sharpen);
1127
1128   scale->dither = dither;
1129
1130   scale->horiz_resample_func =
1131       (HorizResampleFunc) resample_horiz_double_u8_generic;
1132
1133   scale->tmpdata =
1134       g_malloc (sizeof (double) * scale->dest->width * scale->src->height);
1135
1136   vs_scale_lanczos_Y_double (scale);
1137
1138   scale1d_cleanup (&scale->x_scale1d);
1139   scale1d_cleanup (&scale->y_scale1d);
1140   g_free (scale->tmpdata);
1141 }
1142
1143 static void
1144 vs_scale_lanczos_Y_float (Scale * scale)
1145 {
1146   int j;
1147   int yi;
1148   int tmp_yi;
1149
1150   tmp_yi = 0;
1151
1152   for (j = 0; j < scale->dest->height; j++) {
1153     guint8 *destline;
1154     float *taps;
1155
1156     destline = scale->dest->pixels + scale->dest->stride * j;
1157
1158     yi = scale->y_scale1d.offsets[j];
1159
1160     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1161       scale->horiz_resample_func (TMP_LINE_FLOAT (tmp_yi),
1162           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1163           scale->x_scale1d.n_taps, 0, scale->dest->width);
1164       tmp_yi++;
1165     }
1166
1167     taps = (float *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1168     if (scale->dither) {
1169       resample_vert_dither_float_generic (destline,
1170           taps, TMP_LINE_FLOAT (scale->y_scale1d.offsets[j]),
1171           sizeof (float) * scale->dest->width,
1172           scale->y_scale1d.n_taps, 0, scale->dest->width);
1173     } else {
1174       resample_vert_float_generic (destline,
1175           taps, TMP_LINE_FLOAT (scale->y_scale1d.offsets[j]),
1176           sizeof (float) * scale->dest->width,
1177           scale->y_scale1d.n_taps, 0, scale->dest->width);
1178     }
1179   }
1180 }
1181
1182 void
1183 vs_image_scale_lanczos_Y_float (const VSImage * dest, const VSImage * src,
1184     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1185     double sharpen)
1186 {
1187   Scale s = { 0 };
1188   Scale *scale = &s;
1189   int n_taps;
1190
1191   scale->dest = dest;
1192   scale->src = src;
1193
1194   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1195   scale1d_calculate_taps_float (&scale->x_scale1d,
1196       src->width, dest->width, n_taps, a, sharpness, sharpen);
1197
1198   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1199   scale1d_calculate_taps_float (&scale->y_scale1d,
1200       src->height, dest->height, n_taps, a, sharpness, sharpen);
1201
1202   scale->dither = dither;
1203
1204   scale->horiz_resample_func =
1205       (HorizResampleFunc) resample_horiz_float_u8_generic;
1206
1207   scale->tmpdata =
1208       g_malloc (sizeof (float) * scale->dest->width * scale->src->height);
1209
1210   vs_scale_lanczos_Y_float (scale);
1211
1212   scale1d_cleanup (&scale->x_scale1d);
1213   scale1d_cleanup (&scale->y_scale1d);
1214   g_free (scale->tmpdata);
1215 }
1216
1217
1218
1219
1220
1221 static void
1222 vs_scale_lanczos_AYUV_int16 (Scale * scale)
1223 {
1224   int j;
1225   int yi;
1226   int tmp_yi;
1227
1228   tmp_yi = 0;
1229
1230   for (j = 0; j < scale->dest->height; j++) {
1231     guint8 *destline;
1232     gint16 *taps;
1233
1234     destline = scale->dest->pixels + scale->dest->stride * j;
1235
1236     yi = scale->y_scale1d.offsets[j];
1237
1238     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1239       scale->horiz_resample_func (TMP_LINE_S16_AYUV (tmp_yi),
1240           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1241           scale->x_scale1d.n_taps, S16_MIDSHIFT, scale->dest->width);
1242       tmp_yi++;
1243     }
1244
1245     taps = (gint16 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1246     if (scale->dither) {
1247       resample_vert_dither_int16_generic (destline,
1248           taps, TMP_LINE_S16_AYUV (scale->y_scale1d.offsets[j]),
1249           sizeof (gint16) * 4 * scale->dest->width,
1250           scale->y_scale1d.n_taps, S16_POSTSHIFT, scale->dest->width * 4);
1251     } else {
1252       resample_vert_int16_generic (destline,
1253           taps, TMP_LINE_S16_AYUV (scale->y_scale1d.offsets[j]),
1254           sizeof (gint16) * 4 * scale->dest->width,
1255           scale->y_scale1d.n_taps, S16_POSTSHIFT, scale->dest->width * 4);
1256     }
1257   }
1258 }
1259
1260 void
1261 vs_image_scale_lanczos_AYUV_int16 (const VSImage * dest, const VSImage * src,
1262     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1263     double sharpen)
1264 {
1265   Scale s = { 0 };
1266   Scale *scale = &s;
1267   int n_taps;
1268
1269   scale->dest = dest;
1270   scale->src = src;
1271
1272   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1273   n_taps = ROUND_UP_4 (n_taps);
1274   scale1d_calculate_taps_int16 (&scale->x_scale1d,
1275       src->width, dest->width, n_taps, a, sharpness, sharpen, S16_SHIFT1);
1276
1277   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1278   scale1d_calculate_taps_int16 (&scale->y_scale1d,
1279       src->height, dest->height, n_taps, a, sharpness, sharpen, S16_SHIFT2);
1280
1281   scale->dither = dither;
1282
1283   switch (scale->x_scale1d.n_taps) {
1284     case 4:
1285       scale->horiz_resample_func =
1286           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps4_shift0;
1287       break;
1288     case 8:
1289       scale->horiz_resample_func =
1290           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps8_shift0;
1291       break;
1292     case 12:
1293       scale->horiz_resample_func =
1294           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps12_shift0;
1295       break;
1296     case 16:
1297       scale->horiz_resample_func =
1298           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps16_shift0;
1299       break;
1300     default:
1301       scale->horiz_resample_func =
1302           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_generic;
1303       break;
1304   }
1305
1306   scale->tmpdata =
1307       g_malloc (sizeof (gint16) * scale->dest->width * scale->src->height * 4);
1308
1309   vs_scale_lanczos_AYUV_int16 (scale);
1310
1311   scale1d_cleanup (&scale->x_scale1d);
1312   scale1d_cleanup (&scale->y_scale1d);
1313   g_free (scale->tmpdata);
1314 }
1315
1316
1317 static void
1318 vs_scale_lanczos_AYUV_int32 (Scale * scale)
1319 {
1320   int j;
1321   int yi;
1322   int tmp_yi;
1323
1324   tmp_yi = 0;
1325
1326   for (j = 0; j < scale->dest->height; j++) {
1327     guint8 *destline;
1328     gint32 *taps;
1329
1330     destline = scale->dest->pixels + scale->dest->stride * j;
1331
1332     yi = scale->y_scale1d.offsets[j];
1333
1334     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1335       scale->horiz_resample_func (TMP_LINE_S32_AYUV (tmp_yi),
1336           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1337           scale->x_scale1d.n_taps, S32_MIDSHIFT, scale->dest->width);
1338       tmp_yi++;
1339     }
1340
1341     taps = (gint32 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1342     if (scale->dither) {
1343       resample_vert_dither_int32_generic (destline,
1344           taps, TMP_LINE_S32_AYUV (scale->y_scale1d.offsets[j]),
1345           sizeof (gint32) * 4 * scale->dest->width, scale->y_scale1d.n_taps,
1346           S32_POSTSHIFT, scale->dest->width * 4);
1347     } else {
1348       resample_vert_int32_generic (destline,
1349           taps, TMP_LINE_S32_AYUV (scale->y_scale1d.offsets[j]),
1350           sizeof (gint32) * 4 * scale->dest->width, scale->y_scale1d.n_taps,
1351           S32_POSTSHIFT, scale->dest->width * 4);
1352     }
1353   }
1354 }
1355
1356 void
1357 vs_image_scale_lanczos_AYUV_int32 (const VSImage * dest, const VSImage * src,
1358     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1359     double sharpen)
1360 {
1361   Scale s = { 0 };
1362   Scale *scale = &s;
1363   int n_taps;
1364
1365   scale->dest = dest;
1366   scale->src = src;
1367
1368   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1369   n_taps = ROUND_UP_4 (n_taps);
1370   scale1d_calculate_taps_int32 (&scale->x_scale1d,
1371       src->width, dest->width, n_taps, a, sharpness, sharpen, S32_SHIFT1);
1372
1373   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1374   scale1d_calculate_taps_int32 (&scale->y_scale1d,
1375       src->height, dest->height, n_taps, a, sharpness, sharpen, S32_SHIFT2);
1376
1377   scale->dither = dither;
1378
1379   switch (scale->x_scale1d.n_taps) {
1380     case 4:
1381       scale->horiz_resample_func =
1382           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps4_shift0;
1383       break;
1384     case 8:
1385       scale->horiz_resample_func =
1386           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps8_shift0;
1387       break;
1388     case 12:
1389       scale->horiz_resample_func =
1390           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps12_shift0;
1391       break;
1392     case 16:
1393       scale->horiz_resample_func =
1394           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps16_shift0;
1395       break;
1396     default:
1397       scale->horiz_resample_func =
1398           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_generic;
1399       break;
1400   }
1401
1402   scale->tmpdata =
1403       g_malloc (sizeof (int32_t) * scale->dest->width * scale->src->height * 4);
1404
1405   vs_scale_lanczos_AYUV_int32 (scale);
1406
1407   scale1d_cleanup (&scale->x_scale1d);
1408   scale1d_cleanup (&scale->y_scale1d);
1409   g_free (scale->tmpdata);
1410 }
1411
1412 static void
1413 vs_scale_lanczos_AYUV_double (Scale * scale)
1414 {
1415   int j;
1416   int yi;
1417   int tmp_yi;
1418
1419   tmp_yi = 0;
1420
1421   for (j = 0; j < scale->dest->height; j++) {
1422     guint8 *destline;
1423     double *taps;
1424
1425     destline = scale->dest->pixels + scale->dest->stride * j;
1426
1427     yi = scale->y_scale1d.offsets[j];
1428
1429     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1430       scale->horiz_resample_func (TMP_LINE_DOUBLE_AYUV (tmp_yi),
1431           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1432           scale->x_scale1d.n_taps, 0, scale->dest->width);
1433       tmp_yi++;
1434     }
1435
1436     taps = (double *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1437     if (scale->dither) {
1438       resample_vert_dither_double_generic (destline,
1439           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1440           sizeof (double) * 4 * scale->dest->width,
1441           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1442     } else {
1443       resample_vert_double_generic (destline,
1444           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1445           sizeof (double) * 4 * scale->dest->width,
1446           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1447     }
1448   }
1449 }
1450
1451 void
1452 vs_image_scale_lanczos_AYUV_double (const VSImage * dest, const VSImage * src,
1453     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1454     double sharpen)
1455 {
1456   Scale s = { 0 };
1457   Scale *scale = &s;
1458   int n_taps;
1459
1460   scale->dest = dest;
1461   scale->src = src;
1462
1463   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1464   scale1d_calculate_taps (&scale->x_scale1d,
1465       src->width, dest->width, n_taps, a, sharpness, sharpen);
1466
1467   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1468   scale1d_calculate_taps (&scale->y_scale1d,
1469       src->height, dest->height, n_taps, a, sharpness, sharpen);
1470
1471   scale->dither = dither;
1472
1473   scale->horiz_resample_func =
1474       (HorizResampleFunc) resample_horiz_double_ayuv_generic;
1475
1476   scale->tmpdata =
1477       g_malloc (sizeof (double) * scale->dest->width * scale->src->height * 4);
1478
1479   vs_scale_lanczos_AYUV_double (scale);
1480
1481   scale1d_cleanup (&scale->x_scale1d);
1482   scale1d_cleanup (&scale->y_scale1d);
1483   g_free (scale->tmpdata);
1484 }
1485
1486 static void
1487 vs_scale_lanczos_AYUV_float (Scale * scale)
1488 {
1489   int j;
1490   int yi;
1491   int tmp_yi;
1492
1493   tmp_yi = 0;
1494
1495   for (j = 0; j < scale->dest->height; j++) {
1496     guint8 *destline;
1497     float *taps;
1498
1499     destline = scale->dest->pixels + scale->dest->stride * j;
1500
1501     yi = scale->y_scale1d.offsets[j];
1502
1503     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1504       scale->horiz_resample_func (TMP_LINE_FLOAT_AYUV (tmp_yi),
1505           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1506           scale->x_scale1d.n_taps, 0, scale->dest->width);
1507       tmp_yi++;
1508     }
1509
1510     taps = (float *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1511     if (scale->dither) {
1512       resample_vert_dither_float_generic (destline,
1513           taps, TMP_LINE_FLOAT_AYUV (scale->y_scale1d.offsets[j]),
1514           sizeof (float) * 4 * scale->dest->width, scale->y_scale1d.n_taps, 0,
1515           scale->dest->width * 4);
1516     } else {
1517       resample_vert_float_generic (destline,
1518           taps, TMP_LINE_FLOAT_AYUV (scale->y_scale1d.offsets[j]),
1519           sizeof (float) * 4 * scale->dest->width, scale->y_scale1d.n_taps, 0,
1520           scale->dest->width * 4);
1521     }
1522   }
1523 }
1524
1525 void
1526 vs_image_scale_lanczos_AYUV_float (const VSImage * dest, const VSImage * src,
1527     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1528     double sharpen)
1529 {
1530   Scale s = { 0 };
1531   Scale *scale = &s;
1532   int n_taps;
1533
1534   scale->dest = dest;
1535   scale->src = src;
1536
1537   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1538   scale1d_calculate_taps_float (&scale->x_scale1d,
1539       src->width, dest->width, n_taps, a, sharpness, sharpen);
1540
1541   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1542   scale1d_calculate_taps_float (&scale->y_scale1d,
1543       src->height, dest->height, n_taps, a, sharpness, sharpen);
1544
1545   scale->dither = dither;
1546
1547   scale->horiz_resample_func =
1548       (HorizResampleFunc) resample_horiz_float_ayuv_generic;
1549
1550   scale->tmpdata =
1551       g_malloc (sizeof (float) * scale->dest->width * scale->src->height * 4);
1552
1553   vs_scale_lanczos_AYUV_float (scale);
1554
1555   scale1d_cleanup (&scale->x_scale1d);
1556   scale1d_cleanup (&scale->y_scale1d);
1557   g_free (scale->tmpdata);
1558 }