gst/videoscale/vs_lanczos.c

   1 /*
   2  * Image Scaling Functions
   3  * Copyright (c) 2011 David A. Schleef <ds@schleef.org>
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  19  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  23  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  24  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  25  * POSSIBILITY OF SUCH DAMAGE.
  26  */
  27 /*
  28  *
  29  * Modified Lanczos scaling algorithm
  30  * ==================================
  31  *
  32  * This algorithm was developed by the author.  The primary goals of
  33  * the algorithm are high-quality video downscaling for medium scale
  34  * factors (in the range of 1.3x to 5.0x) using methods that can be
  35  * converted to SIMD code.  Concerns with existing algorithms were
  36  * mainly related to either over-soft filtering (Lanczos) or aliasing
  37  * (bilinear or any other method with inadequate sampling).
  38  *
  39  * The problems with bilinear scaling are apparent when downscaling
  40  * more than a factor of 2.  For example, when downscaling by a factor
  41  * of 3, only two-thirds of the input pixels contribute to the output
  42  * pixels.  This is only considering scaling in one direction; after
  43  * scaling both vertically and horizontally in a 2-D image, fewer than
  44  * half of the input pixels contribute to the output, so it should not
  45  * be surprising that the output is suboptimal.
  46  *
  47  * The problems with Lanczos scaling are more subtle.  From a theoretical
  48  * perspective, Lanczos is an optimal algorithm for resampling equally-
  49  * spaced values.  This theoretical perspective is based on analysis
  50  * done in frequency space, thus, Lanczos works very well for audio
  51  * resampling, since the ear hears primarily in frequency space.  The
  52  * human visual system is sensitive primarily in the spatial domain,
  53  * therefore any resampling algorithm should take this into account.
  54  * This difference is immediately clear in the size of resampling
  55  * window or envelope that is chosen for resampling: for audio, an
  56  * envelope of a=64 is typical, in image scaling, the envelope is
  57  * usually a=2 or a=3.
  58  *
  59  * One result of the HVS being sensitive in the spatial domain (and
  60  * also probably due to oversampling capabilities of the retina and
  61  * visual cortex) is that it is less sensitive to the exact magnitude
  62  * of high-frequency visual signals than to the appropriate amount of
  63  * energy in the nearby frequency band.  A Lanczos kernel with a=2
  64  * or a=3 strongly decreases the amount of energy in the high frequency
  65  * bands.  The energy in this area can be increased by increasing a,
  66  * which brings in energy from different areas of the image (bad for
  67  * reasons mentioned above), or by oversampling the input data.  We
  68  * have chosen two methods for doing the latter.  Firstly, there is
  69  * a sharpness parameter, which increases the cutoff frequency of the
  70  * filter, aliasing higher frequency noise into the passband.  And
  71  * secondly, there is the sharpen parameter, which increases the
  72  * contribution of high-frequency (but in-band) components.
  73  *
  74  * An alternate explanation of the usefulness of a sharpening filter
  75  * is that many natural images have a roughly 1/f spectrum.  In order
  76  * for a downsampled image to look more "natural" when high frequencies
  77  * are removed, the frequencies in the pass band near the cutoff
  78  * frequency are amplified, causing the spectrum to be more roughly
  79  * 1/f.  I said "roughly", not "literally".
  80  *
  81  * This alternate explanation is useful for understanding the author's
  82  * secondary motivation for developing this algorithm, namely, as a
  83  * method of video compression.  Several recent techniques (such as
  84  * HTTP Live Streaming and SVC) use image scaling as a method to get
  85  * increased compression out of nominally non-scalable codecs such as
  86  * H.264.  For optimal quality, it is thusly important to consider
  87  * the scaler and encoder as a combined unit.  Tuning of the sharpness
  88  * and sharpen parameters was performed using the Toro encoder tuner,
  89  * where scaled and encoded video was compared to unscaled and encoded
  90  * video.  This tuning suggested values that were very close to the
  91  * values chosen by manual inspection of scaled images and video.
  92  *
  93  * The optimal values of sharpen and sharpness were slightly different
  94  * depending whether the comparison was still images or video.  Video
  95  * comparisons were more sensitive to aliasing, since the aliasing
  96  * artifacts tended to move or "crawl" around the video.  The default
  97  * values are for video; image scaling may prefer higher values.
  98  *
  99  * A number of related techniques were rejected for various reasons.
 100  * An early technique of selecting the sharpness factor locally based
  101  * on edge detection (in order to use higher sharpness values without
 102  * the corresponding aliasing on edges) worked very well for still
 103  * images, but caused too much "crawling" on textures in video.  Also,
 104  * this method is slow, as it does not parallelize well.
 105  *
 106  * Non-separable techniques were rejected because the fastest would
 107  * have been at least 4x slower.
 108  *
 109  * It is infrequently appreciated that image scaling should ideally be
 110  * done in linear light space.  Converting to linear light space has
 111  * a similar effect to a sharpening filter.  This approach was not
 112  * taken because the added benefit is minor compared to the additional
  113  * computational cost.  Moreover, the benefit is decreased by increasing
 114  * the strength of the sharpening filter.
 115  *
 116  */
 117 #ifdef HAVE_CONFIG_H
 118 #include "config.h"
 119 #endif
 120
 121 #include <string.h>
 122
 123 #include "vs_scanline.h"
 124 #include "vs_image.h"
 125
 126 #include "gstvideoscaleorc.h"
 127 #include <gst/gst.h>
 128 #include <math.h>
 129
  /* Non-zero when x lies outside the closed interval [a, b]. */
  130 #define NEED_CLAMP(x,a,b) ((x) < (a) || (x) > (b))
  131
  /* Round x up to the next multiple of 2, 4 or 8: add (multiple - 1), then
   * clear the low bits. */
  132 #define ROUND_UP_2(x)  (((x)+1)&~1)
  133 #define ROUND_UP_4(x)  (((x)+3)&~3)
  134 #define ROUND_UP_8(x)  (((x)+7)&~7)
  135
  /* Address of row i of the source image.  Expands to an expression that
   * uses a variable named `scale` from the enclosing scope. */
  136 #define SRC_LINE(i) (scale->src->pixels + scale->src->stride * (i))
  137
  /* Row i of the intermediate buffer scale->tmpdata, viewed as an array of
   * the named element type.  A row is dest->width elements long; the _AYUV
   * variants use 4 * dest->width elements per row.  These macros also rely
   * on a variable named `scale` being in scope. */
  138 #define TMP_LINE_S16(i) ((gint16 *)scale->tmpdata + (i)*(scale->dest->width))
  139 #define TMP_LINE_S32(i) ((gint32 *)scale->tmpdata + (i)*(scale->dest->width))
  140 #define TMP_LINE_FLOAT(i) ((float *)scale->tmpdata + (i)*(scale->dest->width))
  141 #define TMP_LINE_DOUBLE(i) ((double *)scale->tmpdata + (i)*(scale->dest->width))
  142 #define TMP_LINE_S16_AYUV(i) ((gint16 *)scale->tmpdata + (i)*4*(scale->dest->width))
  143 #define TMP_LINE_S32_AYUV(i) ((gint32 *)scale->tmpdata + (i)*4*(scale->dest->width))
  144 #define TMP_LINE_FLOAT_AYUV(i) ((float *)scale->tmpdata + (i)*4*(scale->dest->width))
  145 #define TMP_LINE_DOUBLE_AYUV(i) ((double *)scale->tmpdata + (i)*4*(scale->dest->width))
  146
  /* Pointer a advanced by b bytes, yielded as void *. */
  147 #define PTR_OFFSET(a,b) ((void *)((char *)(a) + (b)))
 148
  /* Signature of the horizontal resampler stored in
   * Scale.horiz_resample_func.  The functions generated by the
   * RESAMPLE_HORIZ* macros in this file take the same arguments with
   * concrete pointer types: dest receives n outputs, offsets[i] is the first
   * source element used for output i, and taps holds one set of taps per
   * output.  n_taps and shift are honoured by the "generic" variants; the
   * variants instantiated with literal tap counts and shifts ignore them. */
  149 typedef void (*HorizResampleFunc) (void *dest, const gint32 * offsets,
  150     const void *taps, const void *src, int n_taps, int shift, int n);
 151
  /* Per-axis resampling state: geometry derived from the source and
   * destination sizes, plus the tap tables built by
   * scale1d_calculate_taps() and released by scale1d_cleanup(). */
  152 typedef struct _Scale1D Scale1D;
  153 struct _Scale1D
  154 {
        /* NOTE(review): not written by any code visible in this part of the
         * file -- confirm whether it is still used. */
  155   int n;
        /* Source-space position of destination element 0
         * (scale / 2 - 0.5). */
  156   double offset;
        /* Source size divided by destination size. */
  157   double scale;
  158
        /* Factor applied to the distance before it is passed to sinc():
         * sharpness, divided by `scale` when downscaling. */
  159   double fx;
        /* Factor applied to the distance before it is passed to envelope():
         * fx / a. */
  160   double ex;
        /* Half-width of the filter window in source elements:
         * ceil (a / fx). */
  161   int dx;
  162
        /* Number of taps stored per destination element. */
  163   int n_taps;
        /* One entry per destination element: index of the first source
         * element its taps apply to. */
  164   gint32 *offsets;
        /* n_taps entries per destination element.  The element type depends
         * on which scale1d_calculate_taps* function filled it in (double,
         * float, gint32 or gint16). */
  165   void *taps;
  166 };
 167
  /* State for one image scaling operation: the two images, the filter
   * options, scratch memory and the per-axis tap tables. */
  168 typedef struct _Scale Scale;
  169 struct _Scale
  170 {
        /* Destination and source images (not modified through these
         * pointers; the pixel data they reference is). */
  171   const VSImage *dest;
  172   const VSImage *src;
  173
        /* Presumably the sharpness/dither arguments given to the
         * vs_image_scale_lanczos_* entry points -- confirm in the code
         * that fills this struct. */
  174   double sharpness;
  175   gboolean dither;
  176
        /* Intermediate line storage, addressed through the TMP_LINE_*
         * macros. */
  177   void *tmpdata;
  178
        /* Horizontal resampler used to fill lines of tmpdata. */
  179   HorizResampleFunc horiz_resample_func;
  180
        /* Tap tables for the horizontal (x) and vertical (y) directions. */
  181   Scale1D x_scale1d;
  182   Scale1D y_scale1d;
  183 };
 184
  /* Forward declarations of the per-format, per-precision back-ends.  The
   * public vs_image_scale_lanczos_Y/_AYUV/_AYUV64 entry points further down
   * select one of these (by `submethod` for Y and AYUV; AYUV64 always uses
   * the double variant). */
  185 static void
  186 vs_image_scale_lanczos_Y_int16 (const VSImage * dest, const VSImage * src,
  187     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
  188     double sharpen);
  189 static void vs_image_scale_lanczos_Y_int32 (const VSImage * dest,
  190     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  191     double a, double sharpen);
  192 static void vs_image_scale_lanczos_Y_float (const VSImage * dest,
  193     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  194     double a, double sharpen);
  195 static void vs_image_scale_lanczos_Y_double (const VSImage * dest,
  196     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  197     double a, double sharpen);
  198 static void
  199 vs_image_scale_lanczos_AYUV_int16 (const VSImage * dest, const VSImage * src,
  200     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
  201     double sharpen);
  202 static void vs_image_scale_lanczos_AYUV_int32 (const VSImage * dest,
  203     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  204     double a, double sharpen);
  205 static void vs_image_scale_lanczos_AYUV_float (const VSImage * dest,
  206     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  207     double a, double sharpen);
  208 static void vs_image_scale_lanczos_AYUV_double (const VSImage * dest,
  209     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  210     double a, double sharpen);
  211 static void vs_image_scale_lanczos_AYUV64_double (const VSImage * dest,
  212     const VSImage * src, uint8_t * tmpbuf, double sharpness, gboolean dither,
  213     double a, double sharpen);
 214
 215 static double
 216 sinc (double x)
 217 {
 218   if (x == 0)
 219     return 1;
 220   return sin (G_PI * x) / (G_PI * x);
 221 }
 222
 223 static double
 224 envelope (double x)
 225 {
 226   if (x <= -1 || x >= 1)
 227     return 0;
 228   return sinc (x);
 229 }
 230
 231 static int
 232 scale1d_get_n_taps (int src_size, int dest_size, double a, double sharpness)
 233 {
 234   double scale;
 235   double fx;
 236   int dx;
 237
 238   scale = src_size / (double) dest_size;
 239   if (scale > 1.0) {
 240     fx = (1.0 / scale) * sharpness;
 241   } else {
 242     fx = (1.0) * sharpness;
 243   }
 244   dx = ceil (a / fx);
 245
 246   return 2 * dx;
 247 }
 248
 249 static void
 250 scale1d_cleanup (Scale1D * scale)
 251 {
 252   g_free (scale->taps);
 253   g_free (scale->offsets);
 254 }
 255
  256 /*
  257  * Calculates a set of taps for each destination element in double
  258  * format.  Each set of taps sums to 1.0.
  259  *
  260  */
  /*
   * Fills in the geometry fields of `scale` and allocates scale->taps
   * (n_taps doubles per destination element) and scale->offsets (one gint32
   * per destination element) with g_malloc(); scale1d_cleanup() frees both.
   *
   * n_taps must be at least 2 * ceil (a / fx) (asserted below);
   * scale1d_get_n_taps() computes exactly that minimum.  `sharpen` scales an
   * extra envelope term that is subtracted from every tap before the set is
   * normalized.
   *
   * NOTE(review): the edge handling below appears to assume
   * src_size >= n_taps.  With a smaller source, `shift` can reach n_taps and
   * taps[shift] would index past the current set -- confirm that callers
   * guarantee this.
   */
  261 static void
  262 scale1d_calculate_taps (Scale1D * scale, int src_size, int dest_size,
  263     int n_taps, double a, double sharpness, double sharpen)
  264 {
  265   int j;
  266   double *tap_array;
  267   gint32 *offsets;
  268   double scale_offset;
  269   double scale_increment;
  270   int dx;
  271   double fx;
  272   double ex;
  273
  274   scale->scale = src_size / (double) dest_size;
  275   scale->offset = scale->scale / 2 - 0.5;
  276
        /* When downscaling, the sinc argument is shrunk by the scale factor;
         * otherwise only `sharpness` is applied. */
  277   if (scale->scale > 1.0) {
  278     scale->fx = (1.0 / scale->scale) * sharpness;
  279   } else {
  280     scale->fx = (1.0) * sharpness;
  281   }
  282   scale->ex = scale->fx / a;
  283   scale->dx = ceil (a / scale->fx);
  284
  285   g_assert (n_taps >= 2 * scale->dx);
  286   scale->n_taps = n_taps;
  287
  288   scale->taps = g_malloc (sizeof (double) * scale->n_taps * dest_size);
  289   scale->offsets = g_malloc (sizeof (gint32) * dest_size);
  290   tap_array = scale->taps;
  291   offsets = scale->offsets;
  292
  293   scale_offset = scale->offset;
  294   scale_increment = scale->scale;
  295   dx = scale->dx;
  296   fx = scale->fx;
  297   ex = scale->ex;
  298
  299   for (j = 0; j < dest_size; j++) {
  300     double x;
  301     int xi;
  302     int l;
  303     double weight;
  304     double *taps;
  305
          /* x is the source-space position of destination element j, clamped
           * to [0, src_size]; xi is the first source index covered by this
           * element's taps. */
  306     x = scale_offset + scale_increment * j;
  307     x = CLAMP (x, 0, src_size);
  308     xi = ceil (x) - dx;
  309
  310     offsets[j] = xi;
  311     weight = 0;
  312     taps = tap_array + j * n_taps;
  313
  314     for (l = 0; l < n_taps; l++) {
  315       int xl = xi + l;
  316       taps[l] = sinc ((x - xl) * fx) * envelope ((x - xl) * ex);
  317       taps[l] -= sharpen * envelope ((x - xl) * ex);
  318       weight += taps[l];
  319     }
          /* The envelope must already be zero one element outside the tap
           * window on either side. */
  320     g_assert (envelope ((x - (xi - 1)) * ex) == 0);
  321     g_assert (envelope ((x - (xi + n_taps)) * ex) == 0);
          /* Normalize so that the set sums to 1.0. */
  322     for (l = 0; l < n_taps; l++) {
  323       taps[l] /= weight;
  324     }
  325
          /* Window starts before source index 0: add the weights of the
           * out-of-range taps to the tap for index 0, slide the set down by
           * `shift`, zero the vacated tail and move the offset to 0. */
  326     if (xi < 0) {
  327       int shift = -xi;
  328
  329       for (l = 0; l < shift; l++) {
  330         taps[shift] += taps[l];
  331       }
  332       for (l = 0; l < n_taps - shift; l++) {
  333         taps[l] = taps[shift + l];
  334       }
  335       for (; l < n_taps; l++) {
  336         taps[l] = 0;
  337       }
  338       offsets[j] += shift;
  339     }
  340
          /* Window runs past the last source element: add the weights of the
           * out-of-range taps to the last in-range tap, slide the set up by
           * `shift`, zero the vacated head and pull the offset back.  The
           * test uses the unadjusted xi, not offsets[j]. */
  341     if (xi > src_size - n_taps) {
  342       int shift = xi - (src_size - n_taps);
  343
  344       for (l = 0; l < shift; l++) {
  345         taps[n_taps - shift - 1] += taps[n_taps - shift + l];
  346       }
  347       for (l = 0; l < n_taps - shift; l++) {
  348         taps[n_taps - 1 - l] = taps[n_taps - 1 - shift - l];
  349       }
  350       for (l = 0; l < shift; l++) {
  351         taps[l] = 0;
  352       }
  353       offsets[j] -= shift;
  354     }
  355   }
  356 }
 357
 358 /*
 359  * Calculates a set of taps for each destination element in float
 360  * format.  Each set of taps sums to 1.0.
 361  */
 362 static void
 363 scale1d_calculate_taps_float (Scale1D * scale, int src_size, int dest_size,
 364     int n_taps, double a, double sharpness, double sharpen)
 365 {
 366   double *taps_d;
 367   float *taps_f;
 368   int j;
 369
 370   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
 371       sharpen);
 372
 373   taps_d = scale->taps;
 374   taps_f = g_malloc (sizeof (float) * scale->n_taps * dest_size);
 375
 376   for (j = 0; j < dest_size * n_taps; j++) {
 377     taps_f[j] = taps_d[j];
 378   }
 379
 380   g_free (taps_d);
 381   scale->taps = taps_f;
 382 }
 383
 384 /*
 385  * Calculates a set of taps for each destination element in gint32
 386  * format.  Each set of taps sums to (very nearly) (1<<shift).  A
 387  * typical value for shift is 10 to 15, so that applying the taps to
 388  * uint8 values and summing will fit in a (signed) int32.
 389  */
 390 static void
 391 scale1d_calculate_taps_int32 (Scale1D * scale, int src_size, int dest_size,
 392     int n_taps, double a, double sharpness, double sharpen, int shift)
 393 {
 394   double *taps_d;
 395   gint32 *taps_i;
 396   int i;
 397   int j;
 398   double multiplier;
 399
 400   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
 401       sharpen);
 402
 403   taps_d = scale->taps;
 404   taps_i = g_malloc (sizeof (gint32) * scale->n_taps * dest_size);
 405
 406   multiplier = (1 << shift);
 407
 408   for (j = 0; j < dest_size; j++) {
 409     for (i = 0; i < n_taps; i++) {
 410       taps_i[j * n_taps + i] =
 411           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
 412     }
 413   }
 414
 415   g_free (taps_d);
 416   scale->taps = taps_i;
 417 }
 418
  419 /*
  420  * Calculates a set of taps for each destination element in gint16
  421  * format.  Each set of taps sums to (1<<shift).  A typical value
  422  * for shift is 7, so that applying the taps to uint8 values and
  423  * summing will fit in a (signed) int16.
  424  */
  /*
   * The double-precision table produced by scale1d_calculate_taps() is
   * converted to gint16 and then freed; scale->taps points at the gint16
   * table on return.  Only the `#if 1` conversion below is compiled; the
   * `#if 0` variants are kept as a record of the alternatives that were
   * measured.
   *
   * NOTE(review): the active search stops at the first bias for which the
   * sum is >= (1 << shift).  If no bias in 0.00..0.99 reaches that, or the
   * first hit overshoots, the set does not sum to exactly (1 << shift) --
   * confirm whether that can happen for the parameters in use.
   */
  425 static void
  426 scale1d_calculate_taps_int16 (Scale1D * scale, int src_size, int dest_size,
  427     int n_taps, double a, double sharpness, double sharpen, int shift)
  428 {
  429   double *taps_d;
  430   gint16 *taps_i;
  431   int i;
  432   int j;
  433   double multiplier;
  434
  435   scale1d_calculate_taps (scale, src_size, dest_size, n_taps, a, sharpness,
  436       sharpen);
  437
  438   taps_d = scale->taps;
  439   taps_i = g_malloc (sizeof (gint16) * scale->n_taps * dest_size);
  440
  441   multiplier = (1 << shift);
  442
  443   /* Various methods for converting floating point taps to integer.
  444    * The dB values are the SSIM value between scaling an image via
  445    * the floating point pathway vs. the integer pathway using the
  446    * given code to generate the taps.  Only one image was tested,
  447    * scaling from 1920x1080 to 640x360.  Several variations of the
  448    * methods were also tested, with nothing appearing useful.  */
  449 #if 0
  450   /* Standard round to integer.  This causes bad DC errors. */
  451   /* 44.588 dB */
  452   for (j = 0; j < dest_size; j++) {
  453     for (i = 0; i < n_taps; i++) {
  454       taps_i[j * n_taps + i] =
  455           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
  456     }
  457   }
  458 #endif
  459 #if 0
  460   /* Dithering via error propagation.  Works pretty well, but
  461    * really we want to propagate errors across rows, which would
  462    * mean having several sets of tap arrays.  Possible, but more work,
  463    * and it may not even be better. */
  464   /* 57.0961 dB */
  465   {
  466     double err = 0;
  467     for (j = 0; j < dest_size; j++) {
  468       for (i = 0; i < n_taps; i++) {
  469         err += taps_d[j * n_taps + i] * multiplier;
  470         taps_i[j * n_taps + i] = floor (err);
  471         err -= floor (err);
  472       }
  473     }
  474   }
  475 #endif
  476 #if 1
  477   /* Round to integer, but with an adjustable bias that we use to
  478    * eliminate the DC error.  This search method is a bit crude, and
  479    * could perhaps be improved somewhat. */
  480   /* 60.4851 dB */
        /* For each tap set, try biases 0.00, 0.01, ... 0.99 in turn and keep
         * the first one for which the integer taps sum to at least
         * (1 << shift). */
  481   for (j = 0; j < dest_size; j++) {
  482     int k;
  483     for (k = 0; k < 100; k++) {
  484       int sum = 0;
  485       double offset;
  486
  487       offset = k * 0.01;
  488       for (i = 0; i < n_taps; i++) {
  489         taps_i[j * n_taps + i] =
  490             floor (offset + taps_d[j * n_taps + i] * multiplier);
  491         sum += taps_i[j * n_taps + i];
  492       }
  493
  494       if (sum >= (1 << shift))
  495         break;
  496     }
  497   }
  498 #endif
  499 #if 0
  500   /* Round to integer, but adjust the multiplier.  The search method is
  501    * wrong a lot, but was sufficient enough to calculate dB error. */
  502   /* 58.6517 dB */
  503   for (j = 0; j < dest_size; j++) {
  504     int k;
  505     int sum = 0;
  506     for (k = 0; k < 200; k++) {
  507       sum = 0;
  508
  509       multiplier = (1 << shift) - 1.0 + k * 0.01;
  510       for (i = 0; i < n_taps; i++) {
  511         taps_i[j * n_taps + i] =
  512             floor (0.5 + taps_d[j * n_taps + i] * multiplier);
  513         sum += taps_i[j * n_taps + i];
  514       }
  515
  516       if (sum >= (1 << shift))
  517         break;
  518     }
  519     if (sum != (1 << shift)) {
  520       GST_ERROR ("%g %d", multiplier, sum);
  521     }
  522   }
  523 #endif
  524 #if 0
  525   /* Round to integer, but subtract the error from the largest tap */
  526   /* 58.3677 dB */
  527   for (j = 0; j < dest_size; j++) {
  528     int err = -multiplier;
  529     for (i = 0; i < n_taps; i++) {
  530       taps_i[j * n_taps + i] =
  531           floor (0.5 + taps_d[j * n_taps + i] * multiplier);
  532       err += taps_i[j * n_taps + i];
  533     }
  534     if (taps_i[j * n_taps + (n_taps / 2 - 1)] >
  535         taps_i[j * n_taps + (n_taps / 2)]) {
  536       taps_i[j * n_taps + (n_taps / 2 - 1)] -= err;
  537     } else {
  538       taps_i[j * n_taps + (n_taps / 2)] -= err;
  539     }
  540   }
  541 #endif
  542
        /* The double table is replaced by the gint16 table. */
  543   g_free (taps_d);
  544   scale->taps = taps_i;
  545 }
 546
 547
 548 void
 549 vs_image_scale_lanczos_Y (const VSImage * dest, const VSImage * src,
 550     uint8_t * tmpbuf, double sharpness, gboolean dither, int submethod,
 551     double a, double sharpen)
 552 {
 553   switch (submethod) {
 554     case 0:
 555     default:
 556       vs_image_scale_lanczos_Y_int16 (dest, src, tmpbuf, sharpness, dither, a,
 557           sharpen);
 558       break;
 559     case 1:
 560       vs_image_scale_lanczos_Y_int32 (dest, src, tmpbuf, sharpness, dither, a,
 561           sharpen);
 562       break;
 563     case 2:
 564       vs_image_scale_lanczos_Y_float (dest, src, tmpbuf, sharpness, dither, a,
 565           sharpen);
 566       break;
 567     case 3:
 568       vs_image_scale_lanczos_Y_double (dest, src, tmpbuf, sharpness, dither, a,
 569           sharpen);
 570       break;
 571   }
 572 }
 573
 574 void
 575 vs_image_scale_lanczos_AYUV (const VSImage * dest, const VSImage * src,
 576     uint8_t * tmpbuf, double sharpness, gboolean dither, int submethod,
 577     double a, double sharpen)
 578 {
 579   switch (submethod) {
 580     case 0:
 581     default:
 582       vs_image_scale_lanczos_AYUV_int16 (dest, src, tmpbuf, sharpness, dither,
 583           a, sharpen);
 584       break;
 585     case 1:
 586       vs_image_scale_lanczos_AYUV_int32 (dest, src, tmpbuf, sharpness, dither,
 587           a, sharpen);
 588       break;
 589     case 2:
 590       vs_image_scale_lanczos_AYUV_float (dest, src, tmpbuf, sharpness, dither,
 591           a, sharpen);
 592       break;
 593     case 3:
 594       vs_image_scale_lanczos_AYUV_double (dest, src, tmpbuf, sharpness, dither,
 595           a, sharpen);
 596       break;
 597   }
 598 }
 599
 600 void
 601 vs_image_scale_lanczos_AYUV64 (const VSImage * dest, const VSImage * src,
 602     uint8_t * tmpbuf, double sharpness, gboolean dither, int submethod,
 603     double a, double sharpen)
 604 {
 605   vs_image_scale_lanczos_AYUV64_double (dest, src, tmpbuf, sharpness, dither,
 606       a, sharpen);
 607 }
 608
 609
 610
  /*
   * RESAMPLE_HORIZ_FLOAT: defines a static function named `function` that
   * produces n output samples.  Output i is the sum over k < _n_taps of
   * src[offsets[i] + k] * taps[i * _n_taps + k], accumulated in dest_type and
   * stored in dest[i] without rounding or clamping.
   *
   * When an instantiation passes the identifier n_taps as _n_taps, the tap
   * count is the generated function's run-time n_taps argument.  The shift
   * argument of the generated function is not used.
   */
  611 #define RESAMPLE_HORIZ_FLOAT(function, dest_type, tap_type, src_type, _n_taps) \
  612 static void \
  613 function (dest_type *dest, const gint32 *offsets, \
  614     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
  615 { \
  616   int i; \
  617   int k; \
  618   dest_type sum; \
  619   const src_type *srcline; \
  620   const tap_type *tapsline; \
  621   for (i = 0; i < n; i++) { \
  622     srcline = src + offsets[i]; \
  623     tapsline = taps + i * _n_taps; \
  624     sum = 0; \
  625     for (k = 0; k < _n_taps; k++) { \
  626       sum += srcline[k] * tapsline[k]; \
  627     } \
  628     dest[i] = sum; \
  629   } \
  630 }
 631
  /*
   * RESAMPLE_HORIZ: integer counterpart of RESAMPLE_HORIZ_FLOAT.  Defines a
   * static function named `function` that, for each of n outputs, sums
   * src[offsets[i] + k] * taps[i * _n_taps + k] over k < _n_taps, adds half of
   * (1 << _shift) for rounding when _shift > 0, and stores the sum shifted
   * right by _shift in dest[i].
   *
   * _n_taps and _shift may be literals (fixed-size variants) or the
   * identifiers n_taps / shift, in which case the generated function's
   * run-time arguments are used.  The accumulator has type dest_type, so the
   * gint16 instantiations accumulate in 16 bits.
   */
  632 #define RESAMPLE_HORIZ(function, dest_type, tap_type, src_type, _n_taps, _shift) \
  633 static void \
  634 function (dest_type *dest, const gint32 *offsets, \
  635     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
  636 { \
  637   int i; \
  638   int k; \
  639   dest_type sum; \
  640   const src_type *srcline; \
  641   const tap_type *tapsline; \
  642   int offset; \
  643   if (_shift > 0) offset = (1<<_shift)>>1; \
  644   else offset = 0; \
  645   for (i = 0; i < n; i++) { \
  646     srcline = src + offsets[i]; \
  647     tapsline = taps + i * _n_taps; \
  648     sum = 0; \
  649     for (k = 0; k < _n_taps; k++) { \
  650       sum += srcline[k] * tapsline[k]; \
  651     } \
  652     dest[i] = (sum + offset) >> _shift; \
  653   } \
  654 }
 655
  /*
   * RESAMPLE_HORIZ_AYUV_FLOAT: four-component version of
   * RESAMPLE_HORIZ_FLOAT.  Source and destination hold 4 interleaved values
   * per pixel, so pixel offsets are multiplied by 4.  The same tap set
   * (taps + i * _n_taps) is applied to each of the four components of output
   * pixel i, and the four sums are stored unrounded in dest[i*4 .. i*4+3].
   * The shift argument of the generated function is not used.
   */
  656 #define RESAMPLE_HORIZ_AYUV_FLOAT(function, dest_type, tap_type, src_type, _n_taps) \
  657 static void \
  658 function (dest_type *dest, const gint32 *offsets, \
  659     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
  660 { \
  661   int i; \
  662   int k; \
  663   dest_type sum1; \
  664   dest_type sum2; \
  665   dest_type sum3; \
  666   dest_type sum4; \
  667   const src_type *srcline; \
  668   const tap_type *tapsline; \
  669   for (i = 0; i < n; i++) { \
  670     srcline = src + 4*offsets[i]; \
  671     tapsline = taps + i * _n_taps; \
  672     sum1 = 0; \
  673     sum2 = 0; \
  674     sum3 = 0; \
  675     sum4 = 0; \
  676     for (k = 0; k < _n_taps; k++) { \
  677       sum1 += srcline[k*4+0] * tapsline[k]; \
  678       sum2 += srcline[k*4+1] * tapsline[k]; \
  679       sum3 += srcline[k*4+2] * tapsline[k]; \
  680       sum4 += srcline[k*4+3] * tapsline[k]; \
  681     } \
  682     dest[i*4+0] = sum1; \
  683     dest[i*4+1] = sum2; \
  684     dest[i*4+2] = sum3; \
  685     dest[i*4+3] = sum4; \
  686   } \
  687 }
 688
  /*
   * RESAMPLE_HORIZ_AYUV: four-component version of RESAMPLE_HORIZ.  Source
   * and destination hold 4 interleaved values per pixel, so pixel offsets
   * are multiplied by 4.  The same tap set is applied to each component;
   * every sum gets the rounding offset (half of 1 << _shift when
   * _shift > 0) and is shifted right by _shift before being stored in
   * dest[i*4 .. i*4+3].  The accumulators have type dest_type.
   */
  689 #define RESAMPLE_HORIZ_AYUV(function, dest_type, tap_type, src_type, _n_taps, _shift) \
  690 static void \
  691 function (dest_type *dest, const gint32 *offsets, \
  692     const tap_type *taps, const src_type *src, int n_taps, int shift, int n) \
  693 { \
  694   int i; \
  695   int k; \
  696   dest_type sum1; \
  697   dest_type sum2; \
  698   dest_type sum3; \
  699   dest_type sum4; \
  700   const src_type *srcline; \
  701   const tap_type *tapsline; \
  702   int offset; \
  703   if (_shift > 0) offset = (1<<_shift)>>1; \
  704   else offset = 0; \
  705   for (i = 0; i < n; i++) { \
  706     srcline = src + 4*offsets[i]; \
  707     tapsline = taps + i * _n_taps; \
  708     sum1 = 0; \
  709     sum2 = 0; \
  710     sum3 = 0; \
  711     sum4 = 0; \
  712     for (k = 0; k < _n_taps; k++) { \
  713       sum1 += srcline[k*4+0] * tapsline[k]; \
  714       sum2 += srcline[k*4+1] * tapsline[k]; \
  715       sum3 += srcline[k*4+2] * tapsline[k]; \
  716       sum4 += srcline[k*4+3] * tapsline[k]; \
  717     } \
  718     dest[i*4+0] = (sum1 + offset) >> _shift; \
  719     dest[i*4+1] = (sum2 + offset) >> _shift; \
  720     dest[i*4+2] = (sum3 + offset) >> _shift; \
  721     dest[i*4+3] = (sum4 + offset) >> _shift; \
  722   } \
  723 }
 724
  725 /* *INDENT-OFF* */
  /* Floating-point horizontal resamplers for 8-bit sources.  Passing the
   * identifier n_taps makes the tap count a run-time argument. */
  726 RESAMPLE_HORIZ_FLOAT (resample_horiz_double_u8_generic, double, double,
  727     guint8, n_taps)
  728 RESAMPLE_HORIZ_FLOAT (resample_horiz_float_u8_generic, float, float,
  729     guint8, n_taps)
  730 RESAMPLE_HORIZ_AYUV_FLOAT (resample_horiz_double_ayuv_generic, double, double,
  731     guint8, n_taps)
  732 RESAMPLE_HORIZ_AYUV_FLOAT (resample_horiz_float_ayuv_generic, float, float,
  733     guint8, n_taps)
  734
  /* Four-component double resampler reading guint16 source values (the
   * "_s16" suffix notwithstanding). */
  735 RESAMPLE_HORIZ_AYUV_FLOAT (resample_horiz_double_ayuv_generic_s16, double, double,
  736     guint16, n_taps)
  737
  /* Integer resamplers with run-time tap count and shift. */
  738 RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_generic, gint32, gint32,
  739     guint8, n_taps, shift)
  740 RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_generic, gint16, gint16,
  741     guint8, n_taps, shift)
  742 RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_generic, gint32, gint32,
  743     guint8, n_taps, shift)
  744 RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_generic, gint16, gint16,
  745     guint8, n_taps, shift)
  746
  /* Integer resamplers specialised for a literal tap count (16, 12, 8 or 4)
   * and a shift of 0; their n_taps and shift arguments are ignored. */
  747 /* Candidates for orcification */
  748 RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps16_shift0, gint32, gint32,
  749     guint8, 16, 0)
  750 RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps12_shift0, gint32, gint32,
  751     guint8, 12, 0)
  752 RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps8_shift0, gint32, gint32,
  753     guint8, 8, 0)
  754 RESAMPLE_HORIZ (resample_horiz_int32_int32_u8_taps4_shift0, gint32, gint32,
  755     guint8, 4, 0)
  756 RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps16_shift0, gint16, gint16,
  757     guint8, 16, 0)
  758 RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps12_shift0, gint16, gint16,
  759     guint8, 12, 0)
  760 RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps8_shift0, gint16, gint16,
  761     guint8, 8, 0)
  762 RESAMPLE_HORIZ (resample_horiz_int16_int16_u8_taps4_shift0, gint16, gint16,
  763     guint8, 4, 0)
  764
  765 RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps16_shift0, gint32, gint32,
  766     guint8, 16, 0)
  767 RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps12_shift0, gint32, gint32,
  768     guint8, 12, 0)
  769 RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps8_shift0, gint32, gint32,
  770     guint8, 8, 0)
  771 RESAMPLE_HORIZ_AYUV (resample_horiz_int32_int32_ayuv_taps4_shift0, gint32, gint32,
  772     guint8, 4, 0)
  773 RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps16_shift0, gint16, gint16,
  774     guint8, 16, 0)
  775 RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps12_shift0, gint16, gint16,
  776     guint8, 12, 0)
  777 RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps8_shift0, gint16, gint16,
  778     guint8, 8, 0)
  779 RESAMPLE_HORIZ_AYUV (resample_horiz_int16_int16_ayuv_taps4_shift0, gint16, gint16,
  780     guint8, 4, 0)
  781 /* *INDENT-ON* */
 782
  /*
   * RESAMPLE_VERT: defines a static function named `function` that writes n
   * guint8 outputs.  Output i is the sum over l < n_taps of
   * line_l[i] * taps[l], where line_l starts `stride * l` bytes after src;
   * half of (1 << _shift) is added for rounding, the sum is shifted right by
   * _shift and the result is clamped to 0..255.
   *
   * The loop bound is the generated function's n_taps argument; the _n_taps
   * macro parameter does not appear in the body.
   */
  783 #define RESAMPLE_VERT(function, tap_type, src_type, _n_taps, _shift) \
  784 static void \
  785 function (guint8 *dest, \
  786     const tap_type *taps, const src_type *src, int stride, int n_taps, \
  787     int shift, int n) \
  788 { \
  789   int i; \
  790   int l; \
  791   gint32 sum_y; \
  792   gint32 offset = (1<<_shift) >> 1; \
  793   for (i = 0; i < n; i++) { \
  794     sum_y = 0; \
  795     for (l = 0; l < n_taps; l++) { \
  796       const src_type *line = PTR_OFFSET(src, stride * l); \
  797       sum_y += line[i] * taps[l]; \
  798     } \
  799     dest[i] = CLAMP ((sum_y + offset) >> _shift, 0, 255); \
  800   } \
  801 }
 802
  /*
   * RESAMPLE_VERT_DITHER: like RESAMPLE_VERT, but instead of adding a fixed
   * rounding offset it carries the quantization remainder along the row.
   * Each filtered sum is added to err_y; the output is err_y >> _shift
   * clamped to 0..255, and only the low _shift bits of err_y are kept for
   * the next output.  err_y starts at 0 on every call.
   *
   * As in RESAMPLE_VERT, the _n_taps macro parameter does not appear in the
   * body; the loop uses the function's n_taps argument.
   */
  803 #define RESAMPLE_VERT_DITHER(function, tap_type, src_type, _n_taps, _shift) \
  804 static void \
  805 function (guint8 *dest, \
  806     const tap_type *taps, const src_type *src, int stride, int n_taps, \
  807     int shift, int n) \
  808 { \
  809   int i; \
  810   int l; \
  811   gint32 sum_y; \
  812   gint32 err_y = 0; \
  813   gint32 mask = (1<<_shift) - 1; \
  814   for (i = 0; i < n; i++) { \
  815     sum_y = 0; \
  816     for (l = 0; l < n_taps; l++) { \
  817       const src_type *line = PTR_OFFSET(src, stride * l); \
  818       sum_y += line[i] * taps[l]; \
  819     } \
  820     err_y += sum_y; \
  821     dest[i] = CLAMP (err_y >> _shift, 0, 255); \
  822     err_y &= mask; \
  823   } \
  824 }
 825
 826 /* *INDENT-OFF* */
 827 RESAMPLE_VERT (resample_vert_int32_generic, gint32, gint32, n_taps, shift)
 828 RESAMPLE_VERT_DITHER (resample_vert_dither_int32_generic, gint32, gint32,
 829     n_taps, shift)
 830 RESAMPLE_VERT (resample_vert_int16_generic, gint16, gint16, n_taps, shift);
 831 RESAMPLE_VERT_DITHER (resample_vert_dither_int16_generic, gint16, gint16,
 832     n_taps, shift)
 833 /* *INDENT-ON* */
 834
  /*
   * RESAMPLE_VERT_FLOAT: defines a static function named `function` that
   * writes n outputs of type dest_type.  Output i is the sum over
   * l < n_taps of line_l[i] * taps[l] (line_l starts `stride * l` bytes after
   * src), accumulated in src_type, rounded with floor (0.5 + sum) and
   * clamped to 0..clamp.
   *
   * The _n_taps and _shift macro parameters do not appear in the body, and
   * the generated function does not use its shift argument.
   */
  835 #define RESAMPLE_VERT_FLOAT(function, dest_type, clamp, tap_type, src_type, _n_taps, _shift) \
  836 static void \
  837 function (dest_type *dest, \
  838     const tap_type *taps, const src_type *src, int stride, int n_taps, \
  839     int shift, int n) \
  840 { \
  841   int i; \
  842   int l; \
  843   src_type sum_y; \
  844   for (i = 0; i < n; i++) { \
  845     sum_y = 0; \
  846     for (l = 0; l < n_taps; l++) { \
  847       const src_type *line = PTR_OFFSET(src, stride * l); \
  848       sum_y += line[i] * taps[l]; \
  849     } \
  850     dest[i] = CLAMP (floor(0.5 + sum_y), 0, clamp); \
  851   } \
  852 }
 853
  /*
   * RESAMPLE_VERT_FLOAT_DITHER: like RESAMPLE_VERT_FLOAT, but the rounding
   * remainder is carried along the row.  Each filtered sum is added to
   * err_y; the output is floor (err_y) clamped to 0..clamp, and the
   * fractional part of err_y is kept for the next output.  err_y starts at 0
   * on every call.
   *
   * The _n_taps and _shift macro parameters do not appear in the body, and
   * the generated function does not use its shift argument.
   */
  854 #define RESAMPLE_VERT_FLOAT_DITHER(function, dest_type, clamp, tap_type, src_type, _n_taps, _shift) \
  855 static void \
  856 function (dest_type *dest, \
  857     const tap_type *taps, const src_type *src, int stride, int n_taps, \
  858     int shift, int n) \
  859 { \
  860   int i; \
  861   int l; \
  862   src_type sum_y; \
  863   src_type err_y = 0; \
  864   for (i = 0; i < n; i++) { \
  865     sum_y = 0; \
  866     for (l = 0; l < n_taps; l++) { \
  867       const src_type *line = PTR_OFFSET(src, stride * l); \
  868       sum_y += line[i] * taps[l]; \
  869     } \
  870     err_y += sum_y; \
  871     dest[i] = CLAMP (floor (err_y), 0, clamp); \
  872     err_y -= floor (err_y); \
  873   } \
  874 }
 875
 876 /* *INDENT-OFF* */
 877 RESAMPLE_VERT_FLOAT (resample_vert_double_generic, guint8, 255, double, double, n_taps,
 878     shift)
 879 RESAMPLE_VERT_FLOAT_DITHER (resample_vert_dither_double_generic, guint8, 255, double, double,
 880     n_taps, shift)
 881
 882 RESAMPLE_VERT_FLOAT (resample_vert_double_generic_u16, guint16, 65535, double, double, n_taps,
 883     shift)
 884 RESAMPLE_VERT_FLOAT_DITHER (resample_vert_dither_double_generic_u16, guint16, 65535, double, double,
 885     n_taps, shift)
 886
 887 RESAMPLE_VERT_FLOAT (resample_vert_float_generic, guint8, 255, float, float, n_taps, shift)
 888 RESAMPLE_VERT_FLOAT_DITHER (resample_vert_dither_float_generic, guint8, 255, float, float,
 889     n_taps, shift)
 890 /* *INDENT-ON* */
 891
 892 #define S16_SHIFT1 7
 893 #define S16_SHIFT2 7
 894 #define S16_MIDSHIFT 0
 895 #define S16_POSTSHIFT (S16_SHIFT1+S16_SHIFT2-S16_MIDSHIFT)
 896
 897 static void
 898 vs_scale_lanczos_Y_int16 (Scale * scale)
 899 {
 900   int j;
 901   int yi;
 902   int tmp_yi;
 903
 904   tmp_yi = 0;
 905
 906   for (j = 0; j < scale->dest->height; j++) {
 907     guint8 *destline;
 908     gint16 *taps;
 909
 910     destline = scale->dest->pixels + scale->dest->stride * j;
 911
 912     yi = scale->y_scale1d.offsets[j];
 913
 914     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
 915       scale->horiz_resample_func (TMP_LINE_S16 (tmp_yi),
 916           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
 917           scale->x_scale1d.n_taps, S16_MIDSHIFT, scale->dest->width);
 918       tmp_yi++;
 919     }
 920
 921     taps = (gint16 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
 922     if (scale->dither) {
 923       resample_vert_dither_int16_generic (destline,
 924           taps, TMP_LINE_S16 (scale->y_scale1d.offsets[j]),
 925           sizeof (gint16) * scale->dest->width, scale->y_scale1d.n_taps,
 926           S16_POSTSHIFT, scale->dest->width);
 927     } else {
 928       resample_vert_int16_generic (destline,
 929           taps, TMP_LINE_S16 (scale->y_scale1d.offsets[j]),
 930           sizeof (gint16) * scale->dest->width, scale->y_scale1d.n_taps,
 931           S16_POSTSHIFT, scale->dest->width);
 932     }
 933   }
 934 }
 935
 936 void
 937 vs_image_scale_lanczos_Y_int16 (const VSImage * dest, const VSImage * src,
 938     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
 939     double sharpen)
 940 {
 941   Scale s = { 0 };
 942   Scale *scale = &s;
 943   int n_taps;
 944
 945   scale->dest = dest;
 946   scale->src = src;
 947
 948   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
 949   n_taps = ROUND_UP_4 (n_taps);
 950   scale1d_calculate_taps_int16 (&scale->x_scale1d,
 951       src->width, dest->width, n_taps, a, sharpness, sharpen, S16_SHIFT1);
 952
 953   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
 954   scale1d_calculate_taps_int16 (&scale->y_scale1d,
 955       src->height, dest->height, n_taps, a, sharpness, sharpen, S16_SHIFT2);
 956
 957   scale->dither = dither;
 958
 959   switch (scale->x_scale1d.n_taps) {
 960     case 4:
 961       scale->horiz_resample_func =
 962           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps4_shift0;
 963       break;
 964     case 8:
 965       scale->horiz_resample_func =
 966           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps8_shift0;
 967       break;
 968     case 12:
 969       scale->horiz_resample_func =
 970           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps12_shift0;
 971       break;
 972     case 16:
 973       scale->horiz_resample_func =
 974           (HorizResampleFunc) resample_horiz_int16_int16_u8_taps16_shift0;
 975       break;
 976     default:
 977       scale->horiz_resample_func =
 978           (HorizResampleFunc) resample_horiz_int16_int16_u8_generic;
 979       break;
 980   }
 981
 982   scale->tmpdata =
 983       g_malloc (sizeof (gint16) * scale->dest->width * scale->src->height);
 984
 985   vs_scale_lanczos_Y_int16 (scale);
 986
 987   scale1d_cleanup (&scale->x_scale1d);
 988   scale1d_cleanup (&scale->y_scale1d);
 989   g_free (scale->tmpdata);
 990 }
 991
 992
 993 #define S32_SHIFT1 11
 994 #define S32_SHIFT2 11
 995 #define S32_MIDSHIFT 0
 996 #define S32_POSTSHIFT (S32_SHIFT1+S32_SHIFT2-S32_MIDSHIFT)
 997
 998 static void
 999 vs_scale_lanczos_Y_int32 (Scale * scale)
1000 {
1001   int j;
1002   int yi;
1003   int tmp_yi;
1004
1005   tmp_yi = 0;
1006
1007   for (j = 0; j < scale->dest->height; j++) {
1008     guint8 *destline;
1009     gint32 *taps;
1010
1011     destline = scale->dest->pixels + scale->dest->stride * j;
1012
1013     yi = scale->y_scale1d.offsets[j];
1014
1015     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1016       scale->horiz_resample_func (TMP_LINE_S32 (tmp_yi),
1017           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1018           scale->x_scale1d.n_taps, S32_MIDSHIFT, scale->dest->width);
1019       tmp_yi++;
1020     }
1021
1022     taps = (gint32 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1023     if (scale->dither) {
1024       resample_vert_dither_int32_generic (destline,
1025           taps, TMP_LINE_S32 (scale->y_scale1d.offsets[j]),
1026           sizeof (gint32) * scale->dest->width,
1027           scale->y_scale1d.n_taps, S32_POSTSHIFT, scale->dest->width);
1028     } else {
1029       resample_vert_int32_generic (destline,
1030           taps, TMP_LINE_S32 (scale->y_scale1d.offsets[j]),
1031           sizeof (gint32) * scale->dest->width,
1032           scale->y_scale1d.n_taps, S32_POSTSHIFT, scale->dest->width);
1033     }
1034   }
1035 }
1036
1037 void
1038 vs_image_scale_lanczos_Y_int32 (const VSImage * dest, const VSImage * src,
1039     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1040     double sharpen)
1041 {
1042   Scale s = { 0 };
1043   Scale *scale = &s;
1044   int n_taps;
1045
1046   scale->dest = dest;
1047   scale->src = src;
1048
1049   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1050   n_taps = ROUND_UP_4 (n_taps);
1051   scale1d_calculate_taps_int32 (&scale->x_scale1d,
1052       src->width, dest->width, n_taps, a, sharpness, sharpen, S32_SHIFT1);
1053
1054   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1055   scale1d_calculate_taps_int32 (&scale->y_scale1d,
1056       src->height, dest->height, n_taps, a, sharpness, sharpen, S32_SHIFT2);
1057
1058   scale->dither = dither;
1059
1060   switch (scale->x_scale1d.n_taps) {
1061     case 4:
1062       scale->horiz_resample_func =
1063           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps4_shift0;
1064       break;
1065     case 8:
1066       scale->horiz_resample_func =
1067           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps8_shift0;
1068       break;
1069     case 12:
1070       scale->horiz_resample_func =
1071           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps12_shift0;
1072       break;
1073     case 16:
1074       scale->horiz_resample_func =
1075           (HorizResampleFunc) resample_horiz_int32_int32_u8_taps16_shift0;
1076       break;
1077     default:
1078       scale->horiz_resample_func =
1079           (HorizResampleFunc) resample_horiz_int32_int32_u8_generic;
1080       break;
1081   }
1082
1083   scale->tmpdata =
1084       g_malloc (sizeof (int32_t) * scale->dest->width * scale->src->height);
1085
1086   vs_scale_lanczos_Y_int32 (scale);
1087
1088   scale1d_cleanup (&scale->x_scale1d);
1089   scale1d_cleanup (&scale->y_scale1d);
1090   g_free (scale->tmpdata);
1091 }
1092
1093 static void
1094 vs_scale_lanczos_Y_double (Scale * scale)
1095 {
1096   int j;
1097   int yi;
1098   int tmp_yi;
1099
1100   tmp_yi = 0;
1101
1102   for (j = 0; j < scale->dest->height; j++) {
1103     guint8 *destline;
1104     double *taps;
1105
1106     destline = scale->dest->pixels + scale->dest->stride * j;
1107
1108     yi = scale->y_scale1d.offsets[j];
1109
1110     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1111       scale->horiz_resample_func (TMP_LINE_DOUBLE (tmp_yi),
1112           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1113           scale->x_scale1d.n_taps, 0, scale->dest->width);
1114       tmp_yi++;
1115     }
1116
1117     taps = (double *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1118     if (scale->dither) {
1119       resample_vert_dither_double_generic (destline,
1120           taps, TMP_LINE_DOUBLE (scale->y_scale1d.offsets[j]),
1121           sizeof (double) * scale->dest->width,
1122           scale->y_scale1d.n_taps, 0, scale->dest->width);
1123     } else {
1124       resample_vert_double_generic (destline,
1125           taps, TMP_LINE_DOUBLE (scale->y_scale1d.offsets[j]),
1126           sizeof (double) * scale->dest->width,
1127           scale->y_scale1d.n_taps, 0, scale->dest->width);
1128     }
1129   }
1130 }
1131
1132 void
1133 vs_image_scale_lanczos_Y_double (const VSImage * dest, const VSImage * src,
1134     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1135     double sharpen)
1136 {
1137   Scale s = { 0 };
1138   Scale *scale = &s;
1139   int n_taps;
1140
1141   scale->dest = dest;
1142   scale->src = src;
1143
1144   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1145   scale1d_calculate_taps (&scale->x_scale1d,
1146       src->width, dest->width, n_taps, a, sharpness, sharpen);
1147
1148   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1149   scale1d_calculate_taps (&scale->y_scale1d,
1150       src->height, dest->height, n_taps, a, sharpness, sharpen);
1151
1152   scale->dither = dither;
1153
1154   scale->horiz_resample_func =
1155       (HorizResampleFunc) resample_horiz_double_u8_generic;
1156
1157   scale->tmpdata =
1158       g_malloc (sizeof (double) * scale->dest->width * scale->src->height);
1159
1160   vs_scale_lanczos_Y_double (scale);
1161
1162   scale1d_cleanup (&scale->x_scale1d);
1163   scale1d_cleanup (&scale->y_scale1d);
1164   g_free (scale->tmpdata);
1165 }
1166
1167 static void
1168 vs_scale_lanczos_Y_float (Scale * scale)
1169 {
1170   int j;
1171   int yi;
1172   int tmp_yi;
1173
1174   tmp_yi = 0;
1175
1176   for (j = 0; j < scale->dest->height; j++) {
1177     guint8 *destline;
1178     float *taps;
1179
1180     destline = scale->dest->pixels + scale->dest->stride * j;
1181
1182     yi = scale->y_scale1d.offsets[j];
1183
1184     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1185       scale->horiz_resample_func (TMP_LINE_FLOAT (tmp_yi),
1186           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1187           scale->x_scale1d.n_taps, 0, scale->dest->width);
1188       tmp_yi++;
1189     }
1190
1191     taps = (float *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1192     if (scale->dither) {
1193       resample_vert_dither_float_generic (destline,
1194           taps, TMP_LINE_FLOAT (scale->y_scale1d.offsets[j]),
1195           sizeof (float) * scale->dest->width,
1196           scale->y_scale1d.n_taps, 0, scale->dest->width);
1197     } else {
1198       resample_vert_float_generic (destline,
1199           taps, TMP_LINE_FLOAT (scale->y_scale1d.offsets[j]),
1200           sizeof (float) * scale->dest->width,
1201           scale->y_scale1d.n_taps, 0, scale->dest->width);
1202     }
1203   }
1204 }
1205
1206 void
1207 vs_image_scale_lanczos_Y_float (const VSImage * dest, const VSImage * src,
1208     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1209     double sharpen)
1210 {
1211   Scale s = { 0 };
1212   Scale *scale = &s;
1213   int n_taps;
1214
1215   scale->dest = dest;
1216   scale->src = src;
1217
1218   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1219   scale1d_calculate_taps_float (&scale->x_scale1d,
1220       src->width, dest->width, n_taps, a, sharpness, sharpen);
1221
1222   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1223   scale1d_calculate_taps_float (&scale->y_scale1d,
1224       src->height, dest->height, n_taps, a, sharpness, sharpen);
1225
1226   scale->dither = dither;
1227
1228   scale->horiz_resample_func =
1229       (HorizResampleFunc) resample_horiz_float_u8_generic;
1230
1231   scale->tmpdata =
1232       g_malloc (sizeof (float) * scale->dest->width * scale->src->height);
1233
1234   vs_scale_lanczos_Y_float (scale);
1235
1236   scale1d_cleanup (&scale->x_scale1d);
1237   scale1d_cleanup (&scale->y_scale1d);
1238   g_free (scale->tmpdata);
1239 }
1240
1241
1242
1243
1244
1245 static void
1246 vs_scale_lanczos_AYUV_int16 (Scale * scale)
1247 {
1248   int j;
1249   int yi;
1250   int tmp_yi;
1251
1252   tmp_yi = 0;
1253
1254   for (j = 0; j < scale->dest->height; j++) {
1255     guint8 *destline;
1256     gint16 *taps;
1257
1258     destline = scale->dest->pixels + scale->dest->stride * j;
1259
1260     yi = scale->y_scale1d.offsets[j];
1261
1262     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1263       scale->horiz_resample_func (TMP_LINE_S16_AYUV (tmp_yi),
1264           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1265           scale->x_scale1d.n_taps, S16_MIDSHIFT, scale->dest->width);
1266       tmp_yi++;
1267     }
1268
1269     taps = (gint16 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1270     if (scale->dither) {
1271       resample_vert_dither_int16_generic (destline,
1272           taps, TMP_LINE_S16_AYUV (scale->y_scale1d.offsets[j]),
1273           sizeof (gint16) * 4 * scale->dest->width,
1274           scale->y_scale1d.n_taps, S16_POSTSHIFT, scale->dest->width * 4);
1275     } else {
1276       resample_vert_int16_generic (destline,
1277           taps, TMP_LINE_S16_AYUV (scale->y_scale1d.offsets[j]),
1278           sizeof (gint16) * 4 * scale->dest->width,
1279           scale->y_scale1d.n_taps, S16_POSTSHIFT, scale->dest->width * 4);
1280     }
1281   }
1282 }
1283
1284 void
1285 vs_image_scale_lanczos_AYUV_int16 (const VSImage * dest, const VSImage * src,
1286     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1287     double sharpen)
1288 {
1289   Scale s = { 0 };
1290   Scale *scale = &s;
1291   int n_taps;
1292
1293   scale->dest = dest;
1294   scale->src = src;
1295
1296   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1297   n_taps = ROUND_UP_4 (n_taps);
1298   scale1d_calculate_taps_int16 (&scale->x_scale1d,
1299       src->width, dest->width, n_taps, a, sharpness, sharpen, S16_SHIFT1);
1300
1301   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1302   scale1d_calculate_taps_int16 (&scale->y_scale1d,
1303       src->height, dest->height, n_taps, a, sharpness, sharpen, S16_SHIFT2);
1304
1305   scale->dither = dither;
1306
1307   switch (scale->x_scale1d.n_taps) {
1308     case 4:
1309       scale->horiz_resample_func =
1310           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps4_shift0;
1311       break;
1312     case 8:
1313       scale->horiz_resample_func =
1314           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps8_shift0;
1315       break;
1316     case 12:
1317       scale->horiz_resample_func =
1318           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps12_shift0;
1319       break;
1320     case 16:
1321       scale->horiz_resample_func =
1322           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_taps16_shift0;
1323       break;
1324     default:
1325       scale->horiz_resample_func =
1326           (HorizResampleFunc) resample_horiz_int16_int16_ayuv_generic;
1327       break;
1328   }
1329
1330   scale->tmpdata =
1331       g_malloc (sizeof (gint16) * scale->dest->width * scale->src->height * 4);
1332
1333   vs_scale_lanczos_AYUV_int16 (scale);
1334
1335   scale1d_cleanup (&scale->x_scale1d);
1336   scale1d_cleanup (&scale->y_scale1d);
1337   g_free (scale->tmpdata);
1338 }
1339
1340
1341 static void
1342 vs_scale_lanczos_AYUV_int32 (Scale * scale)
1343 {
1344   int j;
1345   int yi;
1346   int tmp_yi;
1347
1348   tmp_yi = 0;
1349
1350   for (j = 0; j < scale->dest->height; j++) {
1351     guint8 *destline;
1352     gint32 *taps;
1353
1354     destline = scale->dest->pixels + scale->dest->stride * j;
1355
1356     yi = scale->y_scale1d.offsets[j];
1357
1358     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1359       scale->horiz_resample_func (TMP_LINE_S32_AYUV (tmp_yi),
1360           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1361           scale->x_scale1d.n_taps, S32_MIDSHIFT, scale->dest->width);
1362       tmp_yi++;
1363     }
1364
1365     taps = (gint32 *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1366     if (scale->dither) {
1367       resample_vert_dither_int32_generic (destline,
1368           taps, TMP_LINE_S32_AYUV (scale->y_scale1d.offsets[j]),
1369           sizeof (gint32) * 4 * scale->dest->width, scale->y_scale1d.n_taps,
1370           S32_POSTSHIFT, scale->dest->width * 4);
1371     } else {
1372       resample_vert_int32_generic (destline,
1373           taps, TMP_LINE_S32_AYUV (scale->y_scale1d.offsets[j]),
1374           sizeof (gint32) * 4 * scale->dest->width, scale->y_scale1d.n_taps,
1375           S32_POSTSHIFT, scale->dest->width * 4);
1376     }
1377   }
1378 }
1379
1380 void
1381 vs_image_scale_lanczos_AYUV_int32 (const VSImage * dest, const VSImage * src,
1382     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1383     double sharpen)
1384 {
1385   Scale s = { 0 };
1386   Scale *scale = &s;
1387   int n_taps;
1388
1389   scale->dest = dest;
1390   scale->src = src;
1391
1392   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1393   n_taps = ROUND_UP_4 (n_taps);
1394   scale1d_calculate_taps_int32 (&scale->x_scale1d,
1395       src->width, dest->width, n_taps, a, sharpness, sharpen, S32_SHIFT1);
1396
1397   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1398   scale1d_calculate_taps_int32 (&scale->y_scale1d,
1399       src->height, dest->height, n_taps, a, sharpness, sharpen, S32_SHIFT2);
1400
1401   scale->dither = dither;
1402
1403   switch (scale->x_scale1d.n_taps) {
1404     case 4:
1405       scale->horiz_resample_func =
1406           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps4_shift0;
1407       break;
1408     case 8:
1409       scale->horiz_resample_func =
1410           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps8_shift0;
1411       break;
1412     case 12:
1413       scale->horiz_resample_func =
1414           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps12_shift0;
1415       break;
1416     case 16:
1417       scale->horiz_resample_func =
1418           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_taps16_shift0;
1419       break;
1420     default:
1421       scale->horiz_resample_func =
1422           (HorizResampleFunc) resample_horiz_int32_int32_ayuv_generic;
1423       break;
1424   }
1425
1426   scale->tmpdata =
1427       g_malloc (sizeof (int32_t) * scale->dest->width * scale->src->height * 4);
1428
1429   vs_scale_lanczos_AYUV_int32 (scale);
1430
1431   scale1d_cleanup (&scale->x_scale1d);
1432   scale1d_cleanup (&scale->y_scale1d);
1433   g_free (scale->tmpdata);
1434 }
1435
1436 static void
1437 vs_scale_lanczos_AYUV_double (Scale * scale)
1438 {
1439   int j;
1440   int yi;
1441   int tmp_yi;
1442
1443   tmp_yi = 0;
1444
1445   for (j = 0; j < scale->dest->height; j++) {
1446     guint8 *destline;
1447     double *taps;
1448
1449     destline = scale->dest->pixels + scale->dest->stride * j;
1450
1451     yi = scale->y_scale1d.offsets[j];
1452
1453     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1454       scale->horiz_resample_func (TMP_LINE_DOUBLE_AYUV (tmp_yi),
1455           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1456           scale->x_scale1d.n_taps, 0, scale->dest->width);
1457       tmp_yi++;
1458     }
1459
1460     taps = (double *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1461     if (scale->dither) {
1462       resample_vert_dither_double_generic (destline,
1463           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1464           sizeof (double) * 4 * scale->dest->width,
1465           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1466     } else {
1467       resample_vert_double_generic (destline,
1468           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1469           sizeof (double) * 4 * scale->dest->width,
1470           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1471     }
1472   }
1473 }
1474
1475 void
1476 vs_image_scale_lanczos_AYUV_double (const VSImage * dest, const VSImage * src,
1477     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1478     double sharpen)
1479 {
1480   Scale s = { 0 };
1481   Scale *scale = &s;
1482   int n_taps;
1483
1484   scale->dest = dest;
1485   scale->src = src;
1486
1487   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1488   scale1d_calculate_taps (&scale->x_scale1d,
1489       src->width, dest->width, n_taps, a, sharpness, sharpen);
1490
1491   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1492   scale1d_calculate_taps (&scale->y_scale1d,
1493       src->height, dest->height, n_taps, a, sharpness, sharpen);
1494
1495   scale->dither = dither;
1496
1497   scale->horiz_resample_func =
1498       (HorizResampleFunc) resample_horiz_double_ayuv_generic;
1499
1500   scale->tmpdata =
1501       g_malloc (sizeof (double) * scale->dest->width * scale->src->height * 4);
1502
1503   vs_scale_lanczos_AYUV_double (scale);
1504
1505   scale1d_cleanup (&scale->x_scale1d);
1506   scale1d_cleanup (&scale->y_scale1d);
1507   g_free (scale->tmpdata);
1508 }
1509
1510 static void
1511 vs_scale_lanczos_AYUV_float (Scale * scale)
1512 {
1513   int j;
1514   int yi;
1515   int tmp_yi;
1516
1517   tmp_yi = 0;
1518
1519   for (j = 0; j < scale->dest->height; j++) {
1520     guint8 *destline;
1521     float *taps;
1522
1523     destline = scale->dest->pixels + scale->dest->stride * j;
1524
1525     yi = scale->y_scale1d.offsets[j];
1526
1527     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1528       scale->horiz_resample_func (TMP_LINE_FLOAT_AYUV (tmp_yi),
1529           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1530           scale->x_scale1d.n_taps, 0, scale->dest->width);
1531       tmp_yi++;
1532     }
1533
1534     taps = (float *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1535     if (scale->dither) {
1536       resample_vert_dither_float_generic (destline,
1537           taps, TMP_LINE_FLOAT_AYUV (scale->y_scale1d.offsets[j]),
1538           sizeof (float) * 4 * scale->dest->width, scale->y_scale1d.n_taps, 0,
1539           scale->dest->width * 4);
1540     } else {
1541       resample_vert_float_generic (destline,
1542           taps, TMP_LINE_FLOAT_AYUV (scale->y_scale1d.offsets[j]),
1543           sizeof (float) * 4 * scale->dest->width, scale->y_scale1d.n_taps, 0,
1544           scale->dest->width * 4);
1545     }
1546   }
1547 }
1548
1549 void
1550 vs_image_scale_lanczos_AYUV_float (const VSImage * dest, const VSImage * src,
1551     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1552     double sharpen)
1553 {
1554   Scale s = { 0 };
1555   Scale *scale = &s;
1556   int n_taps;
1557
1558   scale->dest = dest;
1559   scale->src = src;
1560
1561   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1562   scale1d_calculate_taps_float (&scale->x_scale1d,
1563       src->width, dest->width, n_taps, a, sharpness, sharpen);
1564
1565   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1566   scale1d_calculate_taps_float (&scale->y_scale1d,
1567       src->height, dest->height, n_taps, a, sharpness, sharpen);
1568
1569   scale->dither = dither;
1570
1571   scale->horiz_resample_func =
1572       (HorizResampleFunc) resample_horiz_float_ayuv_generic;
1573
1574   scale->tmpdata =
1575       g_malloc (sizeof (float) * scale->dest->width * scale->src->height * 4);
1576
1577   vs_scale_lanczos_AYUV_float (scale);
1578
1579   scale1d_cleanup (&scale->x_scale1d);
1580   scale1d_cleanup (&scale->y_scale1d);
1581   g_free (scale->tmpdata);
1582 }
1583
1584 static void
1585 vs_scale_lanczos_AYUV64_double (Scale * scale)
1586 {
1587   int j;
1588   int yi;
1589   int tmp_yi;
1590
1591   tmp_yi = 0;
1592
1593   for (j = 0; j < scale->dest->height; j++) {
1594     guint16 *destline;
1595     double *taps;
1596
1597     destline = (guint16 *) (scale->dest->pixels + scale->dest->stride * j);
1598
1599     yi = scale->y_scale1d.offsets[j];
1600
1601     while (tmp_yi < yi + scale->y_scale1d.n_taps) {
1602       scale->horiz_resample_func (TMP_LINE_DOUBLE_AYUV (tmp_yi),
1603           scale->x_scale1d.offsets, scale->x_scale1d.taps, SRC_LINE (tmp_yi),
1604           scale->x_scale1d.n_taps, 0, scale->dest->width);
1605       tmp_yi++;
1606     }
1607
1608     taps = (double *) scale->y_scale1d.taps + j * scale->y_scale1d.n_taps;
1609     if (scale->dither) {
1610       resample_vert_dither_double_generic_u16 (destline,
1611           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1612           sizeof (double) * 4 * scale->dest->width,
1613           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1614     } else {
1615       resample_vert_double_generic_u16 (destline,
1616           taps, TMP_LINE_DOUBLE_AYUV (scale->y_scale1d.offsets[j]),
1617           sizeof (double) * 4 * scale->dest->width,
1618           scale->y_scale1d.n_taps, 0, scale->dest->width * 4);
1619     }
1620   }
1621 }
1622
1623 void
1624 vs_image_scale_lanczos_AYUV64_double (const VSImage * dest, const VSImage * src,
1625     uint8_t * tmpbuf, double sharpness, gboolean dither, double a,
1626     double sharpen)
1627 {
1628   Scale s = { 0 };
1629   Scale *scale = &s;
1630   int n_taps;
1631
1632   scale->dest = dest;
1633   scale->src = src;
1634
1635   n_taps = scale1d_get_n_taps (src->width, dest->width, a, sharpness);
1636   scale1d_calculate_taps (&scale->x_scale1d,
1637       src->width, dest->width, n_taps, a, sharpness, sharpen);
1638
1639   n_taps = scale1d_get_n_taps (src->height, dest->height, a, sharpness);
1640   scale1d_calculate_taps (&scale->y_scale1d,
1641       src->height, dest->height, n_taps, a, sharpness, sharpen);
1642
1643   scale->dither = dither;
1644
1645   scale->horiz_resample_func =
1646       (HorizResampleFunc) resample_horiz_double_ayuv_generic_s16;
1647
1648   scale->tmpdata =
1649       g_malloc (sizeof (double) * scale->dest->width * scale->src->height * 4);
1650
1651   vs_scale_lanczos_AYUV64_double (scale);
1652
1653   scale1d_cleanup (&scale->x_scale1d);
1654   scale1d_cleanup (&scale->y_scale1d);
1655   g_free (scale->tmpdata);
1656 }