src/third_party/ffmpeg/libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "dct.h"
  43 #include "simple_idct.h"
  44 #include "aandcttab.h"
  45 #include "faandct.h"
  46 #include "faanidct.h"
  47 #include "x86/idct_xvid.h"
  48 #include "dctref.h"
  49
  50 // BFIN
  51 void ff_bfin_idct(int16_t *block);
  52 void ff_bfin_fdct(int16_t *block);
  53
  54 // ALTIVEC
  55 void ff_fdct_altivec(int16_t *block);
  56
  57 // ARM
  58 void ff_j_rev_dct_arm(int16_t *data);
  59 void ff_simple_idct_arm(int16_t *data);
  60 void ff_simple_idct_armv5te(int16_t *data);
  61 void ff_simple_idct_armv6(int16_t *data);
  62 void ff_simple_idct_neon(int16_t *data);
  63
  64 struct algo {
  65     const char *name;
  66     void (*func)(int16_t *block);
  67     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  68                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  69     int mm_support;
  70     int nonspec;
  71 };
  72
  73 static int cpu_flags;
  74
  75 static const struct algo fdct_tab[] = {
  76     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  77     { "FAAN",           ff_faandct,            NO_PERM    },
  78     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
  79     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  80
  81 #if HAVE_MMX_INLINE
  82     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  83 #endif
  84 #if HAVE_MMXEXT_INLINE
  85     { "MMXEXT",         ff_fdct_mmxext,        NO_PERM,   AV_CPU_FLAG_MMXEXT  },
  86 #endif
  87 #if HAVE_SSE2_INLINE
  88     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  89 #endif
  90
  91 #if HAVE_ALTIVEC
  92     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
  93 #endif
  94
  95 #if ARCH_BFIN
  96     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
  97 #endif
  98
  99     { 0 }
 100 };
 101
 102 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
 103 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 104                                 int16_t *block, int16_t *qmat);
 105
 106 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
 107     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 108     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 109     int i;
 110
 111     for(i=0; i<64; i++){
 112         qmat[i]=4;
 113         tmp[i]= dst[i];
 114     }
 115     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 116 }
 117 #endif
 118
 119 static const struct algo idct_tab[] = {
 120     { "FAANI",          ff_faanidct,           NO_PERM  },
 121     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 122     { "INT",            ff_j_rev_dct,          MMX_PERM },
 123     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 124
 125 #if HAVE_MMX_INLINE
 126     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 127     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 128 #endif
 129 #if HAVE_MMXEXT_INLINE
 130     { "XVID-MMXEXT",    ff_idct_xvid_mmxext,   NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
 131 #endif
 132 #if HAVE_SSE2_INLINE
 133     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 134 #if ARCH_X86_64 && HAVE_YASM
 135     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 136 #endif
 137 #endif
 138
 139 #if ARCH_BFIN
 140     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 141 #endif
 142
 143 #if ARCH_ARM
 144     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 145     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 146 #endif
 147 #if HAVE_ARMV5TE
 148     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM,   AV_CPU_FLAG_ARMV5TE },
 149 #endif
 150 #if HAVE_ARMV6
 151     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM,  AV_CPU_FLAG_ARMV6   },
 152 #endif
 153 #if HAVE_NEON
 154     { "SIMPLE-NEON",    ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
 155 #endif
 156
 157     { 0 }
 158 };
 159
 160 #define AANSCALE_BITS 12
 161
 162 #define NB_ITS 20000
 163 #define NB_ITS_SPEED 50000
 164
 165 static short idct_mmx_perm[64];
 166
 167 static short idct_simple_mmx_perm[64] = {
 168     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 169     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 170     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 171     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 172     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 173     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 174     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 175     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 176 };
 177
 178 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 179
 180 static void idct_mmx_init(void)
 181 {
 182     int i;
 183
 184     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 185     for (i = 0; i < 64; i++) {
 186         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 187     }
 188 }
 189
 190 DECLARE_ALIGNED(16, static int16_t, block)[64];
 191 DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 192
 193 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 194 {
 195     int i, j;
 196
 197     memset(block, 0, 64 * sizeof(*block));
 198
 199     switch (test) {
 200     case 0:
 201         for (i = 0; i < 64; i++)
 202             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 203         if (is_idct) {
 204             ff_ref_fdct(block);
 205             for (i = 0; i < 64; i++)
 206                 block[i] >>= 3;
 207         }
 208         break;
 209     case 1:
 210         j = av_lfg_get(prng) % 10 + 1;
 211         for (i = 0; i < j; i++) {
 212             int idx = av_lfg_get(prng) % 64;
 213             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
 214         }
 215         break;
 216     case 2:
 217         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 218         block[63] = (block[0] & 1) ^ 1;
 219         break;
 220     }
 221 }
 222
 223 static void permute(int16_t dst[64], const int16_t src[64], int perm)
 224 {
 225     int i;
 226
 227     if (perm == MMX_PERM) {
 228         for (i = 0; i < 64; i++)
 229             dst[idct_mmx_perm[i]] = src[i];
 230     } else if (perm == MMX_SIMPLE_PERM) {
 231         for (i = 0; i < 64; i++)
 232             dst[idct_simple_mmx_perm[i]] = src[i];
 233     } else if (perm == SSE2_PERM) {
 234         for (i = 0; i < 64; i++)
 235             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 236     } else if (perm == PARTTRANS_PERM) {
 237         for (i = 0; i < 64; i++)
 238             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 239     } else if (perm == TRANSPOSE_PERM) {
 240         for (i = 0; i < 64; i++)
 241             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 242     } else {
 243         for (i = 0; i < 64; i++)
 244             dst[i] = src[i];
 245     }
 246 }
 247
 248 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 249 {
 250     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 251     int it, i, scale;
 252     int err_inf, v;
 253     int64_t err2, ti, ti1, it1, err_sum = 0;
 254     int64_t sysErr[64], sysErrMax = 0;
 255     int maxout = 0;
 256     int blockSumErrMax = 0, blockSumErr;
 257     AVLFG prng;
 258     const int vals=1<<bits;
 259     double omse, ome;
 260     int spec_err;
 261
 262     av_lfg_init(&prng, 1);
 263
 264     err_inf = 0;
 265     err2 = 0;
 266     for (i = 0; i < 64; i++)
 267         sysErr[i] = 0;
 268     for (it = 0; it < NB_ITS; it++) {
 269         init_block(block1, test, is_idct, &prng, vals);
 270         permute(block, block1, dct->format);
 271
 272         dct->func(block);
 273         emms_c();
 274
 275         if (dct->format == SCALE_PERM) {
 276             for (i = 0; i < 64; i++) {
 277                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 278                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 279             }
 280         }
 281
 282         ref(block1);
 283
 284         blockSumErr = 0;
 285         for (i = 0; i < 64; i++) {
 286             int err = block[i] - block1[i];
 287             err_sum += err;
 288             v = abs(err);
 289             if (v > err_inf)
 290                 err_inf = v;
 291             err2 += v * v;
 292             sysErr[i] += block[i] - block1[i];
 293             blockSumErr += v;
 294             if (abs(block[i]) > maxout)
 295                 maxout = abs(block[i]);
 296         }
 297         if (blockSumErrMax < blockSumErr)
 298             blockSumErrMax = blockSumErr;
 299     }
 300     for (i = 0; i < 64; i++)
 301         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 302
 303     for (i = 0; i < 64; i++) {
 304         if (i % 8 == 0)
 305             printf("\n");
 306         printf("%7d ", (int) sysErr[i]);
 307     }
 308     printf("\n");
 309
 310     omse = (double) err2 / NB_ITS / 64;
 311     ome  = (double) err_sum / NB_ITS / 64;
 312
 313     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 314
 315     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 316            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 317            omse, ome, (double) sysErrMax / NB_ITS,
 318            maxout, blockSumErrMax);
 319
 320     if (spec_err && !dct->nonspec)
 321         return 1;
 322
 323     if (!speed)
 324         return 0;
 325
 326     /* speed test */
 327
 328     init_block(block, test, is_idct, &prng, vals);
 329     permute(block1, block, dct->format);
 330
 331     ti = av_gettime();
 332     it1 = 0;
 333     do {
 334         for (it = 0; it < NB_ITS_SPEED; it++) {
 335             memcpy(block, block1, sizeof(block));
 336             dct->func(block);
 337         }
 338         emms_c();
 339         it1 += NB_ITS_SPEED;
 340         ti1 = av_gettime() - ti;
 341     } while (ti1 < 1000000);
 342
 343     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 344            (double) it1 * 1000.0 / (double) ti1);
 345
 346     return 0;
 347 }
 348
 349 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 350 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 351
 352 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 353 {
 354     static int init;
 355     static double c8[8][8];
 356     static double c4[4][4];
 357     double block1[64], block2[64], block3[64];
 358     double s, sum, v;
 359     int i, j, k;
 360
 361     if (!init) {
 362         init = 1;
 363
 364         for (i = 0; i < 8; i++) {
 365             sum = 0;
 366             for (j = 0; j < 8; j++) {
 367                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 368                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 369                 sum += c8[i][j] * c8[i][j];
 370             }
 371         }
 372
 373         for (i = 0; i < 4; i++) {
 374             sum = 0;
 375             for (j = 0; j < 4; j++) {
 376                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 377                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 378                 sum += c4[i][j] * c4[i][j];
 379             }
 380         }
 381     }
 382
 383     /* butterfly */
 384     s = 0.5 * sqrt(2.0);
 385     for (i = 0; i < 4; i++) {
 386         for (j = 0; j < 8; j++) {
 387             block1[8 * (2 * i) + j] =
 388                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 389             block1[8 * (2 * i + 1) + j] =
 390                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 391         }
 392     }
 393
 394     /* idct8 on lines */
 395     for (i = 0; i < 8; i++) {
 396         for (j = 0; j < 8; j++) {
 397             sum = 0;
 398             for (k = 0; k < 8; k++)
 399                 sum += c8[k][j] * block1[8 * i + k];
 400             block2[8 * i + j] = sum;
 401         }
 402     }
 403
 404     /* idct4 */
 405     for (i = 0; i < 8; i++) {
 406         for (j = 0; j < 4; j++) {
 407             /* top */
 408             sum = 0;
 409             for (k = 0; k < 4; k++)
 410                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 411             block3[8 * (2 * j) + i] = sum;
 412
 413             /* bottom */
 414             sum = 0;
 415             for (k = 0; k < 4; k++)
 416                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 417             block3[8 * (2 * j + 1) + i] = sum;
 418         }
 419     }
 420
 421     /* clamp and store the result */
 422     for (i = 0; i < 8; i++) {
 423         for (j = 0; j < 8; j++) {
 424             v = block3[8 * i + j];
 425             if      (v < 0)   v = 0;
 426             else if (v > 255) v = 255;
 427             dest[i * linesize + j] = (int) rint(v);
 428         }
 429     }
 430 }
 431
 432 static void idct248_error(const char *name,
 433                           void (*idct248_put)(uint8_t *dest, int line_size,
 434                                               int16_t *block),
 435                           int speed)
 436 {
 437     int it, i, it1, ti, ti1, err_max, v;
 438     AVLFG prng;
 439
 440     av_lfg_init(&prng, 1);
 441
 442     /* just one test to see if code is correct (precision is less
 443        important here) */
 444     err_max = 0;
 445     for (it = 0; it < NB_ITS; it++) {
 446         /* XXX: use forward transform to generate values */
 447         for (i = 0; i < 64; i++)
 448             block1[i] = av_lfg_get(&prng) % 256 - 128;
 449         block1[0] += 1024;
 450
 451         for (i = 0; i < 64; i++)
 452             block[i] = block1[i];
 453         idct248_ref(img_dest1, 8, block);
 454
 455         for (i = 0; i < 64; i++)
 456             block[i] = block1[i];
 457         idct248_put(img_dest, 8, block);
 458
 459         for (i = 0; i < 64; i++) {
 460             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 461             if (v == 255)
 462                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 463             if (v > err_max)
 464                 err_max = v;
 465         }
 466 #if 0
 467         printf("ref=\n");
 468         for(i=0;i<8;i++) {
 469             int j;
 470             for(j=0;j<8;j++) {
 471                 printf(" %3d", img_dest1[i*8+j]);
 472             }
 473             printf("\n");
 474         }
 475
 476         printf("out=\n");
 477         for(i=0;i<8;i++) {
 478             int j;
 479             for(j=0;j<8;j++) {
 480                 printf(" %3d", img_dest[i*8+j]);
 481             }
 482             printf("\n");
 483         }
 484 #endif
 485     }
 486     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 487
 488     if (!speed)
 489         return;
 490
 491     ti = av_gettime();
 492     it1 = 0;
 493     do {
 494         for (it = 0; it < NB_ITS_SPEED; it++) {
 495             for (i = 0; i < 64; i++)
 496                 block[i] = block1[i];
 497             idct248_put(img_dest, 8, block);
 498         }
 499         emms_c();
 500         it1 += NB_ITS_SPEED;
 501         ti1 = av_gettime() - ti;
 502     } while (ti1 < 1000000);
 503
 504     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 505            (double) it1 * 1000.0 / (double) ti1);
 506 }
 507
 508 static void help(void)
 509 {
 510     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 511            "test-number 0 -> test with random matrixes\n"
 512            "            1 -> test with random sparse matrixes\n"
 513            "            2 -> do 3. test from mpeg4 std\n"
 514            "bits        Number of time domain bits to use, 8 is default\n"
 515            "-i          test IDCT implementations\n"
 516            "-4          test IDCT248 implementations\n"
 517            "-t          speed test\n");
 518 }
 519
 520 #if !HAVE_GETOPT
 521 #include "compat/getopt.c"
 522 #endif
 523
 524 int main(int argc, char **argv)
 525 {
 526     int test_idct = 0, test_248_dct = 0;
 527     int c, i;
 528     int test = 1;
 529     int speed = 0;
 530     int err = 0;
 531     int bits=8;
 532
 533     cpu_flags = av_get_cpu_flags();
 534
 535     ff_ref_dct_init();
 536     idct_mmx_init();
 537
 538     for (;;) {
 539         c = getopt(argc, argv, "ih4t");
 540         if (c == -1)
 541             break;
 542         switch (c) {
 543         case 'i':
 544             test_idct = 1;
 545             break;
 546         case '4':
 547             test_248_dct = 1;
 548             break;
 549         case 't':
 550             speed = 1;
 551             break;
 552         default:
 553         case 'h':
 554             help();
 555             return 0;
 556         }
 557     }
 558
 559     if (optind < argc)
 560         test = atoi(argv[optind]);
 561     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 562
 563     printf("ffmpeg DCT/IDCT test\n");
 564
 565     if (test_248_dct) {
 566         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 567     } else {
 568         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 569         for (i = 0; algos[i].name; i++)
 570             if (!(~cpu_flags & algos[i].mm_support)) {
 571                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 572             }
 573     }
 574
 575     if (err)
 576         printf("Error: %d.\n", err);
 577
 578     return !!err;
 579 }