Merge Upstream version 1.5.3 from branch 'upstream' into tizen

refs: 14/184014/1 accepted/tizen/5.0/unified/20181102.030522 accepted/tizen/unified/20180725.060406 submit/tizen/20180724.072604 submit/tizen_5.0/20181101.000007

author    jiyong.min <jiyong.min@samsung.com>
          Fri, 13 Jul 2018 04:55:02 +0000 (13:55 +0900)
committer jiyong.min <jiyong.min@samsung.com>
          Fri, 13 Jul 2018 04:56:26 +0000 (13:56 +0900)
Change-Id: I68097b2480baa04758cec3d8078082ed9b98b2f7

16 files changed:
Makefile.am
bmp.c
configure.ac
jconfig.h.in
jdcoefct.c
jdmerge.c
jpeglib.h
jquant2.c
md5/md5hl.c
packaging/libjpeg-turbo.spec
rdppm.c
simd/jsimd.h
simd/jsimd_arm.c
simd/jsimd_arm_neon.S
tjbench.c
tjunittest.c

diff --cc Makefile.am
@@@ -10,6 -10,11 +10,11 @@@ endi
  
  nodist_include_HEADERS = jconfig.h
  
 -pkgconfig_DATA = pkgscripts/libjpeg.pc
 -if WITH_TURBOJPEG
 -pkgconfig_DATA += pkgscripts/libturbojpeg.pc
 -endif
+ pkgconfigdir = $(libdir)/pkgconfig
++pkgconfig_DATA = pkgscripts/turbojpeg.pc
++#if WITH_TURBOJPEG
++#pkgconfig_DATA += pkgscripts/libturbojpeg.pc
++#endif
  
  HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
        jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h \
diff --cc bmp.c
Simple merge
diff --cc configure.ac
@@@ -576,6 -600,8 +605,9 @@@ AC_CONFIG_FILES([pkgscripts/makecygwinp
  AC_CONFIG_FILES([pkgscripts/makedpkg.tmpl:release/makedpkg.in])
  AC_CONFIG_FILES([pkgscripts/makemacpkg.tmpl:release/makemacpkg.in])
  AC_CONFIG_FILES([pkgscripts/uninstall.tmpl:release/uninstall.in])
 -AC_CONFIG_FILES([pkgscripts/libjpeg.pc:release/libjpeg.pc.in])
 -AC_CONFIG_FILES([pkgscripts/libturbojpeg.pc:release/libturbojpeg.pc.in])
++AC_CONFIG_FILES([pkgscripts/turbojpeg.pc:pkgconfig/turbojpeg.pc.in])
++# AC_CONFIG_FILES([pkgscripts/libjpeg.pc:release/libjpeg.pc.in])
++# AC_CONFIG_FILES([pkgscripts/libturbojpeg.pc:release/libturbojpeg.pc.in])
  if test "x$with_turbojpeg" != "xno"; then
    AC_CONFIG_FILES([tjbenchtest])
  fi
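
For reference, the build now generates and installs pkgscripts/turbojpeg.pc (from pkgconfig/turbojpeg.pc.in) instead of the upstream libjpeg.pc/libturbojpeg.pc pair. A minimal consumer compiled with the flags reported by "pkg-config --cflags --libs turbojpeg" might look like the sketch below; it is not part of this patch and only uses the standard public TurboJPEG API.

    /* Minimal sketch of a consumer built against the new turbojpeg.pc
     * (not part of this patch; standard TurboJPEG API only). */
    #include <stdio.h>
    #include <stdlib.h>
    #include <turbojpeg.h>

    int decode_to_rgb(unsigned char *jpegBuf, unsigned long jpegSize)
    {
      tjhandle tj = tjInitDecompress();
      int width, height, subsamp, colorspace;
      unsigned char *rgb = NULL;

      if (!tj)
        return -1;
      if (tjDecompressHeader3(tj, jpegBuf, jpegSize, &width, &height,
                              &subsamp, &colorspace) < 0)
        goto bailout;
      rgb = malloc((size_t)width * height * tjPixelSize[TJPF_RGB]);
      if (!rgb)
        goto bailout;
      if (tjDecompress2(tj, jpegBuf, jpegSize, rgb, width, 0 /* pitch */,
                        height, TJPF_RGB, TJFLAG_FASTDCT) < 0)
        goto bailout;
      printf("decoded %dx%d JPEG\n", width, height);
      free(rgb);
      tjDestroy(tj);
      return 0;
    bailout:
      fprintf(stderr, "TurboJPEG error: %s\n", tjGetErrorStr());
      free(rgb);
      tjDestroy(tj);
      return -1;
    }
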
diff --cc jconfig.h.in
@@@ -68,8 -71,3 +71,5 @@@
  
  /* Define to `unsigned int' if <sys/types.h> does not define. */
  #undef size_t
- /* The size of `size_t', as computed by sizeof. */
- #undef SIZEOF_SIZE_T
 +
 +#define COLOR_PICKER_ENABLE 0
diff --cc jdcoefct.c
@@@ -219,44 -109,46 +155,52 @@@ decompress_onepass (j_decompress_ptr ci
          coef->MCU_ctr = MCU_col_num;
          return JPEG_SUSPENDED;
        }
-       /* Determine where data should go in output_buf and do the IDCT thing.
-        * We skip dummy blocks at the right and bottom edges (but blkn gets
-        * incremented past them!).  Note the inner loop relies on having
-        * allocated the MCU_buffer[] blocks sequentially.
 +#if _USE_PRODUCT_TV
 +      /* region decoding. this limits decode to the set of blocks +- 1 outside
 +       * bounding blocks around the desired region to decode */
 +      if (skip)
 +        continue;
 +#endif
+       /* Only perform the IDCT on blocks that are contained within the desired
+        * cropping region.
         */
-       blkn = 0;                 /* index of current DCT block within MCU */
-       for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
-         compptr = cinfo->cur_comp_info[ci];
-         /* Don't bother to IDCT an uninteresting component. */
-         if (! compptr->component_needed) {
-           blkn += compptr->MCU_blocks;
-           continue;
-         }
-         inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
-         useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
-                                                     : compptr->last_col_width;
-         output_ptr = output_buf[compptr->component_index] +
-           yoffset * compptr->_DCT_scaled_size;
-         start_col = MCU_col_num * compptr->MCU_sample_width;
-         for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
-           if (cinfo->input_iMCU_row < last_iMCU_row ||
-               yoffset+yindex < compptr->last_row_height) {
-             output_col = start_col;
-             for (xindex = 0; xindex < useful_width; xindex++) {
-               (*inverse_DCT) (cinfo, compptr,
-                               (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-                               output_ptr, output_col);
-               output_col += compptr->_DCT_scaled_size;
+       if (MCU_col_num >= cinfo->master->first_iMCU_col &&
+           MCU_col_num <= cinfo->master->last_iMCU_col) {
+         /* Determine where data should go in output_buf and do the IDCT thing.
+          * We skip dummy blocks at the right and bottom edges (but blkn gets
+          * incremented past them!).  Note the inner loop relies on having
+          * allocated the MCU_buffer[] blocks sequentially.
+          */
+         blkn = 0;                 /* index of current DCT block within MCU */
+         for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+           compptr = cinfo->cur_comp_info[ci];
+           /* Don't bother to IDCT an uninteresting component. */
+           if (! compptr->component_needed) {
+             blkn += compptr->MCU_blocks;
+             continue;
+           }
+           inverse_DCT = cinfo->idct->inverse_DCT[compptr->component_index];
+           useful_width = (MCU_col_num < last_MCU_col) ? compptr->MCU_width
+                                                       : compptr->last_col_width;
+           output_ptr = output_buf[compptr->component_index] +
+             yoffset * compptr->_DCT_scaled_size;
+           start_col = (MCU_col_num - cinfo->master->first_iMCU_col) *
+               compptr->MCU_sample_width;
+           for (yindex = 0; yindex < compptr->MCU_height; yindex++) {
+             if (cinfo->input_iMCU_row < last_iMCU_row ||
+                 yoffset+yindex < compptr->last_row_height) {
+               output_col = start_col;
+               for (xindex = 0; xindex < useful_width; xindex++) {
+                 (*inverse_DCT) (cinfo, compptr,
+                                 (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+                                 output_ptr, output_col);
+                 output_col += compptr->_DCT_scaled_size;
+               }
              }
+             blkn += compptr->MCU_width;
+             output_ptr += compptr->_DCT_scaled_size;
            }
-           blkn += compptr->MCU_width;
-           output_ptr += compptr->_DCT_scaled_size;
          }
        }
      }
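
The first_iMCU_col/last_iMCU_col bounds used above come from upstream 1.5's partial-image decompression support (jpeg_crop_scanline()); the TV-only "skip" path is a separate region-decode mechanism. For context, a minimal sketch of how a caller drives the upstream API (not part of this patch):

    #include <stdlib.h>
    #include "jpeglib.h"

    /* Decode only a horizontal band [crop_x, crop_x + crop_w) of each row.
     * jpeg_crop_scanline() may widen the band to iMCU boundaries, which is
     * what sets cinfo->master->first_iMCU_col / last_iMCU_col; the adjusted
     * xoffset/width must be used to size the output buffer. */
    static void decode_band(j_decompress_ptr cinfo, JDIMENSION crop_x,
                            JDIMENSION crop_w)
    {
      JDIMENSION xoffset = crop_x, width = crop_w;
      JSAMPROW row;

      jpeg_start_decompress(cinfo);
      jpeg_crop_scanline(cinfo, &xoffset, &width);   /* adjusted in place */

      row = malloc((size_t)width * cinfo->output_components);
      while (cinfo->output_scanline < cinfo->output_height)
        jpeg_read_scanlines(cinfo, &row, 1);

      jpeg_finish_decompress(cinfo);
      free(row);
    }
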
diff --cc jdmerge.c
Simple merge
diff --cc jpeglib.h
+++ b/jpeglib.h
@@@ -284,22 -287,10 +286,22 @@@ struct jpeg_common_struct 
     */
  };
  
- typedef struct jpeg_common_struct * j_common_ptr;
- typedef struct jpeg_compress_struct * j_compress_ptr;
- typedef struct jpeg_decompress_struct * j_decompress_ptr;
+ typedef struct jpeg_common_struct *j_common_ptr;
+ typedef struct jpeg_compress_struct *j_compress_ptr;
+ typedef struct jpeg_decompress_struct *j_decompress_ptr;
  
 +typedef struct _Pick_Color_
 +{
 +    unsigned int sumR;
 +    unsigned int sumG;
 +    unsigned int sumB;
 +    int enablePickColor;
 +    int perc;
 +    int x1;
 +    int y1;
 +    int x2;
 +    int y2;
 +}PickColor;
  
  /* Master record for a compression instance */
  
@@@ -706,21 -693,17 +708,21 @@@ struct jpeg_decompress_struct 
    /*
     * Links to decompression subobjects (methods, private variables of modules)
     */
-   struct jpeg_decomp_master * master;
-   struct jpeg_d_main_controller * main;
-   struct jpeg_d_coef_controller * coef;
-   struct jpeg_d_post_controller * post;
-   struct jpeg_input_controller * inputctl;
-   struct jpeg_marker_reader * marker;
-   struct jpeg_entropy_decoder * entropy;
-   struct jpeg_inverse_dct * idct;
-   struct jpeg_upsampler * upsample;
-   struct jpeg_color_deconverter * cconvert;
-   struct jpeg_color_quantizer * cquantize;
+   struct jpeg_decomp_master *master;
+   struct jpeg_d_main_controller *main;
+   struct jpeg_d_coef_controller *coef;
+   struct jpeg_d_post_controller *post;
+   struct jpeg_input_controller *inputctl;
+   struct jpeg_marker_reader *marker;
+   struct jpeg_entropy_decoder *entropy;
+   struct jpeg_inverse_dct *idct;
+   struct jpeg_upsampler *upsample;
+   struct jpeg_color_deconverter *cconvert;
+   struct jpeg_color_quantizer *cquantize;
 +
 +#if COLOR_PICKER_ENABLE == 1
 +  struct _Pick_Color_ *pick_color_data;
 +#endif
  };
  
  
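
The PickColor block and the pick_color_data hook are TV-only additions (compiled when COLOR_PICKER_ENABLE is 1); the NEON color converter in simd/jsimd_arm.c accumulates per-channel sums into it while rows are decoded. A hypothetical sketch of how a caller might use it follows; the field names are from this patch, but the setup sequence and the normalization of the sums are assumptions.

    #include "jpeglib.h"

    #if COLOR_PICKER_ENABLE == 1
    /* Hypothetical driver for the color picker: attach a PickColor so the
     * NEON YCC->RGB converter accumulates sums over the x1..x2 span of each
     * row, then average them.  How many rows contribute (rows_seen) and the
     * normalization are assumptions, not defined by this patch. */
    static void average_region_color(j_decompress_ptr cinfo, PickColor *pick,
                                     unsigned int rows_seen, unsigned int avg[3])
    {
      unsigned int npixels;

      pick->enablePickColor = 1;
      pick->perc = 0;                 /* <= 0 selects the x1..x2 window */
      cinfo->pick_color_data = pick;  /* read by jsimd_ycc_rgb_convert() */

      /* ...the usual jpeg_read_scanlines() loop runs here, filling
       * pick->sumR / sumG / sumB... */

      npixels = (unsigned int)(pick->x2 - pick->x1 + 1) * rows_seen;
      if (npixels != 0) {
        avg[0] = pick->sumR / npixels;
        avg[1] = pick->sumG / npixels;
        avg[2] = pick->sumB / npixels;
      }
    }
    #endif
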
diff --cc jquant2.c
Simple merge
diff --cc md5/md5hl.c
@@@ -55,13 -68,15 +68,17 @@@ MD5FileChunk(const char *filename, cha
        off_t n;
  
        MD5Init(&ctx);
+ #if _WIN32
+       f = _open(filename, O_RDONLY|O_BINARY);
+ #else
        f = open(filename, O_RDONLY);
+ #endif
        if (f < 0)
                return 0;
 -      if (fstat(f, &stbuf) < 0)
 -              return 0;
 +      if (fstat(f, &stbuf) < 0){
 +        close(f);
 +        return 0;
 +    }
        if (ofs > stbuf.st_size)
                ofs = stbuf.st_size;
        if ((len == 0) || (len > stbuf.st_size - ofs))
diff --cc packaging/libjpeg-turbo.spec
index 389c39b,0000000..1352490
mode 100755,000000..100644
--- /dev/null
@@@ -1,123 -1,0 +1,118 @@@
- %define minor   0
 +%define major   8
- #%check
- #make test libdir=%{_libdir}
++%define minor   1
 +%define micro   2
 +%define srcver  1.4.2
 +%define libver  %{major}.%{minor}.%{micro}
 +# major number of library from jpeg8
 +%define cmajor  8
 +
 +Name:           libjpeg-turbo
 +Version:        %{srcver}
 +Release:        1
 +Summary:        A MMX/SSE2 accelerated library for manipulating JPEG image files
 +License:        BSD License (BSD 3-clause, Historic Permission Notice and Disclaimer, libjpeg License)
 +Group:          Graphics & UI Framework/Libraries
 +Url:            http://sourceforge.net/projects/libjpeg-turbo
 +Source0:        http://downloads.sourceforge.net/project/%{name}/%{version}/%{name}-%{version}.tar.gz
 +Source1:        baselibs.conf
 +Source1001:   libjpeg-turbo.manifest
 +BuildRequires:  gcc-c++
 +BuildRequires:  libtool
 +BuildRequires:  yasm
 +
 +%description
 +The libjpeg-turbo package contains a library of functions for manipulating
 +JPEG images.
 +
 +%package -n libjpeg
 +
 +Version:        %{libver}
 +Release:        0
 +Summary:        The MMX/SSE accelerated JPEG compression/decompression library
 +Group:          Graphics & UI Framework/Libraries
 +
 +Provides:       libjpeg = %{version}
 +Provides:       libjpeg8
 +Obsoletes:      libjpeg < %{version}
 +
 +%description -n libjpeg
 +This library contains MMX/SSE accelerated functions for manipulating
 +JPEG images.
 +
 +%package -n libjpeg-devel
 +Version:        %{libver}
 +Release:        0
 +Summary:        Development Tools for applications which will use the Libjpeg Library
 +Group:          Graphics & UI Framework/Development
 +
 +Provides:       libjpeg-turbo-devel
 +Requires:       libjpeg = %{version}
 +Provides:       libjpeg-devel = %{version}
 +Provides:       libjpeg8-devel
 +Obsoletes:      libjpeg-devel < %{version}
 +%if "%{major}" != "%{cmajor}"
 +Conflicts:      libjpeg-devel
 +%endif
 +
 +%description -n libjpeg-devel
 +The libjpeg-devel package includes the header files and libraries
 +necessary for compiling and linking programs which will manipulate JPEG
 +files using the libjpeg library.
 +
 +%prep
 +%setup -q
 +cp %{SOURCE1001} .
 +
 +%build
 +%if "%{tizen_profile_name}" == "tv"
 +echo "tizen_product_tv"
 +export CFLAGS="$CFLAGS -D_TIZEN_PRODUCT_TV -D_USE_PRODUCT_TV"
 +%endif
 +autoreconf -fiv
 +%configure --enable-shared --disable-static --with-jpeg8
 +make %{?_smp_mflags}
 +
- # Fix perms
- chmod -x README-turbo.txt
 +%install
 +%makeinstall
- %license COPYING
- %license README
 +
 +# Remove unwanted files
 +rm -f %{buildroot}%{_libdir}/lib{,turbo}jpeg.la
 +
 +rm %{buildroot}%{_bindir}/tjbench
 +
 +# Remove docs, we'll select docs manually
 +rm -rf %{buildroot}%{_datadir}/doc/
 +
 +%clean
 +rm -rf $RPM_BUILD_ROOT
 +
 +%post -n libjpeg -p /sbin/ldconfig
 +
 +%postun -n libjpeg -p /sbin/ldconfig
 +
 +%docs_package
 +
 +%files
 +%manifest %{name}.manifest
 +%defattr(-,root,root)
- %license COPYING
- %license README
++%license README.ijg
++%license LICENSE.md
 +
 +%files -n libjpeg
 +%manifest %{name}.manifest
 +%defattr(-,root,root)
 +%{_libdir}/libturbojpeg.so.*
 +%{_libdir}/libjpeg.so.%{libver}
 +%{_libdir}/libjpeg.so.%{major}
++%license README.ijg
++%license LICENSE.md
 +
 +%files -n libjpeg-devel
 +%defattr(-,root,root)
 +%{_includedir}/*.h
 +%{_libdir}/pkgconfig/turbojpeg.pc
 +%{_libdir}/libturbojpeg.so
 +%{_libdir}/libjpeg.so
 +%doc coderules.txt jconfig.txt libjpeg.txt structure.txt example.c
 +
 +%changelog
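
The spec only exports -D_TIZEN_PRODUCT_TV -D_USE_PRODUCT_TV when tizen_profile_name is "tv", which is what enables the TV-only hunks guarded throughout this merge; every other profile keeps the stock upstream paths. A small illustration (not part of this patch):

    #include <stdio.h>

    /* -D_USE_PRODUCT_TV (exported only for the tv profile) defines the macro
     * to 1; on other profiles it is undefined and #if treats it as 0, so the
     * stock upstream path is compiled. */
    int main(void)
    {
    #if _USE_PRODUCT_TV
      puts("TV profile build: region decode / color picker paths enabled");
    #else
      puts("stock libjpeg-turbo build");
    #endif
      return 0;
    }
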
diff --cc rdppm.c
+++ b/rdppm.c
@@@ -414,21 -425,14 +425,23 @@@ start_input_ppm (j_compress_ptr cinfo, 
      /* On 16-bit-int machines we have to be careful of maxval = 65535 */
      source->rescale = (JSAMPLE *)
        (*cinfo->mem->alloc_small) ((j_common_ptr) cinfo, JPOOL_IMAGE,
-                                   (size_t) (((long) maxval + 1L) * sizeof(JSAMPLE)));
+                                   (size_t) (((long) maxval + 1L) *
+                                             sizeof(JSAMPLE)));
      half_maxval = maxval / 2;
-       for (val = 0; val <= (INT32) maxval; val++) {
 +#if _USE_PRODUCT_TV
 +    if(maxval > 0) {
-     for (val = 0; val <= (INT32) maxval; val++) {
++      for (val = 0; val <= (long) maxval; val++) {
 +        /* The multiplication here must be done in 32 bits to avoid overflow */
 +        source->rescale[val] = (JSAMPLE) ((val*MAXJSAMPLE + half_maxval)/maxval);
 +      }
 +    }
 +#else
+     for (val = 0; val <= (long) maxval; val++) {
        /* The multiplication here must be done in 32 bits to avoid overflow */
-       source->rescale[val] = (JSAMPLE) ((val*MAXJSAMPLE + half_maxval)/maxval);
+       source->rescale[val] = (JSAMPLE) ((val * MAXJSAMPLE + half_maxval) /
+                                         maxval);
      }
 +#endif
    }
  }
  
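
The "must be done in 32 bits" comment is easy to verify: with a 16-bit maxval the intermediate product overflows a 16-bit int but fits in the long used above. A quick worked example (not part of this patch):

    #include <stdio.h>

    /* Rescale-table formula for a maxval = 65535 PPM: val * 255 can reach
     * 65535 * 255 = 16,711,425, far beyond 16 bits but well inside a
     * 32-bit long. */
    int main(void)
    {
      long maxval = 65535, maxjsample = 255;    /* MAXJSAMPLE is 255 for 8-bit data */
      long half_maxval = maxval / 2;
      long val = 32768;                         /* a mid-range sample */
      long scaled = (val * maxjsample + half_maxval) / maxval;

      printf("%ld -> %ld\n", val, scaled);      /* prints "32768 -> 128" */
      return 0;
    }
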
diff --cc simd/jsimd.h
@@@ -674,15 -853,19 +853,29 @@@ EXTERN(void) jsimd_idct_float_ss
  
  extern const int jconst_idct_float_sse2[];
  EXTERN(void) jsimd_idct_float_sse2
-         (void * dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+         (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
           JDIMENSION output_col);
  
+ /* Huffman coding */
+ extern const int jconst_huff_encode_one_block[];
+ EXTERN(JOCTET*) jsimd_huff_encode_one_block_sse2
+         (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+          c_derived_tbl *dctbl, c_derived_tbl *actbl);
+ EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon
+         (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+          c_derived_tbl *dctbl, c_derived_tbl *actbl);
+ EXTERN(JOCTET*) jsimd_huff_encode_one_block_neon_slowtbl
+         (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
+          c_derived_tbl *dctbl, c_derived_tbl *actbl);
++
 +/* TIZEN_PRODUCT_TV */
 +EXTERN(void) jsimd_pick_color
 +        JPP((JSAMPARRAY output_buf,
 +                       void *pickColor,
 +                       JDIMENSION out_width));
 +
 +EXTERN(void) jsimd_h2v1_fancy_upsample_neon
 +        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
 +             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
diff --cc simd/jsimd_arm.c
  #include <string.h>
  #include <ctype.h>
  
 +#if _USE_PRODUCT_TV
 +//Changes for JPEG GAMMA enhancement in thumbnail
 +#include <unistd.h>
 +#endif
 +
  static unsigned int simd_support = ~0;
+ static unsigned int simd_huffman = 1;
  
  #if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
  
@@@ -278,24 -278,7 +283,23 @@@ jsimd_ycc_rgb_convert (j_decompress_pt
        break;
    }
  
-   if (simd_support & JSIMD_ARM_NEON)
-     neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 +#if _USE_PRODUCT_TV
 +  if (simd_support & JSIMD_ARM_NEON) {
 +    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 +    PickColor* pickColor = cinfo->pick_color_data;
 +    if(pickColor && pickColor->enablePickColor && output_buf) {
 +      int w = cinfo->output_width;
 +      unsigned char *ptr = *output_buf;
 +      if(pickColor->perc <= 0) {
 +        w = pickColor->x2 - pickColor->x1 + 1;
 +        ptr = (*output_buf) + (pickColor->x1 * 3);
 +      }
 +      jsimd_pick_color(ptr, pickColor, w);
 +    }
 +  }
 +#else
+   neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
 +#endif
  }
  
  GLOBAL(void)
diff --cc simd/jsimd_arm_neon.S
@@@ -2438,160 -2440,439 +2440,597 @@@ asm_function jsimd_h2v1_fancy_upsample_
  .purgem upsample32
  .purgem upsample_row
  
- #endif
+ /*****************************************************************************/
+ /*
+  * GLOBAL(JOCTET*)
+  * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
+  *                              JCOEFPTR block, int last_dc_val,
+  *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
+  *
+  */
+ .macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+     sub             \PUT_BITS, \PUT_BITS, #0x8
+     lsr             \TMP, \PUT_BUFFER, \PUT_BITS
+     uxtb            \TMP, \TMP
+     strb            \TMP, [\BUFFER, #1]!
+     cmp             \TMP, #0xff
+     /*it eq*/
+     strbeq          \ZERO, [\BUFFER, #1]!
+ .endm
+ .macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+     /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+     add             \PUT_BITS, \SIZE
+     /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+     orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+ .endm
+ .macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+   cmp               \PUT_BITS, #0x10
+   blt               15f
+     eor               \ZERO, \ZERO, \ZERO
+     emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+     emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+ 15:
+ .endm
+ .balign 16
+ jsimd_huff_encode_one_block_neon_consts:
+   .byte 0x01
+   .byte 0x02
+   .byte 0x04
+   .byte 0x08
+   .byte 0x10
+   .byte 0x20
+   .byte 0x40
+   .byte 0x80
+ asm_function jsimd_huff_encode_one_block_neon
+     push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+     add             r7, sp, #0x1c
+     sub             r4, sp, #0x40
+     bfc             r4, #0, #5
+     mov             sp, r4           /* align sp on 32 bytes */
+     vst1.64         {d8, d9, d10, d11}, [r4, :128]!
+     vst1.64         {d12, d13, d14, d15}, [r4, :128]
+     sub             sp, #0x140       /* reserve 320 bytes */
+     str             r0, [sp, #0x18]  /* working state > sp + Ox18 */
+     add             r4, sp, #0x20    /* r4 = t1 */
+     ldr             lr, [r7, #0x8]   /* lr = dctbl */
+     sub             r10, r1, #0x1    /* r10=buffer-- */
+     ldrsh           r1, [r2]
+     mov             r9, #0x10
+     mov             r8, #0x1
+     adr             r5, jsimd_huff_encode_one_block_neon_consts
+     /* prepare data */
+     vld1.8          {d26}, [r5, :64]
+     veor            q8, q8, q8
+     veor            q9, q9, q9
+     vdup.16         q14, r9
+     vdup.16         q15, r8
+     veor            q10, q10, q10
+     veor            q11, q11, q11
+     sub             r1, r1, r3
+     add             r9, r2, #0x22
+     add             r8, r2, #0x18
+     add             r3, r2, #0x36
+     vmov.16         d0[0], r1
+     vld1.16         {d2[0]}, [r9, :16]
+     vld1.16         {d4[0]}, [r8, :16]
+     vld1.16         {d6[0]}, [r3, :16]
+     add             r1, r2, #0x2
+     add             r9, r2, #0x30
+     add             r8, r2, #0x26
+     add             r3, r2, #0x28
+     vld1.16         {d0[1]}, [r1, :16]
+     vld1.16         {d2[1]}, [r9, :16]
+     vld1.16         {d4[1]}, [r8, :16]
+     vld1.16         {d6[1]}, [r3, :16]
+     add             r1, r2, #0x10
+     add             r9, r2, #0x40
+     add             r8, r2, #0x34
+     add             r3, r2, #0x1a
+     vld1.16         {d0[2]}, [r1, :16]
+     vld1.16         {d2[2]}, [r9, :16]
+     vld1.16         {d4[2]}, [r8, :16]
+     vld1.16         {d6[2]}, [r3, :16]
+     add             r1, r2, #0x20
+     add             r9, r2, #0x32
+     add             r8, r2, #0x42
+     add             r3, r2, #0xc
+     vld1.16         {d0[3]}, [r1, :16]
+     vld1.16         {d2[3]}, [r9, :16]
+     vld1.16         {d4[3]}, [r8, :16]
+     vld1.16         {d6[3]}, [r3, :16]
+     add             r1, r2, #0x12
+     add             r9, r2, #0x24
+     add             r8, r2, #0x50
+     add             r3, r2, #0xe
+     vld1.16         {d1[0]}, [r1, :16]
+     vld1.16         {d3[0]}, [r9, :16]
+     vld1.16         {d5[0]}, [r8, :16]
+     vld1.16         {d7[0]}, [r3, :16]
+     add             r1, r2, #0x4
+     add             r9, r2, #0x16
+     add             r8, r2, #0x60
+     add             r3, r2, #0x1c
+     vld1.16         {d1[1]}, [r1, :16]
+     vld1.16         {d3[1]}, [r9, :16]
+     vld1.16         {d5[1]}, [r8, :16]
+     vld1.16         {d7[1]}, [r3, :16]
+     add             r1, r2, #0x6
+     add             r9, r2, #0x8
+     add             r8, r2, #0x52
+     add             r3, r2, #0x2a
+     vld1.16         {d1[2]}, [r1, :16]
+     vld1.16         {d3[2]}, [r9, :16]
+     vld1.16         {d5[2]}, [r8, :16]
+     vld1.16         {d7[2]}, [r3, :16]
+     add             r1, r2, #0x14
+     add             r9, r2, #0xa
+     add             r8, r2, #0x44
+     add             r3, r2, #0x38
+     vld1.16         {d1[3]}, [r1, :16]
+     vld1.16         {d3[3]}, [r9, :16]
+     vld1.16         {d5[3]}, [r8, :16]
+     vld1.16         {d7[3]}, [r3, :16]
+     vcgt.s16        q8, q8, q0
+     vcgt.s16        q9, q9, q1
+     vcgt.s16        q10, q10, q2
+     vcgt.s16        q11, q11, q3
+     vabs.s16        q0, q0
+     vabs.s16        q1, q1
+     vabs.s16        q2, q2
+     vabs.s16        q3, q3
+     veor            q8, q8, q0
+     veor            q9, q9, q1
+     veor            q10, q10, q2
+     veor            q11, q11, q3
+     add             r9, r4, #0x20
+     add             r8, r4, #0x80
+     add             r3, r4, #0xa0
+     vclz.i16        q0, q0
+     vclz.i16        q1, q1
+     vclz.i16        q2, q2
+     vclz.i16        q3, q3
+     vsub.i16        q0, q14, q0
+     vsub.i16        q1, q14, q1
+     vsub.i16        q2, q14, q2
+     vsub.i16        q3, q14, q3
+     vst1.16         {d0, d1, d2, d3}, [r4, :256]
+     vst1.16         {d4, d5, d6, d7}, [r9, :256]
+     vshl.s16        q0, q15, q0
+     vshl.s16        q1, q15, q1
+     vshl.s16        q2, q15, q2
+     vshl.s16        q3, q15, q3
+     vsub.i16        q0, q0, q15
+     vsub.i16        q1, q1, q15
+     vsub.i16        q2, q2, q15
+     vsub.i16        q3, q3, q15
+     vand            q8, q8, q0
+     vand            q9, q9, q1
+     vand            q10, q10, q2
+     vand            q11, q11, q3
+     vst1.16         {d16, d17, d18, d19}, [r8, :256]
+     vst1.16         {d20, d21, d22, d23}, [r3, :256]
+     add             r1, r2, #0x46
+     add             r9, r2, #0x3a
+     add             r8, r2, #0x74
+     add             r3, r2, #0x6a
+     vld1.16         {d8[0]}, [r1, :16]
+     vld1.16         {d10[0]}, [r9, :16]
+     vld1.16         {d12[0]}, [r8, :16]
+     vld1.16         {d14[0]}, [r3, :16]
+     veor            q8, q8, q8
+     veor            q9, q9, q9
+     veor            q10, q10, q10
+     veor            q11, q11, q11
+     add             r1, r2, #0x54
+     add             r9, r2, #0x2c
+     add             r8, r2, #0x76
+     add             r3, r2, #0x78
+     vld1.16         {d8[1]}, [r1, :16]
+     vld1.16         {d10[1]}, [r9, :16]
+     vld1.16         {d12[1]}, [r8, :16]
+     vld1.16         {d14[1]}, [r3, :16]
+     add             r1, r2, #0x62
+     add             r9, r2, #0x1e
+     add             r8, r2, #0x68
+     add             r3, r2, #0x7a
+     vld1.16         {d8[2]}, [r1, :16]
+     vld1.16         {d10[2]}, [r9, :16]
+     vld1.16         {d12[2]}, [r8, :16]
+     vld1.16         {d14[2]}, [r3, :16]
+     add             r1, r2, #0x70
+     add             r9, r2, #0x2e
+     add             r8, r2, #0x5a
+     add             r3, r2, #0x6c
+     vld1.16         {d8[3]}, [r1, :16]
+     vld1.16         {d10[3]}, [r9, :16]
+     vld1.16         {d12[3]}, [r8, :16]
+     vld1.16         {d14[3]}, [r3, :16]
+     add             r1, r2, #0x72
+     add             r9, r2, #0x3c
+     add             r8, r2, #0x4c
+     add             r3, r2, #0x5e
+     vld1.16         {d9[0]}, [r1, :16]
+     vld1.16         {d11[0]}, [r9, :16]
+     vld1.16         {d13[0]}, [r8, :16]
+     vld1.16         {d15[0]}, [r3, :16]
+     add             r1, r2, #0x64
+     add             r9, r2, #0x4a
+     add             r8, r2, #0x3e
+     add             r3, r2, #0x6e
+     vld1.16         {d9[1]}, [r1, :16]
+     vld1.16         {d11[1]}, [r9, :16]
+     vld1.16         {d13[1]}, [r8, :16]
+     vld1.16         {d15[1]}, [r3, :16]
+     add             r1, r2, #0x56
+     add             r9, r2, #0x58
+     add             r8, r2, #0x4e
+     add             r3, r2, #0x7c
+     vld1.16         {d9[2]}, [r1, :16]
+     vld1.16         {d11[2]}, [r9, :16]
+     vld1.16         {d13[2]}, [r8, :16]
+     vld1.16         {d15[2]}, [r3, :16]
+     add             r1, r2, #0x48
+     add             r9, r2, #0x66
+     add             r8, r2, #0x5c
+     add             r3, r2, #0x7e
+     vld1.16         {d9[3]}, [r1, :16]
+     vld1.16         {d11[3]}, [r9, :16]
+     vld1.16         {d13[3]}, [r8, :16]
+     vld1.16         {d15[3]}, [r3, :16]
+     vcgt.s16        q8, q8, q4
+     vcgt.s16        q9, q9, q5
+     vcgt.s16        q10, q10, q6
+     vcgt.s16        q11, q11, q7
+     vabs.s16        q4, q4
+     vabs.s16        q5, q5
+     vabs.s16        q6, q6
+     vabs.s16        q7, q7
+     veor            q8, q8, q4
+     veor            q9, q9, q5
+     veor            q10, q10, q6
+     veor            q11, q11, q7
+     add             r1, r4, #0x40
+     add             r9, r4, #0x60
+     add             r8, r4, #0xc0
+     add             r3, r4, #0xe0
+     vclz.i16        q4, q4
+     vclz.i16        q5, q5
+     vclz.i16        q6, q6
+     vclz.i16        q7, q7
+     vsub.i16        q4, q14, q4
+     vsub.i16        q5, q14, q5
+     vsub.i16        q6, q14, q6
+     vsub.i16        q7, q14, q7
+     vst1.16         {d8, d9, d10, d11}, [r1, :256]
+     vst1.16         {d12, d13, d14, d15}, [r9, :256]
+     vshl.s16        q4, q15, q4
+     vshl.s16        q5, q15, q5
+     vshl.s16        q6, q15, q6
+     vshl.s16        q7, q15, q7
+     vsub.i16        q4, q4, q15
+     vsub.i16        q5, q5, q15
+     vsub.i16        q6, q6, q15
+     vsub.i16        q7, q7, q15
+     vand            q8, q8, q4
+     vand            q9, q9, q5
+     vand            q10, q10, q6
+     vand            q11, q11, q7
+     vst1.16         {d16, d17, d18, d19}, [r8, :256]
+     vst1.16         {d20, d21, d22, d23}, [r3, :256]
+     ldr             r12, [r7, #0xc]       /* r12 = actbl */
+     add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
+     mov             r9, r12               /* r9 = actbl */
+     add             r6, r4, #0x80         /* r6 = t2 */
+     ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
+     ldr             r4, [r0, #0xc]        /* r4  = put_bits */
+     ldrh            r2, [r6, #-128]       /* r2  = nbits */
+     ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG) 1)<<nbits) - 1; */
+     ldr             r0, [lr, r2, lsl #2]
+     ldrb            r5, [r1, r2]
+     put_bits        r11, r4, r0, r5
+     checkbuf15      r10, r11, r4, r5, r0
+     put_bits        r11, r4, r3, r2
+     checkbuf15      r10, r11, r4, r5, r0
+     mov             lr, r6                /* lr = t2 */
+     add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
+     ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
+     veor            q8, q8, q8
+     vceq.i16        q0, q0, q8
+     vceq.i16        q1, q1, q8
+     vceq.i16        q2, q2, q8
+     vceq.i16        q3, q3, q8
+     vceq.i16        q4, q4, q8
+     vceq.i16        q5, q5, q8
+     vceq.i16        q6, q6, q8
+     vceq.i16        q7, q7, q8
+     vmovn.i16       d0, q0
+     vmovn.i16       d2, q1
+     vmovn.i16       d4, q2
+     vmovn.i16       d6, q3
+     vmovn.i16       d8, q4
+     vmovn.i16       d10, q5
+     vmovn.i16       d12, q6
+     vmovn.i16       d14, q7
+     vand            d0, d0, d26
+     vand            d2, d2, d26
+     vand            d4, d4, d26
+     vand            d6, d6, d26
+     vand            d8, d8, d26
+     vand            d10, d10, d26
+     vand            d12, d12, d26
+     vand            d14, d14, d26
+     vpadd.i8        d0, d0, d2
+     vpadd.i8        d4, d4, d6
+     vpadd.i8        d8, d8, d10
+     vpadd.i8        d12, d12, d14
+     vpadd.i8        d0, d0, d4
+     vpadd.i8        d8, d8, d12
+     vpadd.i8        d0, d0, d8
+     vmov.32         r1, d0[1]
+     vmov.32         r8, d0[0]
+     mvn             r1, r1
+     mvn             r8, r8
+     lsrs            r1, r1, #0x1
+     rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
+     rbit            r1, r1            /* r1 = index1 */
+     rbit            r8, r8            /* r8 = index0 */
+     ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
+     str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
+     cmp             r8, #0x0
+     beq             6f
+ 1:
+     clz             r2, r8
+     add             lr, lr, r2, lsl #1
+     lsl             r8, r8, r2
+     ldrh            r1, [lr, #-126]
+ 2:
+     cmp             r2, #0x10
+     blt             3f
+     sub             r2, r2, #0x10
+     put_bits        r11, r4, r0, r6
+     cmp             r4, #0x10
+     blt             2b
+     eor             r3, r3, r3
+     emit_byte       r10, r11, r4, r3, r12
+     emit_byte       r10, r11, r4, r3, r12
+     b               2b
+ 3:
+     add             r2, r1, r2, lsl #4
+     ldrh            r3, [lr, #2]!
+     ldr             r12, [r9, r2, lsl #2]
+     ldrb            r2, [r5, r2]
+     put_bits        r11, r4, r12, r2
+     checkbuf15      r10, r11, r4, r2, r12
+     put_bits        r11, r4, r3, r1
+     checkbuf15      r10, r11, r4, r2, r12
+     lsls            r8, r8, #0x1
+     bne             1b
+ 6:
+     add             r12, sp, #0x20   /* r12 = t1 */
+     ldr             r8, [sp, #0x14]  /* r8 = index1 */
+     adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
+     cmp             r8, #0x0
+     beq             6f
+     clz             r2, r8
+     sub             r12, r12, lr
+     lsl             r8, r8, r2
+     add             r2, r2, r12, lsr #1
+     add             lr, lr, r2, lsl #1
+     b               7f
+ 1:
+     clz             r2, r8
+     add             lr, lr, r2, lsl #1
+     lsl             r8, r8, r2
+ 7:
+     ldrh            r1, [lr, #-126]
+ 2:
+     cmp             r2, #0x10
+     blt             3f
+     sub             r2, r2, #0x10
+     put_bits        r11, r4, r0, r6
+     cmp             r4, #0x10
+     blt             2b
+     eor             r3, r3, r3
+     emit_byte       r10, r11, r4, r3, r12
+     emit_byte       r10, r11, r4, r3, r12
+     b               2b
+ 3:
+     add             r2, r1, r2, lsl #4
+     ldrh            r3, [lr, #2]!
+     ldr             r12, [r9, r2, lsl #2]
+     ldrb            r2, [r5, r2]
+     put_bits        r11, r4, r12, r2
+     checkbuf15      r10, r11, r4, r2, r12
+     put_bits        r11, r4, r3, r1
+     checkbuf15      r10, r11, r4, r2, r12
+     lsls            r8, r8, #0x1
+     bne             1b
+ 6:
+     add             r0, sp, #0x20
+     add             r0, #0xfe
+     cmp             lr, r0
+     bhs             1f
+     ldr             r1, [r9]
+     ldrb            r0, [r5]
+     put_bits        r11, r4, r1, r0
+     checkbuf15      r10, r11, r4, r0, r1
+ 1:
+     ldr             r12, [sp, #0x18]
+     str             r11, [r12, #0x8]
+     str             r4, [r12, #0xc]
+     add             r0, r10, #0x1
+     add             r4, sp, #0x140
+     vld1.64         {d8, d9, d10, d11}, [r4, :128]!
+     vld1.64         {d12, d13, d14, d15}, [r4, :128]
+     sub             r4, r7, #0x1c
+     mov             sp, r4
+     pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+ .purgem emit_byte
+ .purgem put_bits
+ .purgem checkbuf15
++
 +#if _USE_PRODUCT_TV
 +asm_function jsimd_pick_color
 +
 +@                RGB_BUFFER   .req r0
 +@                RGB_RET      .req r1
 +@                OUTPUT_WIDTH   .req r2
 +
 +              push            {r3, r4, r5, lr}
 +              vpush           {d8-d15}
 +              MOV r5, #0
 +              VDUP.32 d0, r5  
 +              VDUP.32 d1, r5  
 +              VDUP.32 d2, r5  
 +              VDUP.32 d3, r5  
 +              VDUP.32 d4, r5  
 +              VDUP.32 d5, r5  
 +              VDUP.32 d6, r5  
 +              VDUP.32 d7, r5  
 +              VDUP.32 d8, r5  
 +      
 +              CMP   r2,#0x8
 +              BCC   UNDER_8   
 +         
 +              CMP   r2,#0x10
 +              BCC   UNDER_16   
 +         
 +              VLD3.8 {d0, d2, d4}, [r0]!
 +              VLD3.8 {d1, d3, d5}, [r0]!
 +
 +              SUB r2, r2, #16
 +              VPADDL.U8  q0,q0        
 +              VPADDL.U8  q1,q1        
 +              VPADDL.U8  q2,q2 
 +              
 +                    
 +      PROCESS_LOOP:
 +      
 +              CMP r2, #0x10
 +              BCC LOOP_BREAK                  
 +      
 +              SUB r2, r2, #16
 +              CMP r2, #0
 +              BLT LOOP_BREAK
 +              
 +              VLD3.8 {d6, d8, d10}, [r0]!
 +              VLD3.8 {d7, d9, d11}, [r0]!
 +
 +              VPADAL.U8  q0,q3        
 +              VPADAL.U8  q1,q4        
 +              VPADAL.U8  q2,q5 
 +
 +              B PROCESS_LOOP
 +
 +      LOOP_BREAK:     
 +              
 +              VPADDL.U16 q0, q0
 +              VPADDL.U16 q1, q1
 +              VPADDL.U16 q2, q2
 +              
 +              VPADDL.U32 q0, q0
 +              VPADDL.U32 q1, q1
 +              VPADDL.U32 q2, q2
 +
 +              VADD.I64 d0, d0, d1
 +              VADD.I64 d2, d2, d3
 +              VADD.I64 d4, d4, d5
 +              
 +      PROCESS_REST:
 +              CMP r2, #8
 +              BLT PROCESS_U_8 @ignore less than 8 pixels as of now
 +
 +                VLD3.8 {d6, d7, d8}, [r0]!
 +                VPADDL.U8  d6, d6
 +                VPADDL.U8  d7, d7
 +                VPADDL.U8  d8, d8
 +
 +                VPADDL.U16 d6, d6
 +                VPADDL.U16 d7, d7
 +                VPADDL.U16 d8, d8
 +              
 +                VPADDL.U32 d6, d6
 +                VPADDL.U32 d7, d7
 +                VPADDL.U32 d8, d8
 +
 +              VADD.I64 d0, d0, d6             
 +              VADD.I64 d2, d2, d7             
 +              VADD.I64 d4, d4, d8             
 +              
 +              SUB r2, r2, #8
 +      
 +      PROCESS_U_8:
 +              CMP r2, #4
 +              BLT PROCESS_U_4
 +      
 +              VLD3.8 {d6[0], d7[0], d8[0]}, [r0]!
 +              VLD3.8 {d6[1], d7[1], d8[1]}, [r0]!
 +              VLD3.8 {d6[2], d7[2], d8[2]}, [r0]!
 +              VLD3.8 {d6[3], d7[3], d8[3]}, [r0]!
 +
 +              VPADDL.U8  d6, d6
 +              VPADDL.U8  d7, d7
 +              VPADDL.U8  d8, d8
 +              
 +              VPADDL.U16  d6, d6
 +              VPADDL.U16  d7, d7
 +              VPADDL.U16  d8, d8
 +              
 +              VADD.I64 d0, d0, d6             
 +              VADD.I64 d2, d2, d7             
 +              VADD.I64 d4, d4, d8             
 +              
 +              SUB r2, r2, #4
 +
 +        PROCESS_U_4:
 +@                CMP r2, #2
 +@                BLT PROCESS_U_2
 +
 +              B STORE
 +
 +
 +      UNDER_16: 
 +                              
 +              VLD3.8 {d0, d2, d4}, [r0]!
 +              VPADDL.U8  d0, d0
 +              VPADDL.U8  d2, d2
 +              VPADDL.U8  d4, d4
 +      
 +              VPADDL.U16 d0, d0
 +              VPADDL.U16 d2, d2
 +              VPADDL.U16 d4, d4
 +
 +              VPADDL.U32 d0, d0
 +              VPADDL.U32 d2, d2
 +              VPADDL.U32 d4, d4
 +              
 +              B STORE
 +
 +      STORE:
 +              VMOV.U32  r3, d0[0]     
 +              LDR r4, [r1]
 +              ADD r4, r4, r3
 +              STR r4, [r1]
 +      
 +               VMOV.U32 r3, d2[0]
 +              LDR r4, [r1, #4]
 +              ADD r4, r4, r3
 +              STR r4, [r1, #4]
 +               
 +              VMOV.U32 r3, d4[0]
 +              LDR r4, [r1, #8]
 +              ADD r4, r4, r3
 +              STR r4, [r1, #8]
 +      
 +      UNDER_8:
 +              vpop            {d8-d15} 
 +              pop             {r3, r4, r5, pc}
++#endif
diff --cc tjbench.c
+++ b/tjbench.c
  
  #define _throw(op, err) {  \
        printf("ERROR in line %d while %s:\n%s\n", __LINE__, op, err);  \
-   retval=-1;  goto bailout;}
+       retval=-1;  goto bailout;}
 +#if _USE_PRODUCT_TV
 +#define _throwunix(m) { \
 +      char err_str[256]; \
 +      strerror_r(errno, err_str, 256); \
 +      _throw(m, err_str) \
 +}
 +#else
  #define _throwunix(m) _throw(m, strerror(errno))
 +#endif
  #define _throwtj(m) _throw(m, tjGetErrorStr())
  #define _throwbmp(m) _throw(m, bmpgeterr())
  
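
The TV build replaces strerror(), whose result may live in shared static storage, with strerror_r() into a local buffer. A minimal sketch of the same pattern outside the macro (XSI strerror_r assumed, as in the patch; the GNU variant instead returns a char* that should be printed):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    /* Thread-safe errno reporting: strerror_r() fills a caller-owned buffer
     * instead of returning a pointer to static storage like strerror(). */
    static void report_errno(const char *op)
    {
      char err_str[256];

      err_str[0] = '\0';
      strerror_r(errno, err_str, sizeof(err_str));
      fprintf(stderr, "ERROR while %s: %s\n", op, err_str);
    }
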
diff --cc tjunittest.c
Simple merge