tizen 2.3.1 release accepted/tizen_2.4_mobile tizen_2.3.1 tizen_2.4 accepted/tizen/2.4/mobile/20151029.040056 submit/tizen_2.3.1/20150915.074655 submit/tizen_2.4/20151028.063113 tizen_2.3.1_release tizen_2.4_mobile_release
author jk7744.park <jk7744.park@samsung.com>
Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
committer jk7744.park <jk7744.park@samsung.com>
Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
24 files changed:
COPYING [new file with mode: 0644]
Makefile.am
configure.ac
debian/changelog
debian/copyright [new file with mode: 0644]
debian/libjpeg-turbo-dev.files
debian/libjpeg-turbo.files
jdapimin.c
jdcoefct.c
jdmarker.c
jdmerge.c
jpeglib.h
libjpeg-turbo.manifest [new file with mode: 0755]
packaging/libjpeg-turbo.spec
pkgconfig/turbojpeg.pc.in
simd/jdclrss2-64.asm
simd/jdclrss2.asm
simd/jdmrgss2-64.asm
simd/jdmrgss2.asm
simd/jsimd.h
simd/jsimd_arm.c
simd/jsimd_arm_neon.S
simd/jsimdcfg.inc.h
simd/jsimdext.inc

diff --git a/COPYING b/COPYING
new file mode 100644 (file)
index 0000000..d8c7ce7
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,31 @@
+*******************************************************************************
+**     License
+*******************************************************************************
+
+Most of libjpeg-turbo inherits the non-restrictive, BSD-style license used by
+libjpeg (see README.)  The TurboJPEG/OSS wrapper (both C and Java versions) and
+associated test programs bear a similar license, which is reproduced below:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
index 456bb1f..3434b10 100644 (file)
@@ -1,10 +1,8 @@
 lib_LTLIBRARIES = libjpeg.la libturbojpeg.la
 libjpeg_la_LDFLAGS = -version-info ${SO_MAJOR_VERSION}:${SO_MINOR_VERSION} -no-undefined
 libturbojpeg_la_LDFLAGS = -avoid-version -no-undefined
-include_HEADERS = turbojpeg.h
-installheaderjpeglibdir = $(includedir)/turbojpeg
-installheaderjpeglib_HEADERS = jpeglib.h jerror.h jmorecfg.h jconfig.h
-#nodist_include_HEADERS = jconfig.h
+include_HEADERS = jerror.h jmorecfg.h jpeglib.h turbojpeg.h
+nodist_include_HEADERS = jconfig.h
 
 HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
        jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h
@@ -18,7 +16,6 @@ libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
        jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
        jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
 
-
 if WITH_ARITH
 
 libjpeg_la_SOURCES += jaricom.c
@@ -96,7 +93,8 @@ cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdbmp.c rdgif.c \
 cjpeg_LDADD = libjpeg.la
 
 cjpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
-       -DTARGA_SUPPORTED
+       -DTARGA_SUPPORTED \
+       -fPIE -pie
 
 djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
        wrbmp.c wrgif.c wrppm.c wrtarga.c
@@ -104,7 +102,8 @@ djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
 djpeg_LDADD = libjpeg.la
 
 djpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
-       -DTARGA_SUPPORTED
+       -DTARGA_SUPPORTED \
+        -fPIE -pie
 
 jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c transupp.h
 
index d7b249b..22c6c20 100644 (file)
@@ -159,7 +159,7 @@ EOF
 AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
   [VERSION_SCRIPT_FLAG=-Wl,--version-script,; AC_MSG_RESULT([yes (GNU style)])], [])
 if test "x$VERSION_SCRIPT_FLAG" = "x"; then
-  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map"
+  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map -pie"
   AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
     [VERSION_SCRIPT_FLAG=-Wl,-M,; AC_MSG_RESULT([yes (Sun style)])], [])
 fi
index 3c15320..3efa3fa 100644 (file)
@@ -1,3 +1,51 @@
+libjpeg-turbo (1.2.0-8) unstable; urgency=low
+
+  * [TREL] Install djpeg, cjpeg
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-8
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Fri, 05 Apr 2013 14:10:35 +0900
+
+libjpeg-turbo (1.2.0-7) unstable; urgency=low
+
+  * Enable simd for i586 (including IA)
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-7
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Mon, 07 Jan 2013 15:14:52 +0900
+
+libjpeg-turbo (1.2.0-6) unstable; urgency=low
+
+  * Enable libjpeg package of libjpeg-turbo
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-6
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 25 Oct 2012 01:09:00 +0900
+
+libjpeg-turbo (1.2.0-5) unstable; urgency=low
+
+  * License file copied to /usr/share/license/
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-5
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Fri, 12 Oct 2012 18:13:55 +0900
+
+libjpeg-turbo (1.2.0-4) unstable; urgency=low
+
+  * Fix CVE-2012-2806
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-4
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 04 Oct 2012 16:23:12 +0900
+
+libjpeg-turbo (1.2.0-3) unstable; urgency=low
+
+  * Add manifest file
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-3
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 20 Sep 2012 20:57:44 +0900
+
 libjpeg-turbo (1.2.0-2) unstable; urgency=low
 
   * Tag [Version] 1.2.0-2
diff --git a/debian/copyright b/debian/copyright
new file mode 100644 (file)
index 0000000..8452ed9
--- /dev/null
@@ -0,0 +1,111 @@
+Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Name: libjpeg-turbo
+Source: lp:libjpeg-turbo
+
+Files: *
+Copyright: 1999-2006 MIYASAKA Masaru 
+    2004 Landmark Graphics Corporation
+    2005-2007 Sun Microsystems, Inc.
+    2009 Pierre Ossman for Cendio AB
+    2009-2010 D. R. Commander
+    2010 Thomas G. Lane, Guido Vollbeding
+    2009, Thomas G. Lane, Guido Vollbeding
+    1998, Thomas G. Lane
+    2010 Nokia Corporation
+License:  JPEG
+ .
+ In plain English:
+ .
+ 1. We don't promise that this software works.  (But if you find any bugs,
+   please let us know!)
+ 2. You can use this software for whatever you want.  You don't have to pay us.
+ 3. You may not pretend that you wrote this software.  If you use it in a
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
+ .
+ In legalese:
+ The authors make NO WARRANTY or representation, either express or implied,
+ with respect to this software, its quality, accuracy, merchantability, or
+ fitness for a particular purpose.  This software is provided "AS IS", and you,
+ its user, assume the entire risk as to its quality and accuracy.
+ .
+ This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ All Rights Reserved except as specified below.
+ .
+ Permission is hereby granted to use, copy, modify, and distribute this
+ software (or portions thereof) for any purpose, without fee, subject to these
+ conditions:
+ (1) If any part of the source code for this software is distributed, then this
+ README file must be included, with this copyright and no-warranty notice
+ unaltered; and any additions, deletions, or changes to the original files
+ must be clearly indicated in accompanying documentation.
+ (2) If only executable code is distributed, then the accompanying
+ documentation must state that "this software is based in part on the work of
+ the Independent JPEG Group".
+ (3) Permission for use of this software is granted only if the user accepts
+ full responsibility for any undesirable consequences; the authors accept
+ NO LIABILITY for damages of any kind.
+ .
+ These conditions apply to any software derived from or based on the IJG code,
+ not just to the unmodified library.  If you use our work, you ought to
+ acknowledge us.
+ .
+ Permission is NOT granted for the use of any IJG author's name or company name
+ in advertising or publicity relating to this software or products derived from
+ it.  This software may be referred to only as "the Independent JPEG Group's
+ software".
+ .
+ We specifically permit and encourage the use of this software as the basis of
+ commercial products, provided that all warranty or liability claims are
+ assumed by the product vendor.
+ .
+ .
+ ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
+ sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA
+ .
+ ansi2knr.c is NOT covered by the above copyright and conditions, but instead
+ by the usual distribution terms of the Free Software Foundation; principally,
+ that you must include source code if you redistribute it.  (See the file
+ ansi2knr.c for full details.)  However, since ansi2knr.c is not needed as part
+ of any program generated from the IJG code, this does not limit you more than
+ the foregoing paragraphs do.
+ .
+ The Unix configuration script "configure" was produced with GNU Autoconf.
+ It is copyright by the Free Software Foundation but is freely distributable.
+ The same holds for its supporting scripts (config.guess, config.sub,
+ ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+ but is also freely distributable.
+ .
+ The IJG distribution formerly included code to read and write GIF files.
+ To avoid entanglement with the Unisys LZW patent, GIF reading support has
+ been removed altogether, and the GIF writer has been simplified to produce
+ "uncompressed GIFs".  This technique does not use the LZW algorithm; the
+ resulting GIF files are larger than usual, but are readable by all standard
+ GIF decoders.
+ .
+ We are required to state that
+    "The Graphics Interchange Format(c) is the Copyright property of
+    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
+    CompuServe Incorporated."
+
+Files: debian/*
+Copyright: 2010, 2011 Linaro Limited
+License: LGPL-2.1
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License (LGPL) as published by the Free Software Foundation;
+ either version 2 of the License, or (at your option) any later
+ version.
+ .
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Library General Public License for more details.
+ .
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB.  If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+ .
+ On Debian systems, the complete text of the GNU Library General Public
+ License, version 2, can be found in /usr/share/common-licenses/LGPL-2.1.
index 826e558..0c22589 100644 (file)
@@ -1,8 +1,2 @@
-usr/include/turbojpeg.h
-usr/include/turbojpeg/jpeglib.h
-usr/include/turbojpeg/jerror.h
-usr/include/turbojpeg/jmorecfg.h
-usr/include/turbojpeg/jconfig.h
+usr/include/*.h
 usr/lib/pkgconfig/*
-usr/lib/libturbojpeg.a
-usr/lib/libturbojpeg.la
index f3d5763..5fae0b0 100644 (file)
@@ -1 +1,2 @@
 usr/lib/libturbojpeg.so*
+usr/lib/libjpeg.so*
index cadb59f..4061a3d 100644 (file)
@@ -78,6 +78,18 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
   /* And initialize the overall input controller. */
   jinit_input_controller(cinfo);
 
+  /* Init region to decode to be empty */
+  cinfo->region_x = 0;
+  cinfo->region_y = 0;
+  cinfo->region_w = 0;
+  cinfo->region_h = 0;
+
+  /* Init region to decode to be empty */
+  cinfo->region_x = 0;
+  cinfo->region_y = 0;
+  cinfo->region_w = 0;
+  cinfo->region_h = 0;
+
   /* OK, I'm ready */
   cinfo->global_state = DSTATE_START;
 }
index 48a9fc6..304fd5a 100644 (file)
@@ -160,12 +160,45 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   JDIMENSION start_col, output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
+  /* region decoding. this limits decode to the set of blocks +- 1 outside
+   * bounding blocks around the desired region to decode */
+  int blk1 = 0, blk2 = 0, skip = 0;
+
+  if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+    int bsz_w = 0, bsz_h = 0;
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      if (compptr->MCU_sample_width > bsz_w)
+        bsz_w = compptr->MCU_sample_width;
+      if ((compptr->MCU_height * 8) > bsz_h)
+        bsz_h = compptr->MCU_height * 8;
+    }
+    int _region_y = (int)cinfo->region_y;
+    _region_y = (_region_y>>1)<<1;
+    if (((int)cinfo->output_scanline < (_region_y - bsz_h - 1)) ||
+        ((int)cinfo->output_scanline > (_region_y + cinfo->region_h + bsz_h)))
+      skip = 1;
+    blk1 = (cinfo->region_x / bsz_w) - 1;
+    if (blk1 < 0) blk1 = 0;
+    blk2 = ((cinfo->region_x + cinfo->region_w + bsz_w - 1) / bsz_w) + 1;
+    if (blk2 < 0) blk2 = 0;
+  }
 
   /* Loop to process as much as one whole iMCU row */
   for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
        yoffset++) {
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
         MCU_col_num++) {
+      /* see if we need to skip this MCU or not */
+      if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+        if (!((MCU_col_num < blk1) || (MCU_col_num > blk2) || skip))
+          skip = 0;
+      }
+      /* if we are not skipping this MCU, zero it ready for huffman decode */
+      if (!skip)
+        jzero_far((void FAR *) coef->MCU_buffer[0],
+                  (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       jzero_far((void FAR *) coef->MCU_buffer[0],
                (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
@@ -175,6 +208,10 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
        coef->MCU_ctr = MCU_col_num;
        return JPEG_SUSPENDED;
       }
+      /* region decoding. this limits decode to the set of blocks +- 1 outside
+       * bounding blocks around the desired region to decode */
+      if (skip)
+        continue;
       /* Determine where data should go in output_buf and do the IDCT thing.
        * We skip dummy blocks at the right and bottom edges (but blkn gets
        * incremented past them!).  Note the inner loop relies on having
index d8dcba9..6fc0f7d 100644 (file)
@@ -323,14 +323,15 @@ get_sos (j_decompress_ptr cinfo)
 
   /* Collect the component-spec parameters */
 
-  for (i = 0; i < cinfo->num_components; i++)
+  for (i = 0; i < MAX_COMPS_IN_SCAN; i++)
     cinfo->cur_comp_info[i] = NULL;
 
   for (i = 0; i < n; i++) {
     INPUT_BYTE(cinfo, cc, return FALSE);
     INPUT_BYTE(cinfo, c, return FALSE);
     
-    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+    for (ci = 0, compptr = cinfo->comp_info;
+        ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
         ci++, compptr++) {
       if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
        goto id_found;
index cfa3bb9..6ecf93b 100644 (file)
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -248,6 +248,7 @@ merged_2v_upsample (j_decompress_ptr cinfo,
   my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
   JSAMPROW work_ptrs[2];
   JDIMENSION num_rows;         /* number of rows returned to caller */
+  int skip = 0;
 
   if (upsample->spare_full) {
     /* If we have a spare row saved from a previous cycle, just return it. */
@@ -256,6 +257,13 @@ merged_2v_upsample (j_decompress_ptr cinfo,
     num_rows = 1;
     upsample->spare_full = FALSE;
   } else {
+    int _region_y = (int)cinfo->region_y;
+    _region_y = (_region_y>>1)<<1;
+    if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+       if (((int)cinfo->output_scanline < _region_y) ||
+           ((int)cinfo->output_scanline >= (_region_y + (int)cinfo->region_h)))
+         skip = 1;
+    }
     /* Figure number of rows to return to caller. */
     num_rows = 2;
     /* Not more than the distance to the end of the image. */
@@ -274,7 +282,8 @@ merged_2v_upsample (j_decompress_ptr cinfo,
       upsample->spare_full = TRUE;
     }
     /* Now do the upsampling. */
-    (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
+    if (!skip)
+      (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
   }
 
   /* Adjust counts */
index d19a3ef..0f53709 100644 (file)
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -491,6 +491,8 @@ struct jpeg_decompress_struct {
 
   unsigned int scale_num, scale_denom; /* fraction by which to scale image */
 
+  unsigned int region_x, region_y, region_w, region_h; /* if region_w && region_h > 0, then use this region to decode. scale above is done prior to region select */
+
   double output_gamma;         /* image gamma wanted in output */
 
   boolean buffered_image;      /* TRUE=multiple output passes */
diff --git a/libjpeg-turbo.manifest b/libjpeg-turbo.manifest
new file mode 100755 (executable)
index 0000000..d1b4247
--- /dev/null
@@ -0,0 +1,12 @@
+<manifest>
+        <define>
+                <domain name="libjpeg-turbo"/>
+        </define>
+        <request>
+                <domain name="_" />
+        </request>
+        <assign>
+                <filesystem path="/usr/bin/cjpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+                <filesystem path="/usr/bin/djpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+        </assign>
+</manifest>
index 588a180..60e14d1 100644 (file)
@@ -1,9 +1,9 @@
 Name:           libjpeg-turbo
-License:        BSD3c(or similar)
+License:        BSD-2.0
 Group:          Productivity/Graphics/Convertors
 AutoReqProv:    on
 Version:       1.2.0
-Release:        2
+Release:        12
 Summary:        A MMX/SSE2 accelerated library for manipulating JPEG image files
 Url:            http://sourceforge.net/projects/libjpeg-turbo
 Source0:        %{name}-%{version}.tar.gz
@@ -13,25 +13,28 @@ The libjpeg-turbo package contains a library of functions for manipulating
 JPEG images.
 
 %package devel
-
-License:        BSD3c(or similar)
 Summary:        Development files for libjpeg-turbo contains a wrapper library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
 Group:          Development/Libraries/C and C++
 Requires:       %{name} = %{version}-%{release}
-
+Provides:      libjpeg-devel
+%ifarch %{ix86}
+BuildRequires:  nasm
+%endif
 %description devel
-The libjpeg-turbo shared libraries can be used as drop-in replacements for libjpeg on most systems
+The libjpeg-devel package includes the header files and documentation
+necessary for developing programs which will manipulate JPEG files using
+the libjpeg library.
+
+If you are going to develop programs which will manipulate JPEG images,
+you should install libjpeg-devel.  You'll also need to have the libjpeg
+package installed.
 
 %prep
 %setup -q 
 
 %build
 autoreconf -fiv
-%ifarch %{arm}
-%configure --disable-static --with-jpeg8
-%else
-%configure --disable-static --with-jpeg8 --without-simd
-%endif
+%configure --enable-shared --disable-static --with-jpeg8
 make %{?_smp_mflags}
 
 #%check
@@ -39,6 +42,8 @@ make %{?_smp_mflags}
 
 %install
 %makeinstall
+mkdir -p %{buildroot}/usr/share/license
+cp COPYING %{buildroot}/usr/share/license/%{name}
 # Fix perms
 chmod -x README-turbo.txt release/copyright
 
@@ -50,27 +55,27 @@ rm -rf $RPM_BUILD_ROOT
 %postun  -p /sbin/ldconfig
 
 %files
+/usr/share/license/%{name}
+%manifest libjpeg-turbo.manifest
 %defattr(-,root,root)
 %{_libdir}/libturbojpeg.so
+%{_libdir}/libjpeg.so.*
+%{_bindir}/cjpeg
+%{_bindir}/djpeg
 %exclude %{_datadir}/man/man1/*
 %exclude %{_datadir}/doc/
-%exclude %{_bindir}/cjpeg
-%exclude %{_bindir}/djpeg
+#%exclude %{_bindir}/cjpeg
+#%exclude %{_bindir}/djpeg
 %exclude %{_bindir}/jpegtran
 %exclude %{_bindir}/rdjpgcom
 %exclude %{_bindir}/tjbench
 %exclude %{_bindir}/wrjpgcom
-%exclude %{_libdir}/libjpeg.so.*
 
 
 %files devel
 %defattr(-,root,root)
-%{_includedir}/turbojpeg.h
-%exclude %{_libdir}/libjpeg.so
-%{_includedir}/turbojpeg/jpeglib.h
-%{_includedir}/turbojpeg/jerror.h
-%{_includedir}/turbojpeg/jmorecfg.h
-%{_includedir}/turbojpeg/jconfig.h
+%{_libdir}/libjpeg.so
+%{_includedir}/*.h
 %{_libdir}/pkgconfig/turbojpeg.pc
 %exclude %{_libdir}/libjpeg.la
 %exclude %{_libdir}/libturbojpeg.la
index 123e590..df77158 100755 (executable)
@@ -1,7 +1,7 @@
 prefix=/usr
 exec_prefix=${prefix}
 libdir=${exec_prefix}/lib
-includedir=${prefix}/include/turbojpeg
+includedir=${prefix}/include
 
 Name: libturbojpeg
 Description: Loads and saves jpeg
index 696a383..7d17c52 100644 (file)
@@ -1,8 +1,8 @@
 ;
 ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
 ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .nextrow
 
@@ -271,31 +267,28 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     rcx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     rcx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_MMWORD
        jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_MMWORD
        sub     rcx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
@@ -304,7 +297,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        ; space.
        cmp     rcx, byte SIZEOF_DWORD
        jb      short .column_st3
-       movd    DWORD [rdi], xmmA
+       movd    XMM_DWORD [rdi], xmmA
        add     rdi, byte SIZEOF_DWORD
        sub     rcx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
@@ -324,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        test    rcx, rcx
        jz      short .nextrow
        mov     BYTE [rdi], al
-%else
-       mov     rax,rcx
-       xor     rcx, byte 0x0F
-       shl     rcx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     rax,rcx
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,rcx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -409,19 +361,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .nextrow
 
@@ -431,25 +378,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
        cmp     rcx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_XMMWORD/8
@@ -463,48 +407,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        ; space.
        test    rcx, rcx
        jz      short .nextrow
-       movd    DWORD [rdi], xmmA
-%else
-       cmp     rcx, byte SIZEOF_XMMWORD/16
-       jb      near .nextrow
-       mov     rax,rcx
-       xor     rcx, byte 0x03
-       inc     rcx
-       shl     rcx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [rdi], xmmA
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
index 7f519e6..97754cb 100644 (file)
@@ -1,7 +1,8 @@
 ;
 ; jdclrss2.asm - colorspace conversion (SSE2)
 ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -262,17 +263,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .nextrow
 
@@ -283,31 +280,28 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        alignx  16,7
 
 .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     ecx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
 .column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD
        jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
        add     edi, byte SIZEOF_MMWORD
        sub     ecx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
@@ -316,7 +310,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        ; space.
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st3
-       movd    DWORD [edi], xmmA
+       movd    XMM_DWORD [edi], xmmA
        add     edi, byte SIZEOF_DWORD
        sub     ecx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
@@ -336,47 +330,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        test    ecx, ecx
        jz      short .nextrow
        mov     BYTE [edi], al
-%else
-       mov     eax,ecx
-       xor     ecx, byte 0x0F
-       shl     ecx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     eax,ecx
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -421,19 +374,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .nextrow
 
@@ -444,30 +392,27 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        alignx  16,7
 
 .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
        cmp     ecx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_XMMWORD/8
        jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
        add     edi, byte SIZEOF_XMMWORD/8*4
        sub     ecx, byte SIZEOF_XMMWORD/8
        psrldq  xmmA, SIZEOF_XMMWORD/8*4
@@ -476,48 +421,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
        ; space.
        test    ecx, ecx
        jz      short .nextrow
-       movd    DWORD [edi], xmmA
-%else
-       cmp     ecx, byte SIZEOF_XMMWORD/16
-       jb      short .nextrow
-       mov     eax,ecx
-       xor     ecx, byte 0x03
-       inc     ecx
-       shl     ecx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [edi], xmmA
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
index a64a6b3..ffbf6b2 100644 (file)
@@ -1,8 +1,8 @@
 ;
 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -12,7 +12,7 @@
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; [TAB8]
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -275,31 +271,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     rcx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     rcx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_MMWORD
        jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_MMWORD
        sub     rcx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
@@ -308,7 +301,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        ; space.
        cmp     rcx, byte SIZEOF_DWORD
        jb      short .column_st3
-       movd    DWORD [rdi], xmmA
+       movd    XMM_DWORD [rdi], xmmA
        add     rdi, byte SIZEOF_DWORD
        sub     rcx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
@@ -328,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        test    rcx, rcx
        jz      short .endcolumn
        mov     BYTE [rdi], al
-%else
-       mov     rax,rcx
-       xor     rcx, byte 0x0F
-       shl     rcx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     rax,rcx
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -413,19 +365,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -438,30 +385,27 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
        cmp     rcx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_XMMWORD/8
        jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD/8*4
        sub     rcx, byte SIZEOF_XMMWORD/8
        psrldq  xmmA, SIZEOF_XMMWORD/8*4
@@ -470,48 +414,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        ; space.
        test    rcx, rcx
        jz      short .endcolumn
-       movd    DWORD [rdi], xmmA
-%else
-       cmp     rcx, byte SIZEOF_XMMWORD/16
-       jb      near .endcolumn
-       mov     rax,rcx
-       xor     rcx, byte 0x03
-       inc     rcx
-       shl     rcx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [rdi], xmmA
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
index 04089aa..6494340 100644 (file)
@@ -1,7 +1,8 @@
 ;
 ; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
 ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -264,17 +265,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -288,31 +285,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        alignx  16,7
 
 .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
        lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     ecx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
 .column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD
        jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
        add     edi, byte SIZEOF_MMWORD
        sub     ecx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
@@ -321,7 +315,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        ; space.
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st3
-       movd    DWORD [edi], xmmA
+       movd    XMM_DWORD [edi], xmmA
        add     edi, byte SIZEOF_DWORD
        sub     ecx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
@@ -341,47 +335,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        test    ecx, ecx
        jz      short .endcolumn
        mov     BYTE [edi], al
-%else
-       mov     eax,ecx
-       xor     ecx, byte 0x0F
-       shl     ecx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     eax,ecx
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -426,19 +379,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     ecx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -452,80 +400,36 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        alignx  16,7
 
 .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
        cmp     ecx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     ecx, byte SIZEOF_XMMWORD/2
 .column_st16:
        cmp     ecx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
        add     edi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     ecx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     ecx, byte SIZEOF_XMMWORD/8
        jb      short .column_st7
-       movq    MMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD/2
+       movq    XMM_MMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD/8*4
        sub     ecx, byte SIZEOF_XMMWORD/8
-       psrldq  xmmA, 64
+       psrldq  xmmA, SIZEOF_XMMWORD/8*4
 .column_st7:
        ; Store one pixel (4 bytes) of xmmA to the output when it has enough
        ; space.
        test    ecx, ecx
        jz      short .endcolumn
-       movd    DWORD [edi], xmmA
-%else
-       cmp     ecx, byte SIZEOF_XMMWORD/16
-       jb      short .endcolumn
-       mov     eax,ecx
-       xor     ecx, byte 0x03
-       inc     ecx
-       shl     ecx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [edi], xmmA
 
 %endif ; RGB_PIXELSIZE ; ---------------
 
index 6ee99cc..3d4751f 100644 (file)
@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                      JDIMENSION start_col,
index af0c2c8..cae84df 100644 (file)
@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
 {
   init_simd();
 
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                            JSAMPARRAY input_data,
                            JSAMPARRAY * output_data_ptr)
 {
+  if (simd_support & JSIMD_ARM_NEON)
+    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
 }
 
 GLOBAL(int)
index b2f9c2a..9962b8a 100644 (file)
@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
     .unreq          SHIFT
     .unreq          LOOP_COUNT
 .endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
+ *                                 JDIMENSION   downsampled_width,
+ *                                 JSAMPARRAY   input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ *       this code; solving it could potentially yield a performance
+ *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16   OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!      /* q0 = next 16 source bytes; INPTR += 16 */
+    vmovl.u8        q8,  d0              /* q8 = d0 (current pixels 0..7) widened to u16 */
+    vext.8          q2,  q1,  q0, #15    /* q2 = q1[15] followed by q0[0..14]: each pixel's left neighbour */
+    vmovl.u8        q9,  d1              /* q9 = d1 (current pixels 8..15) widened to u16 */
+    vaddw.u8        q10, q15, d4         /* q10 = q15 (bias) + left neighbours 0..7 */
+    vaddw.u8        q11, q15, d5         /* q11 = q15 (bias) + left neighbours 8..15 */
+    vmlal.u8        q8,  d4,  d28        /* q8  = current + left * d28 */
+    vmlal.u8        q9,  d5,  d28        /* q9  = current + left * d28 (upper half) */
+    vmlal.u8        q10, d0,  d28        /* q10 = bias + left + current * d28 */
+    vmlal.u8        q11, d1,  d28        /* q11 = bias + left + current * d28 (upper half) */
+    vmov            q1,  q0       /* backup source pixels to q1 */
+    vrshrn.u16      d6,  q8,  #2         /* rounding shift right by 2, narrowed back to u8 */
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2         /* truncating shift: the q15 bias was already added */
+    vshrn.u16       d9,  q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* store 32 bytes, interleaving d6:d7 with d8:d9; OUTPTR += 32 */
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the
+ * 'upsample16' macro, the roles of the q0 and q1 registers are reversed for
+ * even and odd groups of 16 pixels, which is why the "vmov q1, q0" instruction
+ * is not needed. This unrolling also allows loads and stores to be reordered
+ * to compensate for multiplication latency and reduce stalls.
+ */
+.macro upsample32   OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!      /* q0 = 16 new source bytes; INPTR += 16 */
+    vmovl.u8        q8,  d0              /* widen current pixels to u16 */
+    vext.8          q2,  q1,  q0, #15    /* q2 = q1[15] followed by q0[0..14] (left neighbours) */
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4         /* q10/q11 = q15 (bias) + left neighbours */
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28        /* q8/q9 = current + left * d28 */
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28        /* q10/q11 = bias + left + current * d28 */
+    vmlal.u8        q11, d1,  d28
+        /* odd 16 pixels group */
+        vld1.8          {q1}, [\INPTR]!  /* q1 = next 16 source bytes; q0 now acts as the previous group */
+    vrshrn.u16      d6,  q8,  #2         /* even group: rounding shift and narrow to u8 */
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2         /* even group: truncating shift (bias already added) */
+    vshrn.u16       d9,  q11, #2
+        vmovl.u8        q8,  d2          /* q8-q11 are reused now that the even results sit in d6-d9 */
+        vext.8          q2,  q0,  q1, #15  /* q2 = q0[15] followed by q1[0..14] */
+        vmovl.u8        q9,  d3
+        vaddw.u8        q10, q15, d4
+        vaddw.u8        q11, q15, d5
+        vmlal.u8        q8,  d4,  d28
+        vmlal.u8        q9,  d5,  d28
+        vmlal.u8        q10, d2,  d28
+        vmlal.u8        q11, d3,  d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* store even group (32 bytes) before d6-d9 are overwritten */
+        vrshrn.u16      d6,  q8,  #2     /* odd group: narrow into the same d6-d9 registers */
+        vrshrn.u16      d7,  q9,  #2
+        vshrn.u16       d8,  q10, #2
+        vshrn.u16       d9,  q11, #2
+        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!  /* store odd group (32 bytes); q1 holds the newest pixels on exit */
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1           /* WIDTH = pixel count - 1 */
+    add             \OUTPTR, \OUTPTR, #1         /* OUTPTR = row start + 1 */
+    ldrb            \TMP1, [\INPTR, \WIDTH]      /* TMP1 = last source pixel */
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]  /* copy it unfiltered to row start + 2 * count - 1 */
+    ldrb            \TMP1, [\INPTR], #1          /* TMP1 = first source pixel; INPTR += 1 */
+    strb            \TMP1, [\OUTPTR, #-1]        /* copy it unfiltered to the first output byte */
+    vmov.8          d3[7], \TMP1                 /* seed q1[15]: left neighbour for the first vector group */
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f                           /* fewer than 32 pixels left: skip the unrolled loop */
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16          /* WIDTH = remaining pixels - 16 */
+    blt             1f                           /* fewer than 16 pixels left */
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16          /* WIDTH = remaining pixels (0..15) on both paths */
+    beq             9f                           /* nothing left: row is done */
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH       /* point one past the tail; the loads below walk backwards */
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6               /* rotate d0: lanes 0..5 move up to lanes 2..7 */
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0                       /* swap 32-bit halves: lanes 0..3 move to lanes 4..7 */
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1,  d0                      /* pixels gathered so far become the upper half of q0 */
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]               /* q0 lanes 0..WIDTH-1 now hold the tail in source order */
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8,  d0                      /* same arithmetic as upsample16; lanes past the tail are not stored */
+    vext.8          q2,  q1,  q0, #15
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28
+    vmlal.u8        q11, d1,  d28
+    vrshrn.u16      d10, q8,  #2
+    vrshrn.u16      d12, q9,  #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11                     /* interleave in registers: d10:d11 = 16 output bytes for source lanes 0..7 */
+    vzip.8          d12, d13                     /* d12:d13 = 16 output bytes for source lanes 8..15 */
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!       /* 8 source pixels -> 16 output bytes */
+    vmov            q5,  q6                      /* shift the next results (d12:d13) down into d10:d11 */
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!            /* 4 source pixels -> 8 output bytes */
+    vmov            d10,  d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!         /* 2 source pixels -> 4 output bytes, stored one byte at a time */
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4            /* rotate lanes 4..7 down to lanes 0..3 */
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!         /* 1 source pixel -> 2 output bytes */
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0    /* 1st argument; counts down as the row loop counter */
+    DOWNSAMPLED_WIDTH .req r1    /* 2nd argument; source pixels per row */
+    INPUT_DATA        .req r2    /* 3rd argument; walks the array of input row pointers */
+    OUTPUT_DATA_PTR   .req r3    /* 4th argument; points at the output row-pointer array pointer */
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR    /* alias of r3, valid once it has been dereferenced */
+
+    OUTPTR            .req r4    /* current output row */
+    INPTR             .req r5    /* current input row */
+    WIDTH             .req ip    /* per-row working copy of DOWNSAMPLED_WIDTH (upsample_row modifies it) */
+    TMP               .req lr    /* scratch register handed to upsample_row */
+
+    push            {r4, r5, r6, lr}    /* r6 is never referenced below; presumably pushed only to keep the stack 8-byte aligned -- TODO confirm */
+    vpush           {d8-d15}            /* the upsample macros write d8-d13 */
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]    /* r3 = *output_data_ptr */
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f                 /* no rows to process */
+
+    /* initialize constants */
+    vmov.u8         d28, #3             /* multiplier used by the vmlal.u8 instructions */
+    vmov.u16        q15, #1             /* +1 bias added before the truncating shifts */
+11:
+    ldr             INPTR, [INPUT_DATA], #4      /* fetch next input row pointer, advance by one 4-byte pointer */
+    ldr             OUTPTR, [OUTPUT_DATA], #4    /* fetch next output row pointer, advance by one 4-byte pointer */
+    mov             WIDTH, DOWNSAMPLED_WIDTH
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b                 /* loop until every row has been processed */
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}    /* restore registers; popping the saved lr into pc returns */
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
index 0dacd06..583b7e3 100644 (file)
 #include "../jmorecfg.h"
 #include "jsimd.h"
 
-#define define(var) %define _cpp_protection_##var
-#define definev(var) %define _cpp_protection_##var var
-
 ;
 ; -- jpeglib.h
 ;
 
-definev(DCTSIZE)
-definev(DCTSIZE2)
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
 
 ;
 ; -- jmorecfg.h
 ;
 
-definev(RGB_RED)
-definev(RGB_GREEN)
-definev(RGB_BLUE)
-definev(RGB_PIXELSIZE)
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
 
-definev(EXT_RGB_RED)
-definev(EXT_RGB_GREEN)
-definev(EXT_RGB_BLUE)
-definev(EXT_RGB_PIXELSIZE)
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
 
-definev(EXT_RGBX_RED)
-definev(EXT_RGBX_GREEN)
-definev(EXT_RGBX_BLUE)
-definev(EXT_RGBX_PIXELSIZE)
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
 
-definev(EXT_BGR_RED)
-definev(EXT_BGR_GREEN)
-definev(EXT_BGR_BLUE)
-definev(EXT_BGR_PIXELSIZE)
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
 
-definev(EXT_BGRX_RED)
-definev(EXT_BGRX_GREEN)
-definev(EXT_BGRX_BLUE)
-definev(EXT_BGRX_PIXELSIZE)
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
 
-definev(EXT_XBGR_RED)
-definev(EXT_XBGR_GREEN)
-definev(EXT_XBGR_BLUE)
-definev(EXT_XBGR_PIXELSIZE)
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
 
-definev(EXT_XRGB_RED)
-definev(EXT_XRGB_GREEN)
-definev(EXT_XRGB_BLUE)
-definev(EXT_XRGB_PIXELSIZE)
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
 
 %define RGBX_FILLER_0XFF        1
 
@@ -73,7 +70,7 @@ definev(EXT_XRGB_PIXELSIZE)
 %define JSAMPLE                 byte          ; unsigned char
 %define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
 
-definev(CENTERJSAMPLE)
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
 
 ; Representation of a DCT frequency coefficient.
 ; On this SIMD implementation, this must be 'short'.
@@ -126,74 +123,74 @@ definev(CENTERJSAMPLE)
 ; -- jsimd.h
 ;
 
-definev(JSIMD_NONE)
-definev(JSIMD_MMX)
-definev(JSIMD_3DNOW)
-definev(JSIMD_SSE)
-definev(JSIMD_SSE2)
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
 
 ; Short forms of external names for systems with brain-damaged linkers.
 ;
 #ifdef NEED_SHORT_EXTERNAL_NAMES
-definev(jpeg_simd_cpu_support)
-definev(jsimd_rgb_ycc_convert_mmx)
-definev(jsimd_ycc_rgb_convert_mmx)
-definev(jconst_rgb_ycc_convert_sse2)
-definev(jsimd_rgb_ycc_convert_sse2)
-definev(jconst_ycc_rgb_convert_sse2)
-definev(jsimd_ycc_rgb_convert_sse2)
-definev(jsimd_h2v2_downsample_mmx)
-definev(jsimd_h2v1_downsample_mmx)
-definev(jsimd_h2v2_downsample_sse2)
-definev(jsimd_h2v1_downsample_sse2)
-definev(jsimd_h2v2_upsample_mmx)
-definev(jsimd_h2v1_upsample_mmx)
-definev(jsimd_h2v1_fancy_upsample_mmx)
-definev(jsimd_h2v2_fancy_upsample_mmx)
-definev(jsimd_h2v1_merged_upsample_mmx)
-definev(jsimd_h2v2_merged_upsample_mmx)
-definev(jsimd_h2v2_upsample_sse2)
-definev(jsimd_h2v1_upsample_sse2)
-definev(jconst_fancy_upsample_sse2)
-definev(jsimd_h2v1_fancy_upsample_sse2)
-definev(jsimd_h2v2_fancy_upsample_sse2)
-definev(jconst_merged_upsample_sse2)
-definev(jsimd_h2v1_merged_upsample_sse2)
-definev(jsimd_h2v2_merged_upsample_sse2)
-definev(jsimd_convsamp_mmx)
-definev(jsimd_convsamp_sse2)
-definev(jsimd_convsamp_float_3dnow)
-definev(jsimd_convsamp_float_sse)
-definev(jsimd_convsamp_float_sse2)
-definev(jsimd_fdct_islow_mmx)
-definev(jsimd_fdct_ifast_mmx)
-definev(jconst_fdct_islow_sse2)
-definev(jsimd_fdct_islow_sse2)
-definev(jconst_fdct_ifast_sse2)
-definev(jsimd_fdct_ifast_sse2)
-definev(jsimd_fdct_float_3dnow)
-definev(jconst_fdct_float_sse)
-definev(jsimd_fdct_float_sse)
-definev(jsimd_quantize_mmx)
-definev(jsimd_quantize_sse2)
-definev(jsimd_quantize_float_3dnow)
-definev(jsimd_quantize_float_sse)
-definev(jsimd_quantize_float_sse2)
-definev(jsimd_idct_2x2_mmx)
-definev(jsimd_idct_4x4_mmx)
-definev(jconst_idct_red_sse2)
-definev(jsimd_idct_2x2_sse2)
-definev(jsimd_idct_4x4_sse2)
-definev(jsimd_idct_islow_mmx)
-definev(jsimd_idct_ifast_mmx)
-definev(jconst_idct_islow_sse2)
-definev(jsimd_idct_islow_sse2)
-definev(jconst_idct_ifast_sse2)
-definev(jsimd_idct_ifast_sse2)
-definev(jsimd_idct_float_3dnow)
-definev(jconst_idct_float_sse)
-definev(jsimd_idct_float_sse)
-definev(jconst_idct_float_sse2)
-definev(jsimd_idct_float_sse2)
+%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
+%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
+%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
+%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
+%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
+%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
+%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
+%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
+%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
+%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
+%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
+%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
+%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
+%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
+%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
+%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
+%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
+%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
+%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
+%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
+%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
+%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
+%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
+%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
+%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
+%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
+%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
+%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
+%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
+%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
+%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
+%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
+%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
+%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
+%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
+%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
+%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
+%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
+%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
+%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
+%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
+%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
+%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
 #endif /* NEED_SHORT_EXTERNAL_NAMES */
 
index 4ab9bc0..253b897 100644 (file)
@@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
 %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
 %endif
 
-%define STRICT_MEMORY_ACCESS 1
-
 ; To make the code position-independent, append -DPIC to the commandline
 ;
 %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_      ; ELF supports PIC
@@ -322,15 +320,15 @@ const_base:
        push rsi
        push rdi
        sub     rsp, SIZEOF_XMMWORD
-       movlpd  XMMWORD [rsp], xmm6
+       movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
-       movlpd  XMMWORD [rsp], xmm7
+       movaps  XMMWORD [rsp], xmm7
 %endmacro
 
 %imacro uncollect_args 0
-       movlpd  xmm7, XMMWORD [rsp]
+       movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
-       movlpd  xmm6, XMMWORD [rsp]
+       movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop rdi
        pop rsi