tizen 2.3.1 release

author jk7744.park <jk7744.park@samsung.com>

Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)

committer jk7744.park <jk7744.park@samsung.com>

Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
author jk7744.park <jk7744.park@samsung.com>
Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
committer jk7744.park <jk7744.park@samsung.com>
Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
diff --git a/COPYING b/COPYING

new file mode 100644 (file)

index 0000000..d8c7ce7
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,31 @@
+*******************************************************************************
+**     License
+*******************************************************************************
+
+Most of libjpeg-turbo inherits the non-restrictive, BSD-style license used by
+libjpeg (see README.)  The TurboJPEG/OSS wrapper (both C and Java versions) and
+associated test programs bear a similar license, which is reproduced below:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/Makefile.am b/Makefile.am

index 456bb1f..3434b10 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,10 +1,8 @@
  lib_LTLIBRARIES = libjpeg.la libturbojpeg.la
  libjpeg_la_LDFLAGS = -version-info ${SO_MAJOR_VERSION}:${SO_MINOR_VERSION} -no-undefined
  libturbojpeg_la_LDFLAGS = -avoid-version -no-undefined
-include_HEADERS = turbojpeg.h
-installheaderjpeglibdir = $(includedir)/turbojpeg
-installheaderjpeglib_HEADERS = jpeglib.h jerror.h jmorecfg.h jconfig.h
-#nodist_include_HEADERS = jconfig.h
+include_HEADERS = jerror.h jmorecfg.h jpeglib.h turbojpeg.h
+nodist_include_HEADERS = jconfig.h
  
  HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
         jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h
@@ -18,7 +16,6 @@ libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
         jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
         jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
  
-
  if WITH_ARITH
  
  libjpeg_la_SOURCES += jaricom.c
@@ -96,7 +93,8 @@ cjpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c cjpeg.c rdbmp.c rdgif.c \
  cjpeg_LDADD = libjpeg.la
  
  cjpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
-       -DTARGA_SUPPORTED
+       -DTARGA_SUPPORTED \
+       -fPIE -pie
  
  djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
         wrbmp.c wrgif.c wrppm.c wrtarga.c
@@ -104,7 +102,8 @@ djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
  djpeg_LDADD = libjpeg.la
  
  djpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
-       -DTARGA_SUPPORTED
+       -DTARGA_SUPPORTED \
+        -fPIE -pie
  
  jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c transupp.h
  
diff --git a/configure.ac b/configure.ac

index d7b249b..22c6c20 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -159,7 +159,7 @@ EOF
  AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
    [VERSION_SCRIPT_FLAG=-Wl,--version-script,; AC_MSG_RESULT([yes (GNU style)])], [])
  if test "x$VERSION_SCRIPT_FLAG" = "x"; then
-  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map"
+  LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map -pie"
    AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
      [VERSION_SCRIPT_FLAG=-Wl,-M,; AC_MSG_RESULT([yes (Sun style)])], [])
  fi
diff --git a/debian/changelog b/debian/changelog

index 3c15320..3efa3fa 100644 (file)
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,51 @@
+libjpeg-turbo (1.2.0-8) unstable; urgency=low
+
+  * [TREL] Install djpeg, cjpeg
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-8
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Fri, 05 Apr 2013 14:10:35 +0900
+
+libjpeg-turbo (1.2.0-7) unstable; urgency=low
+
+  * Enable simd for i586 (including IA)
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-7
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Mon, 07 Jan 2013 15:14:52 +0900
+
+libjpeg-turbo (1.2.0-6) unstable; urgency=low
+
+  * Enable libjpeg package of libjpeg-turbo
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-6
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 25 Oct 2012 01:09:00 +0900
+
+libjpeg-turbo (1.2.0-5) unstable; urgency=low
+
+  * License file copied to /usr/share/license/
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-5
+
+ -- YoungHun Kim <yh8004.kim@ysamsung.com>  Fri, 12 Oct 2012 18:13:55 +0900
+
+libjpeg-turbo (1.2.0-4) unstable; urgency=low
+
+  * Fix CVE-2012-2806
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-4
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 04 Oct 2012 16:23:12 +0900
+
+libjpeg-turbo (1.2.0-3) unstable; urgency=low
+
+  * Add manifest file
+  * Git: external/libjpeg-turbo
+  * Tag: libjpeg-turbo_1.2.0-3
+
+ -- YoungHun Kim <yh8004.kim@samsung.com>  Thu, 20 Sep 2012 20:57:44 +0900
+
  libjpeg-turbo (1.2.0-2) unstable; urgency=low
  
    * Tag [Version] 1.2.0-2
diff --git a/debian/copyright b/debian/copyright

new file mode 100644 (file)

index 0000000..8452ed9
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,111 @@
+Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Name: libjpeg-turbo
+Source: lp:libjpeg-turbo
+
+Files: *
+Copyright: 1999-2006 MIYASAKA Masaru 
+    2004 Landmark Graphics Corporation
+    2005-2007 Sun Microsystems, Inc.
+    2009 Pierre Ossman for Cendio AB
+    2009-2010 D. R. Commander
+    2010 Thomas G. Lane, Guido Vollbeding
+    2009, Thomas G. Lane, Guido Vollbeding
+    1998, Thomas G. Lane
+    2010 Nokia Corporation
+License:  JPEG
+ .
+ In plain English:
+ .
+ 1. We don't promise that this software works.  (But if you find any bugs,
+   please let us know!)
+ 2. You can use this software for whatever you want.  You don't have to pay us.
+ 3. You may not pretend that you wrote this software.  If you use it in a
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
+ .
+ In legalese:
+ The authors make NO WARRANTY or representation, either express or implied,
+ with respect to this software, its quality, accuracy, merchantability, or
+ fitness for a particular purpose.  This software is provided "AS IS", and you,
+ its user, assume the entire risk as to its quality and accuracy.
+ .
+ This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ All Rights Reserved except as specified below.
+ .
+ Permission is hereby granted to use, copy, modify, and distribute this
+ software (or portions thereof) for any purpose, without fee, subject to these
+ conditions:
+ (1) If any part of the source code for this software is distributed, then this
+ README file must be included, with this copyright and no-warranty notice
+ unaltered; and any additions, deletions, or changes to the original files
+ must be clearly indicated in accompanying documentation.
+ (2) If only executable code is distributed, then the accompanying
+ documentation must state that "this software is based in part on the work of
+ the Independent JPEG Group".
+ (3) Permission for use of this software is granted only if the user accepts
+ full responsibility for any undesirable consequences; the authors accept
+ NO LIABILITY for damages of any kind.
+ .
+ These conditions apply to any software derived from or based on the IJG code,
+ not just to the unmodified library.  If you use our work, you ought to
+ acknowledge us.
+ .
+ Permission is NOT granted for the use of any IJG author's name or company name
+ in advertising or publicity relating to this software or products derived from
+ it.  This software may be referred to only as "the Independent JPEG Group's
+ software".
+ .
+ We specifically permit and encourage the use of this software as the basis of
+ commercial products, provided that all warranty or liability claims are
+ assumed by the product vendor.
+ .
+ .
+ ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
+ sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA
+ .
+ ansi2knr.c is NOT covered by the above copyright and conditions, but instead
+ by the usual distribution terms of the Free Software Foundation; principally,
+ that you must include source code if you redistribute it.  (See the file
+ ansi2knr.c for full details.)  However, since ansi2knr.c is not needed as part
+ of any program generated from the IJG code, this does not limit you more than
+ the foregoing paragraphs do.
+ .
+ The Unix configuration script "configure" was produced with GNU Autoconf.
+ It is copyright by the Free Software Foundation but is freely distributable.
+ The same holds for its supporting scripts (config.guess, config.sub,
+ ltmain.sh).  Another support script, install-sh, is copyright by X Consortium
+ but is also freely distributable.
+ .
+ The IJG distribution formerly included code to read and write GIF files.
+ To avoid entanglement with the Unisys LZW patent, GIF reading support has
+ been removed altogether, and the GIF writer has been simplified to produce
+ "uncompressed GIFs".  This technique does not use the LZW algorithm; the
+ resulting GIF files are larger than usual, but are readable by all standard
+ GIF decoders.
+ .
+ We are required to state that
+    "The Graphics Interchange Format(c) is the Copyright property of
+    CompuServe Incorporated.  GIF(sm) is a Service Mark property of
+    CompuServe Incorporated."
+
+Files: debian/*
+Copyright: 2010, 2011 Linaro Limited
+License: LGPL-2.1
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License (LGPL) as published by the Free Software Foundation;
+ either version 2 of the License, or (at your option) any later
+ version.
+ .
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Library General Public License for more details.
+ .
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB.  If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+ .
+ On Debian systems, the complete text of the GNU Library General Public
+ License, version 2, can be found in /usr/share/common-licenses/LGPL-2.1.
diff --git a/debian/libjpeg-turbo-dev.files b/debian/libjpeg-turbo-dev.files

index 826e558..0c22589 100644 (file)
--- a/debian/libjpeg-turbo-dev.files
+++ b/debian/libjpeg-turbo-dev.files
@@ -1,8 +1,2 @@
-usr/include/turbojpeg.h
-usr/include/turbojpeg/jpeglib.h
-usr/include/turbojpeg/jerror.h
-usr/include/turbojpeg/jmorecfg.h
-usr/include/turbojpeg/jconfig.h
+usr/include/*.h
  usr/lib/pkgconfig/*
-usr/lib/libturbojpeg.a
-usr/lib/libturbojpeg.la
diff --git a/debian/libjpeg-turbo.files b/debian/libjpeg-turbo.files

index f3d5763..5fae0b0 100644 (file)
--- a/debian/libjpeg-turbo.files
+++ b/debian/libjpeg-turbo.files
@@ -1 +1,2 @@
  usr/lib/libturbojpeg.so*
+usr/lib/libjpeg.so*
diff --git a/jdapimin.c b/jdapimin.c

index cadb59f..4061a3d 100644 (file)
--- a/jdapimin.c
+++ b/jdapimin.c
@@ -78,6 +78,18 @@ jpeg_CreateDecompress (j_decompress_ptr cinfo, int version, size_t structsize)
    /* And initialize the overall input controller. */
    jinit_input_controller(cinfo);
  
+  /* Init region to decode to be empty */
+  cinfo->region_x = 0;
+  cinfo->region_y = 0;
+  cinfo->region_w = 0;
+  cinfo->region_h = 0;
+
+  /* Init region to decode to be empty */
+  cinfo->region_x = 0;
+  cinfo->region_y = 0;
+  cinfo->region_w = 0;
+  cinfo->region_h = 0;
+
    /* OK, I'm ready */
    cinfo->global_state = DSTATE_START;
  }
diff --git a/jdcoefct.c b/jdcoefct.c

index 48a9fc6..304fd5a 100644 (file)
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -160,12 +160,45 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
    JDIMENSION start_col, output_col;
    jpeg_component_info *compptr;
    inverse_DCT_method_ptr inverse_DCT;
+  /* region decoding. this limits decode to the set of blocks +- 1 outside
+   * bounding blocks around the desired region to decode */
+  int blk1 = 0, blk2 = 0, skip = 0;
+
+  if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+    int bsz_w = 0, bsz_h = 0;
+
+    for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+      compptr = cinfo->cur_comp_info[ci];
+      if (compptr->MCU_sample_width > bsz_w)
+        bsz_w = compptr->MCU_sample_width;
+      if ((compptr->MCU_height * 8) > bsz_h)
+        bsz_h = compptr->MCU_height * 8;
+    }
+    int _region_y = (int)cinfo->region_y;
+    _region_y = (_region_y>>1)<<1;
+    if (((int)cinfo->output_scanline < (_region_y - bsz_h - 1)) ||
+        ((int)cinfo->output_scanline > (_region_y + cinfo->region_h + bsz_h)))
+      skip = 1;
+    blk1 = (cinfo->region_x / bsz_w) - 1;
+    if (blk1 < 0) blk1 = 0;
+    blk2 = ((cinfo->region_x + cinfo->region_w + bsz_w - 1) / bsz_w) + 1;
+    if (blk2 < 0) blk2 = 0;
+  }
  
    /* Loop to process as much as one whole iMCU row */
    for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
         yoffset++) {
      for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
          MCU_col_num++) {
+      /* see if we need to skip this MCU or not */
+      if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+        if (!((MCU_col_num < blk1) || (MCU_col_num > blk2) || skip))
+          skip = 0;
+      }
+      /* if we are not skipping this MCU, zero it ready for huffman decode */
+      if (!skip)
+        jzero_far((void FAR *) coef->MCU_buffer[0],
+                  (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
        /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
        jzero_far((void FAR *) coef->MCU_buffer[0],
                 (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
@@ -175,6 +208,10 @@ decompress_onepass (j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         coef->MCU_ctr = MCU_col_num;
         return JPEG_SUSPENDED;
        }
+      /* region decoding. this limits decode to the set of blocks +- 1 outside
+       * bounding blocks around the desired region to decode */
+      if (skip)
+        continue;
        /* Determine where data should go in output_buf and do the IDCT thing.
         * We skip dummy blocks at the right and bottom edges (but blkn gets
         * incremented past them!).  Note the inner loop relies on having
diff --git a/jdmarker.c b/jdmarker.c

index d8dcba9..6fc0f7d 100644 (file)
--- a/jdmarker.c
+++ b/jdmarker.c
@@ -323,14 +323,15 @@ get_sos (j_decompress_ptr cinfo)
  
    /* Collect the component-spec parameters */
  
-  for (i = 0; i < cinfo->num_components; i++)
+  for (i = 0; i < MAX_COMPS_IN_SCAN; i++)
      cinfo->cur_comp_info[i] = NULL;
  
    for (i = 0; i < n; i++) {
      INPUT_BYTE(cinfo, cc, return FALSE);
      INPUT_BYTE(cinfo, c, return FALSE);
      
-    for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+    for (ci = 0, compptr = cinfo->comp_info;
+        ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
          ci++, compptr++) {
        if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
         goto id_found;
diff --git a/jdmerge.c b/jdmerge.c

index cfa3bb9..6ecf93b 100644 (file)
--- a/jdmerge.c
+++ b/jdmerge.c
@@ -248,6 +248,7 @@ merged_2v_upsample (j_decompress_ptr cinfo,
    my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
    JSAMPROW work_ptrs[2];
    JDIMENSION num_rows;         /* number of rows returned to caller */
+  int skip = 0;
  
    if (upsample->spare_full) {
      /* If we have a spare row saved from a previous cycle, just return it. */
@@ -256,6 +257,13 @@ merged_2v_upsample (j_decompress_ptr cinfo,
      num_rows = 1;
      upsample->spare_full = FALSE;
    } else {
+    int _region_y = (int)cinfo->region_y;
+    _region_y = (_region_y>>1)<<1;
+    if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+       if (((int)cinfo->output_scanline < _region_y) ||
+           ((int)cinfo->output_scanline >= (_region_y + (int)cinfo->region_h)))
+         skip = 1;
+    }
      /* Figure number of rows to return to caller. */
      num_rows = 2;
      /* Not more than the distance to the end of the image. */
@@ -274,7 +282,8 @@ merged_2v_upsample (j_decompress_ptr cinfo,
        upsample->spare_full = TRUE;
      }
      /* Now do the upsampling. */
-    (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
+    if (!skip)
+      (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
    }
  
    /* Adjust counts */
diff --git a/jpeglib.h b/jpeglib.h

index d19a3ef..0f53709 100644 (file)
--- a/jpeglib.h
+++ b/jpeglib.h
@@ -491,6 +491,8 @@ struct jpeg_decompress_struct {
  
    unsigned int scale_num, scale_denom; /* fraction by which to scale image */
  
+  unsigned int region_x, region_y, region_w, region_h; /* if region_w && region_h > 0, then use this region to decode. scale above is done prior to region select */
+
    double output_gamma;         /* image gamma wanted in output */
  
    boolean buffered_image;      /* TRUE=multiple output passes */
diff --git a/libjpeg-turbo.manifest b/libjpeg-turbo.manifest

new file mode 100755 (executable)

index 0000000..d1b4247
--- /dev/null
+++ b/libjpeg-turbo.manifest
@@ -0,0 +1,12 @@
+<manifest>
+        <define>
+                <domain name="libjpeg-turbo"/>
+        </define>
+        <request>
+                <domain name="_" />
+        </request>
+        <assign>
+                <filesystem path="/usr/bin/cjpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+                <filesystem path="/usr/bin/djpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+        </assign>
+</manifest>
diff --git a/packaging/libjpeg-turbo.spec b/packaging/libjpeg-turbo.spec

index 588a180..60e14d1 100644 (file)
--- a/packaging/libjpeg-turbo.spec
+++ b/packaging/libjpeg-turbo.spec
@@ -1,9 +1,9 @@
  Name:           libjpeg-turbo
-License:        BSD3c(or similar)
+License:        BSD-2.0
  Group:          Productivity/Graphics/Convertors
  AutoReqProv:    on
  Version:       1.2.0
-Release:        2
+Release:        12
  Summary:        A MMX/SSE2 accelerated library for manipulating JPEG image files
  Url:            http://sourceforge.net/projects/libjpeg-turbo
  Source0:        %{name}-%{version}.tar.gz
@@ -13,25 +13,28 @@ The libjpeg-turbo package contains a library of functions for manipulating
  JPEG images.
  
  %package devel
-
-License:        BSD3c(or similar)
  Summary:        Developement files for libjpeg-turbo contains a wrapper library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
  Group:          Development/Libraries/C and C++
  Requires:       %{name} = %{version}-%{release}
-
+Provides:      libjpeg-devel
+%ifarch %{ix86}
+BuildRequires:  nasm
+%endif
  %description devel
-The libjpeg-turbo shared libraries can be used as drop-in replacements for libjpeg on most systems
+The libjpeg-devel package includes the header files and documentation
+necessary for developing programs which will manipulate JPEG files using
+the libjpeg library.
+
+If you are going to develop programs which will manipulate JPEG images,
+you should install libjpeg-devel.  You'll also need to have the libjpeg
+package installed.
  
  %prep
  %setup -q 
  
  %build
  autoreconf -fiv
-%ifarch %{arm}
-%configure --disable-static --with-jpeg8
-%else
-%configure --disable-static --with-jpeg8 --without-simd
-%endif
+%configure --enable-shared --disable-static --with-jpeg8
  make %{?_smp_mflags}
  
  #%check
@@ -39,6 +42,8 @@ make %{?_smp_mflags}
  
  %install
  %makeinstall
+mkdir -p %{buildroot}/usr/share/license
+cp COPYING %{buildroot}/usr/share/license/%{name}
  # Fix perms
  chmod -x README-turbo.txt release/copyright
  
@@ -50,27 +55,27 @@ rm -rf $RPM_BUILD_ROOT
  %postun  -p /sbin/ldconfig
  
  %files
+/usr/share/license/%{name}
+%manifest libjpeg-turbo.manifest
  %defattr(-,root,root)
  %{_libdir}/libturbojpeg.so
+%{_libdir}/libjpeg.so.*
+%{_bindir}/cjpeg
+%{_bindir}/djpeg
  %exclude %{_datadir}/man/man1/*
  %exclude %{_datadir}/doc/
-%exclude %{_bindir}/cjpeg
-%exclude %{_bindir}/djpeg
+#%exclude %{_bindir}/cjpeg
+#%exclude %{_bindir}/djpeg
  %exclude %{_bindir}/jpegtran
  %exclude %{_bindir}/rdjpgcom
  %exclude %{_bindir}/tjbench
  %exclude %{_bindir}/wrjpgcom
-%exclude %{_libdir}/libjpeg.so.*
  
  
  %files devel
  %defattr(-,root,root)
-%{_includedir}/turbojpeg.h
-%exclude %{_libdir}/libjpeg.so
-%{_includedir}/turbojpeg/jpeglib.h
-%{_includedir}/turbojpeg/jerror.h
-%{_includedir}/turbojpeg/jmorecfg.h
-%{_includedir}/turbojpeg/jconfig.h
+%{_libdir}/libjpeg.so
+%{_includedir}/*.h
  %{_libdir}/pkgconfig/turbojpeg.pc
  %exclude %{_libdir}/libjpeg.la
  %exclude %{_libdir}/libturbojpeg.la
diff --git a/pkgconfig/turbojpeg.pc.in b/pkgconfig/turbojpeg.pc.in

index 123e590..df77158 100755 (executable)
--- a/pkgconfig/turbojpeg.pc.in
+++ b/pkgconfig/turbojpeg.pc.in
@@ -1,7 +1,7 @@
  prefix=/usr
  exec_prefix=${prefix}
  libdir=${exec_prefix}/lib
-includedir=${prefix}/include/turbojpeg
+includedir=${prefix}/include
  
  Name: libturbojpeg
  Description: Loads and saves jpeg
diff --git a/simd/jdclrss2-64.asm b/simd/jdclrss2-64.asm

index 696a383..7d17c52 100644 (file)
--- a/simd/jdclrss2-64.asm
+++ b/simd/jdclrss2-64.asm
@@ -1,8 +1,8 @@
  ;
  ; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
  ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
  ;
  ; Based on
  ; x86 SIMD extension for IJG JPEG library
@@ -251,17 +251,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
  .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     rcx, byte SIZEOF_XMMWORD
         jz      near .nextrow
  
@@ -271,31 +267,28 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         jmp     near .columnloop
  
  .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
         lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
         cmp     rcx, byte 2*SIZEOF_XMMWORD
         jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmF
         sub     rcx, byte 2*SIZEOF_XMMWORD
         jmp     short .column_st15
  .column_st16:
         cmp     rcx, byte SIZEOF_XMMWORD
         jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         add     rdi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     rcx, byte SIZEOF_XMMWORD
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store the lower 8 bytes of xmmA to the output when it has enough
         ; space.
         cmp     rcx, byte SIZEOF_MMWORD
         jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
         add     rdi, byte SIZEOF_MMWORD
         sub     rcx, byte SIZEOF_MMWORD
         psrldq  xmmA, SIZEOF_MMWORD
@@ -304,7 +297,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         ; space.
         cmp     rcx, byte SIZEOF_DWORD
         jb      short .column_st3
-       movd    DWORD [rdi], xmmA
+       movd    XMM_DWORD [rdi], xmmA
         add     rdi, byte SIZEOF_DWORD
         sub     rcx, byte SIZEOF_DWORD
         psrldq  xmmA, SIZEOF_DWORD
@@ -324,47 +317,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         test    rcx, rcx
         jz      short .nextrow
         mov     BYTE [rdi], al
-%else
-       mov     rax,rcx
-       xor     rcx, byte 0x0F
-       shl     rcx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     rax,rcx
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,rcx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
  
  %else ; RGB_PIXELSIZE == 4 ; -----------
  
@@ -409,19 +361,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
         movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
  .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     rcx, byte SIZEOF_XMMWORD
         jz      near .nextrow
  
@@ -431,25 +378,22 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         jmp     near .columnloop
  
  .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
         cmp     rcx, byte SIZEOF_XMMWORD/2
         jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmC
         movdqa  xmmD,xmmH
         sub     rcx, byte SIZEOF_XMMWORD/2
  .column_st16:
         cmp     rcx, byte SIZEOF_XMMWORD/4
         jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         add     rdi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     rcx, byte SIZEOF_XMMWORD/4
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store two pixels (8 bytes) of xmmA to the output when it has enough
         ; space.
         cmp     rcx, byte SIZEOF_XMMWORD/8
@@ -463,48 +407,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         ; space.
         test    rcx, rcx
         jz      short .nextrow
-       movd    DWORD [rdi], xmmA
-%else
-       cmp     rcx, byte SIZEOF_XMMWORD/16
-       jb      near .nextrow
-       mov     rax,rcx
-       xor     rcx, byte 0x03
-       inc     rcx
-       shl     rcx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [rdi], xmmA
  
  %endif ; RGB_PIXELSIZE ; ---------------
  
diff --git a/simd/jdclrss2.asm b/simd/jdclrss2.asm

index 7f519e6..97754cb 100644 (file)
--- a/simd/jdclrss2.asm
+++ b/simd/jdclrss2.asm
@@ -1,7 +1,8 @@
  ;
  ; jdclrss2.asm - colorspace conversion (SSE2)
  ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
  ;
  ; Based on
  ; x86 SIMD extension for IJG JPEG library
@@ -262,17 +263,13 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
  .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     ecx, byte SIZEOF_XMMWORD
         jz      near .nextrow
  
@@ -283,31 +280,28 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         alignx  16,7
  
  .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
         lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
         cmp     ecx, byte 2*SIZEOF_XMMWORD
         jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmF
         sub     ecx, byte 2*SIZEOF_XMMWORD
         jmp     short .column_st15
  .column_st16:
         cmp     ecx, byte SIZEOF_XMMWORD
         jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         add     edi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     ecx, byte SIZEOF_XMMWORD
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store the lower 8 bytes of xmmA to the output when it has enough
         ; space.
         cmp     ecx, byte SIZEOF_MMWORD
         jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
         add     edi, byte SIZEOF_MMWORD
         sub     ecx, byte SIZEOF_MMWORD
         psrldq  xmmA, SIZEOF_MMWORD
@@ -316,7 +310,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         ; space.
         cmp     ecx, byte SIZEOF_DWORD
         jb      short .column_st3
-       movd    DWORD [edi], xmmA
+       movd    XMM_DWORD [edi], xmmA
         add     edi, byte SIZEOF_DWORD
         sub     ecx, byte SIZEOF_DWORD
         psrldq  xmmA, SIZEOF_DWORD
@@ -336,47 +330,6 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         test    ecx, ecx
         jz      short .nextrow
         mov     BYTE [edi], al
-%else
-       mov     eax,ecx
-       xor     ecx, byte 0x0F
-       shl     ecx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     eax,ecx
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
  
  %else ; RGB_PIXELSIZE == 4 ; -----------
  
@@ -421,19 +374,14 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
         movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
  .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     ecx, byte SIZEOF_XMMWORD
         jz      near .nextrow
  
@@ -444,30 +392,27 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         alignx  16,7
  
  .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
         cmp     ecx, byte SIZEOF_XMMWORD/2
         jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmC
         movdqa  xmmD,xmmH
         sub     ecx, byte SIZEOF_XMMWORD/2
  .column_st16:
         cmp     ecx, byte SIZEOF_XMMWORD/4
         jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         add     edi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     ecx, byte SIZEOF_XMMWORD/4
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store two pixels (8 bytes) of xmmA to the output when it has enough
         ; space.
         cmp     ecx, byte SIZEOF_XMMWORD/8
         jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
         add     edi, byte SIZEOF_XMMWORD/8*4
         sub     ecx, byte SIZEOF_XMMWORD/8
         psrldq  xmmA, SIZEOF_XMMWORD/8*4
@@ -476,48 +421,7 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
         ; space.
         test    ecx, ecx
         jz      short .nextrow
-       movd    DWORD [edi], xmmA
-%else
-       cmp     ecx, byte SIZEOF_XMMWORD/16
-       jb      short .nextrow
-       mov     eax,ecx
-       xor     ecx, byte 0x03
-       inc     ecx
-       shl     ecx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [edi], xmmA
  
  %endif ; RGB_PIXELSIZE ; ---------------
  
diff --git a/simd/jdmrgss2-64.asm b/simd/jdmrgss2-64.asm

index a64a6b3..ffbf6b2 100644 (file)
--- a/simd/jdmrgss2-64.asm
+++ b/simd/jdmrgss2-64.asm
@@ -1,8 +1,8 @@
  ;
  ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
  ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
  ;
  ; Based on
  ; x86 SIMD extension for IJG JPEG library
@@ -12,7 +12,7 @@
  ; This file should be assembled with NASM (Netwide Assembler),
  ; can *not* be assembled with Microsoft's MASM or any compatible
  ; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
  ; http://sourceforge.net/project/showfiles.php?group_id=6208
  ;
  ; [TAB8]
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
  .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     rcx, byte SIZEOF_XMMWORD
         jz      near .endcolumn
  
@@ -275,31 +271,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         jmp     near .columnloop
  
  .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
         lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
         cmp     rcx, byte 2*SIZEOF_XMMWORD
         jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmF
         sub     rcx, byte 2*SIZEOF_XMMWORD
         jmp     short .column_st15
  .column_st16:
         cmp     rcx, byte SIZEOF_XMMWORD
         jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         add     rdi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     rcx, byte SIZEOF_XMMWORD
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store the lower 8 bytes of xmmA to the output when it has enough
         ; space.
         cmp     rcx, byte SIZEOF_MMWORD
         jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
         add     rdi, byte SIZEOF_MMWORD
         sub     rcx, byte SIZEOF_MMWORD
         psrldq  xmmA, SIZEOF_MMWORD
@@ -308,7 +301,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         ; space.
         cmp     rcx, byte SIZEOF_DWORD
         jb      short .column_st3
-       movd    DWORD [rdi], xmmA
+       movd    XMM_DWORD [rdi], xmmA
         add     rdi, byte SIZEOF_DWORD
         sub     rcx, byte SIZEOF_DWORD
         psrldq  xmmA, SIZEOF_DWORD
@@ -328,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         test    rcx, rcx
         jz      short .endcolumn
         mov     BYTE [rdi], al
-%else
-       mov     rax,rcx
-       xor     rcx, byte 0x0F
-       shl     rcx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     rax,rcx
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
  
  %else ; RGB_PIXELSIZE == 4 ; -----------
  
@@ -413,19 +365,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
         movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
  .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     rcx, byte SIZEOF_XMMWORD
         jz      near .endcolumn
  
@@ -438,30 +385,27 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         jmp     near .columnloop
  
  .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
         cmp     rcx, byte SIZEOF_XMMWORD/2
         jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmC
         movdqa  xmmD,xmmH
         sub     rcx, byte SIZEOF_XMMWORD/2
  .column_st16:
         cmp     rcx, byte SIZEOF_XMMWORD/4
         jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
         add     rdi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     rcx, byte SIZEOF_XMMWORD/4
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store two pixels (8 bytes) of xmmA to the output when it has enough
         ; space.
         cmp     rcx, byte SIZEOF_XMMWORD/8
         jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
         add     rdi, byte SIZEOF_XMMWORD/8*4
         sub     rcx, byte SIZEOF_XMMWORD/8
         psrldq  xmmA, SIZEOF_XMMWORD/8*4
@@ -470,48 +414,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         ; space.
         test    rcx, rcx
         jz      short .endcolumn
-       movd    DWORD [rdi], xmmA
-%else
-       cmp     rcx, byte SIZEOF_XMMWORD/16
-       jb      near .endcolumn
-       mov     rax,rcx
-       xor     rcx, byte 0x03
-       inc     rcx
-       shl     rcx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [rdi], xmmA
  
  %endif ; RGB_PIXELSIZE ; ---------------
  
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm

index 04089aa..6494340 100644 (file)
--- a/simd/jdmrgss2.asm
+++ b/simd/jdmrgss2.asm
@@ -1,7 +1,8 @@
  ;
  ; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
  ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
  ;
  ; Based on
  ; x86 SIMD extension for IJG JPEG library
@@ -264,17 +265,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
  .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     ecx, byte SIZEOF_XMMWORD
         jz      near .endcolumn
  
@@ -288,31 +285,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         alignx  16,7
  
  .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
         lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
         cmp     ecx, byte 2*SIZEOF_XMMWORD
         jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmF
         sub     ecx, byte 2*SIZEOF_XMMWORD
         jmp     short .column_st15
  .column_st16:
         cmp     ecx, byte SIZEOF_XMMWORD
         jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         add     edi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     ecx, byte SIZEOF_XMMWORD
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store the lower 8 bytes of xmmA to the output when it has enough
         ; space.
         cmp     ecx, byte SIZEOF_MMWORD
         jb      short .column_st7
-       movq    MMWORD [edi], xmmA
+       movq    XMM_MMWORD [edi], xmmA
         add     edi, byte SIZEOF_MMWORD
         sub     ecx, byte SIZEOF_MMWORD
         psrldq  xmmA, SIZEOF_MMWORD
@@ -321,7 +315,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         ; space.
         cmp     ecx, byte SIZEOF_DWORD
         jb      short .column_st3
-       movd    DWORD [edi], xmmA
+       movd    XMM_DWORD [edi], xmmA
         add     edi, byte SIZEOF_DWORD
         sub     ecx, byte SIZEOF_DWORD
         psrldq  xmmA, SIZEOF_DWORD
@@ -341,47 +335,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         test    ecx, ecx
         jz      short .endcolumn
         mov     BYTE [edi], al
-%else
-       mov     eax,ecx
-       xor     ecx, byte 0x0F
-       shl     ecx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     eax,ecx
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
  
  %else ; RGB_PIXELSIZE == 4 ; -----------
  
@@ -426,19 +379,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
         movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
         movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         jmp     short .out0
  .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
  .out0:
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
         sub     ecx, byte SIZEOF_XMMWORD
         jz      near .endcolumn
  
@@ -452,80 +400,36 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
         alignx  16,7
  
  .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
         cmp     ecx, byte SIZEOF_XMMWORD/2
         jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
         movdqa  xmmA,xmmC
         movdqa  xmmD,xmmH
         sub     ecx, byte SIZEOF_XMMWORD/2
  .column_st16:
         cmp     ecx, byte SIZEOF_XMMWORD/4
         jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
         add     edi, byte SIZEOF_XMMWORD        ; outptr
         movdqa  xmmA,xmmD
         sub     ecx, byte SIZEOF_XMMWORD/4
  .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
         ; Store two pixels (8 bytes) of xmmA to the output when it has enough
         ; space.
         cmp     ecx, byte SIZEOF_XMMWORD/8
         jb      short .column_st7
-       movq    MMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD/2
+       movq    XMM_MMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD/8*4
         sub     ecx, byte SIZEOF_XMMWORD/8
-       psrldq  xmmA, 64
+       psrldq  xmmA, SIZEOF_XMMWORD/8*4
  .column_st7:
         ; Store one pixel (4 bytes) of xmmA to the output when it has enough
         ; space.
         test    ecx, ecx
         jz      short .endcolumn
-       movd    DWORD [edi], xmmA
-%else
-       cmp     ecx, byte SIZEOF_XMMWORD/16
-       jb      short .endcolumn
-       mov     eax,ecx
-       xor     ecx, byte 0x03
-       inc     ecx
-       shl     ecx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [edi], xmmA
  
  %endif ; RGB_PIXELSIZE ; ---------------
  
diff --git a/simd/jsimd.h b/simd/jsimd.h

index 6ee99cc..3d4751f 100644 (file)
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@ -522,6 +522,10 @@ EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
          JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
               JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
  
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
  /* SIMD Sample Conversion */
  EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
                                       JDIMENSION start_col,
diff --git a/simd/jsimd_arm.c b/simd/jsimd_arm.c

index af0c2c8..cae84df 100644 (file)
--- a/simd/jsimd_arm.c
+++ b/simd/jsimd_arm.c
@@ -338,6 +338,15 @@ jsimd_can_h2v1_fancy_upsample (void)
  {
    init_simd();
  
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_ARM_NEON)
+    return 1;
+
    return 0;
  }
  
@@ -355,6 +364,9 @@ jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
                             JSAMPARRAY input_data,
                             JSAMPARRAY * output_data_ptr)
  {
+  if (simd_support & JSIMD_ARM_NEON)
+    jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+        compptr->downsampled_width, input_data, output_data_ptr);
  }
  
  GLOBAL(int)
diff --git a/simd/jsimd_arm_neon.S b/simd/jsimd_arm_neon.S

index b2f9c2a..9962b8a 100644 (file)
--- a/simd/jsimd_arm_neon.S
+++ b/simd/jsimd_arm_neon.S
@@ -2157,3 +2157,241 @@ asm_function jsimd_quantize_neon
      .unreq          SHIFT
      .unreq          LOOP_COUNT
  .endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
+ *                                 JDIMENSION   downsampled_width,
+ *                                 JSAMPARRAY   input_data,
+ *                                 JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ *       this code, which can be potentially solved to get up to tens
+ *       of percents performance improvement on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16   OUTPTR, INPTR
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8,  d0
+    vext.8          q2,  q1,  q0, #15
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28
+    vmlal.u8        q11, d1,  d28
+    vmov            q1,  q0       /* backup source pixels to q1 */
+    vrshrn.u16      d6,  q8,  #2
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2
+    vshrn.u16       d9,  q11, #2
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16'
+ * macro, the roles of q0 and q1 registers are reversed for even and odd
+ * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
+ * Also this unrolling allows to reorder loads and stores to compensate
+ * multiplication latency and reduce stalls.
+ */
+.macro upsample32   OUTPTR, INPTR
+    /* even 16 pixels group */
+    vld1.8          {q0}, [\INPTR]!
+    vmovl.u8        q8,  d0
+    vext.8          q2,  q1,  q0, #15
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28
+    vmlal.u8        q11, d1,  d28
+        /* odd 16 pixels group */
+        vld1.8          {q1}, [\INPTR]!
+    vrshrn.u16      d6,  q8,  #2
+    vrshrn.u16      d7,  q9,  #2
+    vshrn.u16       d8,  q10, #2
+    vshrn.u16       d9,  q11, #2
+        vmovl.u8        q8,  d2
+        vext.8          q2,  q0,  q1, #15
+        vmovl.u8        q9,  d3
+        vaddw.u8        q10, q15, d4
+        vaddw.u8        q11, q15, d5
+        vmlal.u8        q8,  d4,  d28
+        vmlal.u8        q9,  d5,  d28
+        vmlal.u8        q10, d2,  d28
+        vmlal.u8        q11, d3,  d28
+    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+        vrshrn.u16      d6,  q8,  #2
+        vrshrn.u16      d7,  q9,  #2
+        vshrn.u16       d8,  q10, #2
+        vshrn.u16       d9,  q11, #2
+        vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+    /* special case for the first and last pixels */
+    sub             \WIDTH, \WIDTH, #1
+    add             \OUTPTR, \OUTPTR, #1
+    ldrb            \TMP1, [\INPTR, \WIDTH]
+    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
+    ldrb            \TMP1, [\INPTR], #1
+    strb            \TMP1, [\OUTPTR, #-1]
+    vmov.8          d3[7], \TMP1
+
+    subs            \WIDTH, \WIDTH, #32
+    blt             5f
+0:  /* process 32 pixels per iteration */
+    upsample32      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #32
+    bge             0b
+5:
+    adds            \WIDTH, \WIDTH, #16
+    blt             1f
+0:  /* process 16 pixels if needed */
+    upsample16      \OUTPTR, \INPTR
+    subs            \WIDTH, \WIDTH, #16
+1:
+    adds            \WIDTH, \WIDTH, #16
+    beq             9f
+
+    /* load the remaining 1-15 pixels */
+    add             \INPTR, \INPTR, \WIDTH
+    tst             \WIDTH, #1
+    beq             2f
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vext.8          d0, d0, d0, #6
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vrev64.32       d0, d0
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[3]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[2]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[1]}, [\INPTR]
+    sub             \INPTR, \INPTR, #1
+    vld1.8          {d0[0]}, [\INPTR]
+2:
+    tst             \WIDTH, #8
+    beq             2f
+    vmov            d1,  d0
+    sub             \INPTR, \INPTR, #8
+    vld1.8          {d0}, [\INPTR]
+2:  /* upsample the remaining pixels */
+    vmovl.u8        q8,  d0
+    vext.8          q2,  q1,  q0, #15
+    vmovl.u8        q9,  d1
+    vaddw.u8        q10, q15, d4
+    vaddw.u8        q11, q15, d5
+    vmlal.u8        q8,  d4,  d28
+    vmlal.u8        q9,  d5,  d28
+    vmlal.u8        q10, d0,  d28
+    vmlal.u8        q11, d1,  d28
+    vrshrn.u16      d10, q8,  #2
+    vrshrn.u16      d12, q9,  #2
+    vshrn.u16       d11, q10, #2
+    vshrn.u16       d13, q11, #2
+    vzip.8          d10, d11
+    vzip.8          d12, d13
+    /* store the remaining pixels */
+    tst             \WIDTH, #8
+    beq             2f
+    vst1.8          {d10, d11}, [\OUTPTR]!
+    vmov            q5,  q6
+2:
+    tst             \WIDTH, #4
+    beq             2f
+    vst1.8          {d10}, [\OUTPTR]!
+    vmov            d10,  d11
+2:
+    tst             \WIDTH, #2
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+    vst1.8          {d10[2]}, [\OUTPTR]!
+    vst1.8          {d10[3]}, [\OUTPTR]!
+    vext.8          d10, d10, d10, #4
+2:
+    tst             \WIDTH, #1
+    beq             2f
+    vst1.8          {d10[0]}, [\OUTPTR]!
+    vst1.8          {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+    MAX_V_SAMP_FACTOR .req r0
+    DOWNSAMPLED_WIDTH .req r1
+    INPUT_DATA        .req r2
+    OUTPUT_DATA_PTR   .req r3
+    OUTPUT_DATA       .req OUTPUT_DATA_PTR
+
+    OUTPTR            .req r4
+    INPTR             .req r5
+    WIDTH             .req ip
+    TMP               .req lr
+
+    push            {r4, r5, r6, lr}
+    vpush           {d8-d15}
+
+    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
+    cmp             MAX_V_SAMP_FACTOR, #0
+    ble             99f
+
+    /* initialize constants */
+    vmov.u8         d28, #3
+    vmov.u16        q15, #1
+11:
+    ldr             INPTR, [INPUT_DATA], #4
+    ldr             OUTPTR, [OUTPUT_DATA], #4
+    mov             WIDTH, DOWNSAMPLED_WIDTH
+    upsample_row    OUTPTR, INPTR, WIDTH, TMP
+    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+    bgt             11b
+
+99:
+    vpop            {d8-d15}
+    pop             {r4, r5, r6, pc}
+
+    .unreq          MAX_V_SAMP_FACTOR
+    .unreq          DOWNSAMPLED_WIDTH
+    .unreq          INPUT_DATA
+    .unreq          OUTPUT_DATA_PTR
+    .unreq          OUTPUT_DATA
+
+    .unreq          OUTPTR
+    .unreq          INPTR
+    .unreq          WIDTH
+    .unreq          TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
diff --git a/simd/jsimdcfg.inc.h b/simd/jsimdcfg.inc.h

index 0dacd06..583b7e3 100644 (file)
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@ -15,54 +15,51 @@
  #include "../jmorecfg.h"
  #include "jsimd.h"
  
-#define define(var) %define _cpp_protection_##var
-#define definev(var) %define _cpp_protection_##var var
-
  ;
  ; -- jpeglib.h
  ;
  
-definev(DCTSIZE)
-definev(DCTSIZE2)
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
  
  ;
  ; -- jmorecfg.h
  ;
  
-definev(RGB_RED)
-definev(RGB_GREEN)
-definev(RGB_BLUE)
-definev(RGB_PIXELSIZE)
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
  
-definev(EXT_RGB_RED)
-definev(EXT_RGB_GREEN)
-definev(EXT_RGB_BLUE)
-definev(EXT_RGB_PIXELSIZE)
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
  
-definev(EXT_RGBX_RED)
-definev(EXT_RGBX_GREEN)
-definev(EXT_RGBX_BLUE)
-definev(EXT_RGBX_PIXELSIZE)
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
  
-definev(EXT_BGR_RED)
-definev(EXT_BGR_GREEN)
-definev(EXT_BGR_BLUE)
-definev(EXT_BGR_PIXELSIZE)
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
  
-definev(EXT_BGRX_RED)
-definev(EXT_BGRX_GREEN)
-definev(EXT_BGRX_BLUE)
-definev(EXT_BGRX_PIXELSIZE)
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
  
-definev(EXT_XBGR_RED)
-definev(EXT_XBGR_GREEN)
-definev(EXT_XBGR_BLUE)
-definev(EXT_XBGR_PIXELSIZE)
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
  
-definev(EXT_XRGB_RED)
-definev(EXT_XRGB_GREEN)
-definev(EXT_XRGB_BLUE)
-definev(EXT_XRGB_PIXELSIZE)
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
  
  %define RGBX_FILLER_0XFF        1
  
@@ -73,7 +70,7 @@ definev(EXT_XRGB_PIXELSIZE)
  %define JSAMPLE                 byte          ; unsigned char
  %define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
  
-definev(CENTERJSAMPLE)
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
  
  ; Representation of a DCT frequency coefficient.
  ; On this SIMD implementation, this must be 'short'.
@@ -126,74 +123,74 @@ definev(CENTERJSAMPLE)
  ; -- jsimd.h
  ;
  
-definev(JSIMD_NONE)
-definev(JSIMD_MMX)
-definev(JSIMD_3DNOW)
-definev(JSIMD_SSE)
-definev(JSIMD_SSE2)
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
  
  ; Short forms of external names for systems with brain-damaged linkers.
  ;
  #ifdef NEED_SHORT_EXTERNAL_NAMES
-definev(jpeg_simd_cpu_support)
-definev(jsimd_rgb_ycc_convert_mmx)
-definev(jsimd_ycc_rgb_convert_mmx)
-definev(jconst_rgb_ycc_convert_sse2)
-definev(jsimd_rgb_ycc_convert_sse2)
-definev(jconst_ycc_rgb_convert_sse2)
-definev(jsimd_ycc_rgb_convert_sse2)
-definev(jsimd_h2v2_downsample_mmx)
-definev(jsimd_h2v1_downsample_mmx)
-definev(jsimd_h2v2_downsample_sse2)
-definev(jsimd_h2v1_downsample_sse2)
-definev(jsimd_h2v2_upsample_mmx)
-definev(jsimd_h2v1_upsample_mmx)
-definev(jsimd_h2v1_fancy_upsample_mmx)
-definev(jsimd_h2v2_fancy_upsample_mmx)
-definev(jsimd_h2v1_merged_upsample_mmx)
-definev(jsimd_h2v2_merged_upsample_mmx)
-definev(jsimd_h2v2_upsample_sse2)
-definev(jsimd_h2v1_upsample_sse2)
-definev(jconst_fancy_upsample_sse2)
-definev(jsimd_h2v1_fancy_upsample_sse2)
-definev(jsimd_h2v2_fancy_upsample_sse2)
-definev(jconst_merged_upsample_sse2)
-definev(jsimd_h2v1_merged_upsample_sse2)
-definev(jsimd_h2v2_merged_upsample_sse2)
-definev(jsimd_convsamp_mmx)
-definev(jsimd_convsamp_sse2)
-definev(jsimd_convsamp_float_3dnow)
-definev(jsimd_convsamp_float_sse)
-definev(jsimd_convsamp_float_sse2)
-definev(jsimd_fdct_islow_mmx)
-definev(jsimd_fdct_ifast_mmx)
-definev(jconst_fdct_islow_sse2)
-definev(jsimd_fdct_islow_sse2)
-definev(jconst_fdct_ifast_sse2)
-definev(jsimd_fdct_ifast_sse2)
-definev(jsimd_fdct_float_3dnow)
-definev(jconst_fdct_float_sse)
-definev(jsimd_fdct_float_sse)
-definev(jsimd_quantize_mmx)
-definev(jsimd_quantize_sse2)
-definev(jsimd_quantize_float_3dnow)
-definev(jsimd_quantize_float_sse)
-definev(jsimd_quantize_float_sse2)
-definev(jsimd_idct_2x2_mmx)
-definev(jsimd_idct_4x4_mmx)
-definev(jconst_idct_red_sse2)
-definev(jsimd_idct_2x2_sse2)
-definev(jsimd_idct_4x4_sse2)
-definev(jsimd_idct_islow_mmx)
-definev(jsimd_idct_ifast_mmx)
-definev(jconst_idct_islow_sse2)
-definev(jsimd_idct_islow_sse2)
-definev(jconst_idct_ifast_sse2)
-definev(jsimd_idct_ifast_sse2)
-definev(jsimd_idct_float_3dnow)
-definev(jconst_idct_float_sse)
-definev(jsimd_idct_float_sse)
-definev(jconst_idct_float_sse2)
-definev(jsimd_idct_float_sse2)
+%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
+%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
+%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
+%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
+%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
+%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
+%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
+%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
+%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
+%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
+%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
+%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
+%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
+%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
+%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
+%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
+%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
+%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
+%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
+%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
+%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
+%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
+%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
+%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
+%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
+%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
+%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
+%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
+%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
+%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
+%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
+%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
+%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
+%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
+%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
+%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
+%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
+%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
+%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
+%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
+%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
+%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
+%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
  #endif /* NEED_SHORT_EXTERNAL_NAMES */
  
diff --git a/simd/jsimdext.inc b/simd/jsimdext.inc

index 4ab9bc0..253b897 100644 (file)
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@ -86,8 +86,6 @@ section .note.GNU-stack noalloc noexec nowrite progbits
  %define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
  %endif
  
-%define STRICT_MEMORY_ACCESS 1
-
  ; To make the code position-independent, append -DPIC to the commandline
  ;
  %define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_      ; ELF supports PIC
@@ -322,15 +320,15 @@ const_base:
         push rsi
         push rdi
         sub     rsp, SIZEOF_XMMWORD
-       movlpd  XMMWORD [rsp], xmm6
+       movaps  XMMWORD [rsp], xmm6
         sub     rsp, SIZEOF_XMMWORD
-       movlpd  XMMWORD [rsp], xmm7
+       movaps  XMMWORD [rsp], xmm7
  %endmacro
  
  %imacro uncollect_args 0
-       movlpd  xmm7, XMMWORD [rsp]
+       movaps  xmm7, XMMWORD [rsp]
         add     rsp, SIZEOF_XMMWORD
-       movlpd  xmm6, XMMWORD [rsp]
+       movaps  xmm6, XMMWORD [rsp]
         add     rsp, SIZEOF_XMMWORD
         pop rdi
         pop rsi
author	jk7744.park <jk7744.park@samsung.com>
	Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
committer	jk7744.park <jk7744.park@samsung.com>
	Tue, 8 Sep 2015 12:57:56 +0000 (21:57 +0900)
COPYING	[new file with mode: 0644]	patch \| blob
Makefile.am		patch \| blob \| history
configure.ac		patch \| blob \| history
debian/changelog		patch \| blob \| history
debian/copyright	[new file with mode: 0644]	patch \| blob
debian/libjpeg-turbo-dev.files		patch \| blob \| history
debian/libjpeg-turbo.files		patch \| blob \| history
jdapimin.c		patch \| blob \| history
jdcoefct.c		patch \| blob \| history
jdmarker.c		patch \| blob \| history
jdmerge.c		patch \| blob \| history
jpeglib.h		patch \| blob \| history
libjpeg-turbo.manifest	[new file with mode: 0755]	patch \| blob
packaging/libjpeg-turbo.spec		patch \| blob \| history
pkgconfig/turbojpeg.pc.in		patch \| blob \| history
simd/jdclrss2-64.asm		patch \| blob \| history
simd/jdclrss2.asm		patch \| blob \| history
simd/jdmrgss2-64.asm		patch \| blob \| history
simd/jdmrgss2.asm		patch \| blob \| history
simd/jsimd.h		patch \| blob \| history
simd/jsimd_arm.c		patch \| blob \| history
simd/jsimd_arm_neon.S		patch \| blob \| history
simd/jsimdcfg.inc.h		patch \| blob \| history
simd/jsimdext.inc		patch \| blob \| history