--- /dev/null
+*******************************************************************************
+** License
+*******************************************************************************
+
+Most of libjpeg-turbo inherits the non-restrictive, BSD-style license used by
+libjpeg (see README.) The TurboJPEG/OSS wrapper (both C and Java versions) and
+associated test programs bear a similar license, which is reproduced below:
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+- Neither the name of the libjpeg-turbo Project nor the names of its
+ contributors may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
lib_LTLIBRARIES = libjpeg.la libturbojpeg.la
libjpeg_la_LDFLAGS = -version-info ${SO_MAJOR_VERSION}:${SO_MINOR_VERSION} -no-undefined
libturbojpeg_la_LDFLAGS = -avoid-version -no-undefined
-include_HEADERS = turbojpeg.h
-installheaderjpeglibdir = $(includedir)/turbojpeg
-installheaderjpeglib_HEADERS = jpeglib.h jerror.h jmorecfg.h jconfig.h
-#nodist_include_HEADERS = jconfig.h
+include_HEADERS = jerror.h jmorecfg.h jpeglib.h turbojpeg.h
+nodist_include_HEADERS = jconfig.h
HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h jpegcomp.h
jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c
-
if WITH_ARITH
libjpeg_la_SOURCES += jaricom.c
cjpeg_LDADD = libjpeg.la
cjpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
- -DTARGA_SUPPORTED
+ -DTARGA_SUPPORTED \
+ -fPIE -pie
djpeg_SOURCES = cdjpeg.h cderror.h cdjpeg.c djpeg.c rdcolmap.c rdswitch.c \
wrbmp.c wrgif.c wrppm.c wrtarga.c
djpeg_LDADD = libjpeg.la
djpeg_CFLAGS = -DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED \
- -DTARGA_SUPPORTED
+ -DTARGA_SUPPORTED \
+ -fPIE -pie
jpegtran_SOURCES = jpegtran.c rdswitch.c cdjpeg.c transupp.c transupp.h
AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
[VERSION_SCRIPT_FLAG=-Wl,--version-script,; AC_MSG_RESULT([yes (GNU style)])], [])
if test "x$VERSION_SCRIPT_FLAG" = "x"; then
- LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map"
+ LDFLAGS="$SAVED_LDFLAGS -Wl,-M,conftest.map -pie"
AC_LINK_IFELSE(AC_LANG_PROGRAM([], []),
[VERSION_SCRIPT_FLAG=-Wl,-M,; AC_MSG_RESULT([yes (Sun style)])], [])
fi
+libjpeg-turbo (1.2.0-8) unstable; urgency=low
+
+ * [TREL] Install djpeg, cjpeg
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-8
+
+ -- YoungHun Kim <yh8004.kim@samsung.com> Fri, 05 Apr 2013 14:10:35 +0900
+
+libjpeg-turbo (1.2.0-7) unstable; urgency=low
+
+ * Enable simd for i586 (including IA)
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-7
+
+ -- YoungHun Kim <yh8004.kim@samsung.com> Mon, 07 Jan 2013 15:14:52 +0900
+
+libjpeg-turbo (1.2.0-6) unstable; urgency=low
+
+ * Enable libjpeg package of libjpeg-turbo
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-6
+
+ -- YoungHun Kim <yh8004.kim@samsung.com> Thu, 25 Oct 2012 01:09:00 +0900
+
+libjpeg-turbo (1.2.0-5) unstable; urgency=low
+
+ * License file copied to /usr/share/license/
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-5
+
+ -- YoungHun Kim <yh8004.kim@ysamsung.com> Fri, 12 Oct 2012 18:13:55 +0900
+
+libjpeg-turbo (1.2.0-4) unstable; urgency=low
+
+ * Fix CVE-2012-2806
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-4
+
+ -- YoungHun Kim <yh8004.kim@samsung.com> Thu, 04 Oct 2012 16:23:12 +0900
+
+libjpeg-turbo (1.2.0-3) unstable; urgency=low
+
+ * Add manifest file
+ * Git: external/libjpeg-turbo
+ * Tag: libjpeg-turbo_1.2.0-3
+
+ -- YoungHun Kim <yh8004.kim@samsung.com> Thu, 20 Sep 2012 20:57:44 +0900
+
libjpeg-turbo (1.2.0-2) unstable; urgency=low
* Tag [Version] 1.2.0-2
--- /dev/null
+Format-Specification: http://svn.debian.org/wsvn/dep/web/deps/dep5.mdwn?op=file&rev=135
+Name: libjpeg-turbo
+Source: lp:libjpeg-turbo
+
+Files: *
+Copyright: 1999-2006 MIYASAKA Masaru
+ 2004 Landmark Graphics Corporation
+ 2005-2007 Sun Microsystems, Inc.
+ 2009 Pierre Ossman for Cendio AB
+ 2009-2010 D. R. Commander
+ 2010 Thomas G. Lane, Guido Vollbeding
+ 2009, Thomas G. Lane, Guido Vollbeding
+ 1998, Thomas G. Lane
+ 2010 Nokia Corporation
+License: JPEG
+ .
+ In plain English:
+ .
+ 1. We don't promise that this software works. (But if you find any bugs,
+ please let us know!)
+ 2. You can use this software for whatever you want. You don't have to pay us.
+ 3. You may not pretend that you wrote this software. If you use it in a
+ program, you must acknowledge somewhere in your documentation that
+ you've used the IJG code.
+ .
+ In legalese:
+ The authors make NO WARRANTY or representation, either express or implied,
+ with respect to this software, its quality, accuracy, merchantability, or
+ fitness for a particular purpose. This software is provided "AS IS", and you,
+ its user, assume the entire risk as to its quality and accuracy.
+ .
+ This software is copyright (C) 1991-2010, Thomas G. Lane, Guido Vollbeding.
+ All Rights Reserved except as specified below.
+ .
+ Permission is hereby granted to use, copy, modify, and distribute this
+ software (or portions thereof) for any purpose, without fee, subject to these
+ conditions:
+ (1) If any part of the source code for this software is distributed, then this
+ README file must be included, with this copyright and no-warranty notice
+ unaltered; and any additions, deletions, or changes to the original files
+ must be clearly indicated in accompanying documentation.
+ (2) If only executable code is distributed, then the accompanying
+ documentation must state that "this software is based in part on the work of
+ the Independent JPEG Group".
+ (3) Permission for use of this software is granted only if the user accepts
+ full responsibility for any undesirable consequences; the authors accept
+ NO LIABILITY for damages of any kind.
+ .
+ These conditions apply to any software derived from or based on the IJG code,
+ not just to the unmodified library. If you use our work, you ought to
+ acknowledge us.
+ .
+ Permission is NOT granted for the use of any IJG author's name or company name
+ in advertising or publicity relating to this software or products derived from
+ it. This software may be referred to only as "the Independent JPEG Group's
+ software".
+ .
+ We specifically permit and encourage the use of this software as the basis of
+ commercial products, provided that all warranty or liability claims are
+ assumed by the product vendor.
+ .
+ .
+ ansi2knr.c is included in this distribution by permission of L. Peter Deutsch,
+ sole proprietor of its copyright holder, Aladdin Enterprises of Menlo Park, CA
+ .
+ ansi2knr.c is NOT covered by the above copyright and conditions, but instead
+ by the usual distribution terms of the Free Software Foundation; principally,
+ that you must include source code if you redistribute it. (See the file
+ ansi2knr.c for full details.) However, since ansi2knr.c is not needed as part
+ of any program generated from the IJG code, this does not limit you more than
+ the foregoing paragraphs do.
+ .
+ The Unix configuration script "configure" was produced with GNU Autoconf.
+ It is copyright by the Free Software Foundation but is freely distributable.
+ The same holds for its supporting scripts (config.guess, config.sub,
+ ltmain.sh). Another support script, install-sh, is copyright by X Consortium
+ but is also freely distributable.
+ .
+ The IJG distribution formerly included code to read and write GIF files.
+ To avoid entanglement with the Unisys LZW patent, GIF reading support has
+ been removed altogether, and the GIF writer has been simplified to produce
+ "uncompressed GIFs". This technique does not use the LZW algorithm; the
+ resulting GIF files are larger than usual, but are readable by all standard
+ GIF decoders.
+ .
+ We are required to state that
+ "The Graphics Interchange Format(c) is the Copyright property of
+ CompuServe Incorporated. GIF(sm) is a Service Mark property of
+ CompuServe Incorporated."
+
+Files: debian/*
+Copyright: 2010, 2011 Linaro Limited
+License: LGPL-2.1
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Library General Public
+ License (LGPL) as published by the Free Software Foundation;
+ either version 2 of the License, or (at your option) any later
+ version.
+ .
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Library General Public License for more details.
+ .
+ You should have received a copy of the GNU Library General Public License
+ along with this library; see the file COPYING.LIB. If not, write to
+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ Boston, MA 02110-1301, USA.
+ .
+ On Debian systems, the complete text of the GNU Library General Public
+ License, version 2, can be found in /usr/share/common-licenses/LGPL-2.1.
-usr/include/turbojpeg.h
-usr/include/turbojpeg/jpeglib.h
-usr/include/turbojpeg/jerror.h
-usr/include/turbojpeg/jmorecfg.h
-usr/include/turbojpeg/jconfig.h
+usr/include/*.h
usr/lib/pkgconfig/*
-usr/lib/libturbojpeg.a
-usr/lib/libturbojpeg.la
usr/lib/libturbojpeg.so*
+usr/lib/libjpeg.so*
/* And initialize the overall input controller. */
jinit_input_controller(cinfo);
+ /* Init region to decode to be empty */
+ cinfo->region_x = 0;
+ cinfo->region_y = 0;
+ cinfo->region_w = 0;
+ cinfo->region_h = 0;
+
+ /* Init region to decode to be empty */
+ cinfo->region_x = 0;
+ cinfo->region_y = 0;
+ cinfo->region_w = 0;
+ cinfo->region_h = 0;
+
/* OK, I'm ready */
cinfo->global_state = DSTATE_START;
}
JDIMENSION start_col, output_col;
jpeg_component_info *compptr;
inverse_DCT_method_ptr inverse_DCT;
+ /* region decoding. this limits decode to the set of blocks +- 1 outside
+ * bounding blocks around the desired region to decode */
+ int blk1 = 0, blk2 = 0, skip = 0;
+
+ if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+ int bsz_w = 0, bsz_h = 0;
+
+ for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
+ compptr = cinfo->cur_comp_info[ci];
+ if (compptr->MCU_sample_width > bsz_w)
+ bsz_w = compptr->MCU_sample_width;
+ if ((compptr->MCU_height * 8) > bsz_h)
+ bsz_h = compptr->MCU_height * 8;
+ }
+ int _region_y = (int)cinfo->region_y;
+ _region_y = (_region_y>>1)<<1;
+ if (((int)cinfo->output_scanline < (_region_y - bsz_h - 1)) ||
+ ((int)cinfo->output_scanline > (_region_y + cinfo->region_h + bsz_h)))
+ skip = 1;
+ blk1 = (cinfo->region_x / bsz_w) - 1;
+ if (blk1 < 0) blk1 = 0;
+ blk2 = ((cinfo->region_x + cinfo->region_w + bsz_w - 1) / bsz_w) + 1;
+ if (blk2 < 0) blk2 = 0;
+ }
/* Loop to process as much as one whole iMCU row */
for (yoffset = coef->MCU_vert_offset; yoffset < coef->MCU_rows_per_iMCU_row;
yoffset++) {
for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
MCU_col_num++) {
+ /* see if we need to skip this MCU or not */
+ if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+ if (!((MCU_col_num < blk1) || (MCU_col_num > blk2) || skip))
+ skip = 0;
+ }
+ /* if we are not skipping this MCU, zero it ready for huffman decode */
+ if (!skip)
+ jzero_far((void FAR *) coef->MCU_buffer[0],
+ (size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
/* Try to fetch an MCU. Entropy decoder expects buffer to be zeroed. */
jzero_far((void FAR *) coef->MCU_buffer[0],
(size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
coef->MCU_ctr = MCU_col_num;
return JPEG_SUSPENDED;
}
+ /* region decoding. this limits decode to the set of blocks +- 1 outside
+ * bounding blocks around the desired region to decode */
+ if (skip)
+ continue;
/* Determine where data should go in output_buf and do the IDCT thing.
* We skip dummy blocks at the right and bottom edges (but blkn gets
* incremented past them!). Note the inner loop relies on having
/* Collect the component-spec parameters */
- for (i = 0; i < cinfo->num_components; i++)
+ for (i = 0; i < MAX_COMPS_IN_SCAN; i++)
cinfo->cur_comp_info[i] = NULL;
for (i = 0; i < n; i++) {
INPUT_BYTE(cinfo, cc, return FALSE);
INPUT_BYTE(cinfo, c, return FALSE);
- for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
+ for (ci = 0, compptr = cinfo->comp_info;
+ ci < cinfo->num_components && ci < MAX_COMPS_IN_SCAN;
ci++, compptr++) {
if (cc == compptr->component_id && !cinfo->cur_comp_info[ci])
goto id_found;
my_upsample_ptr upsample = (my_upsample_ptr) cinfo->upsample;
JSAMPROW work_ptrs[2];
JDIMENSION num_rows; /* number of rows returned to caller */
+ int skip = 0;
if (upsample->spare_full) {
/* If we have a spare row saved from a previous cycle, just return it. */
num_rows = 1;
upsample->spare_full = FALSE;
} else {
+ int _region_y = (int)cinfo->region_y;
+ _region_y = (_region_y>>1)<<1;
+ if ((cinfo->region_w > 0) && (cinfo->region_h > 0)) {
+ if (((int)cinfo->output_scanline < _region_y) ||
+ ((int)cinfo->output_scanline >= (_region_y + (int)cinfo->region_h)))
+ skip = 1;
+ }
/* Figure number of rows to return to caller. */
num_rows = 2;
/* Not more than the distance to the end of the image. */
upsample->spare_full = TRUE;
}
/* Now do the upsampling. */
- (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
+ if (!skip)
+ (*upsample->upmethod) (cinfo, input_buf, *in_row_group_ctr, work_ptrs);
}
/* Adjust counts */
unsigned int scale_num, scale_denom; /* fraction by which to scale image */
+ unsigned int region_x, region_y, region_w, region_h; /* if region_w && region_h > 0, then use this region to decode. scale above is done prior to region select */
+
double output_gamma; /* image gamma wanted in output */
boolean buffered_image; /* TRUE=multiple output passes */
--- /dev/null
+<manifest>
+ <define>
+ <domain name="libjpeg-turbo"/>
+ </define>
+ <request>
+ <domain name="_" />
+ </request>
+ <assign>
+ <filesystem path="/usr/bin/cjpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+ <filesystem path="/usr/bin/djpeg" label="libjpeg-turbo" exec_label="libjpeg-turbo" />
+ </assign>
+</manifest>
Name: libjpeg-turbo
-License: BSD3c(or similar)
+License: BSD-2.0
Group: Productivity/Graphics/Convertors
AutoReqProv: on
Version: 1.2.0
-Release: 2
+Release: 12
Summary: A MMX/SSE2 accelerated library for manipulating JPEG image files
Url: http://sourceforge.net/projects/libjpeg-turbo
Source0: %{name}-%{version}.tar.gz
JPEG images.
%package devel
-
-License: BSD3c(or similar)
Summary: Developement files for libjpeg-turbo contains a wrapper library (TurboJPEG/OSS) that emulates the TurboJPEG API using libjpeg-turbo
Group: Development/Libraries/C and C++
Requires: %{name} = %{version}-%{release}
-
+Provides: libjpeg-devel
+%ifarch %{ix86}
+BuildRequires: nasm
+%endif
%description devel
-The libjpeg-turbo shared libraries can be used as drop-in replacements for libjpeg on most systems
+The libjpeg-devel package includes the header files and documentation
+necessary for developing programs which will manipulate JPEG files using
+the libjpeg library.
+
+If you are going to develop programs which will manipulate JPEG images,
+you should install libjpeg-devel. You'll also need to have the libjpeg
+package installed.
%prep
%setup -q
%build
autoreconf -fiv
-%ifarch %{arm}
-%configure --disable-static --with-jpeg8
-%else
-%configure --disable-static --with-jpeg8 --without-simd
-%endif
+%configure --enable-shared --disable-static --with-jpeg8
make %{?_smp_mflags}
#%check
%install
%makeinstall
+mkdir -p %{buildroot}/usr/share/license
+cp COPYING %{buildroot}/usr/share/license/%{name}
# Fix perms
chmod -x README-turbo.txt release/copyright
%postun -p /sbin/ldconfig
%files
+/usr/share/license/%{name}
+%manifest libjpeg-turbo.manifest
%defattr(-,root,root)
%{_libdir}/libturbojpeg.so
+%{_libdir}/libjpeg.so.*
+%{_bindir}/cjpeg
+%{_bindir}/djpeg
%exclude %{_datadir}/man/man1/*
%exclude %{_datadir}/doc/
-%exclude %{_bindir}/cjpeg
-%exclude %{_bindir}/djpeg
+#%exclude %{_bindir}/cjpeg
+#%exclude %{_bindir}/djpeg
%exclude %{_bindir}/jpegtran
%exclude %{_bindir}/rdjpgcom
%exclude %{_bindir}/tjbench
%exclude %{_bindir}/wrjpgcom
-%exclude %{_libdir}/libjpeg.so.*
%files devel
%defattr(-,root,root)
-%{_includedir}/turbojpeg.h
-%exclude %{_libdir}/libjpeg.so
-%{_includedir}/turbojpeg/jpeglib.h
-%{_includedir}/turbojpeg/jerror.h
-%{_includedir}/turbojpeg/jmorecfg.h
-%{_includedir}/turbojpeg/jconfig.h
+%{_libdir}/libjpeg.so
+%{_includedir}/*.h
%{_libdir}/pkgconfig/turbojpeg.pc
%exclude %{_libdir}/libjpeg.la
%exclude %{_libdir}/libturbojpeg.la
prefix=/usr
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
-includedir=${prefix}/include/turbojpeg
+includedir=${prefix}/include
Name: libturbojpeg
Description: Loads and saves jpeg
;
; jdclrss2-64.asm - colorspace conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
- movq MMWORD [rdi], xmmA
+ movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
- movd DWORD [rdi], xmmA
+ movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
test rcx, rcx
jz short .nextrow
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,rcx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .nextrow
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
; space.
test rcx, rcx
jz short .nextrow
- movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .nextrow
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+ movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdclrss2.asm - colorspace conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
- movq MMWORD [edi], xmmA
+ movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
- movd DWORD [edi], xmmA
+ movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
test ecx, ecx
jz short .nextrow
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .nextrow
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
- movq MMWORD [edi], xmmA
+ movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; space.
test ecx, ecx
jz short .nextrow
- movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .nextrow
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+ movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]
movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [rdi], xmmF
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
jmp near .columnloop
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea rcx, [rcx+rcx*2] ; imul ecx, RGB_PIXELSIZE
cmp rcx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub rcx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [rdi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_MMWORD
jb short .column_st7
- movq MMWORD [rdi], xmmA
+ movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_MMWORD
sub rcx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; space.
cmp rcx, byte SIZEOF_DWORD
jb short .column_st3
- movd DWORD [rdi], xmmA
+ movd XMM_DWORD [rdi], xmmA
add rdi, byte SIZEOF_DWORD
sub rcx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
test rcx, rcx
jz short .endcolumn
mov BYTE [rdi], al
-%else
- mov rax,rcx
- xor rcx, byte 0x0F
- shl rcx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add rax,rcx
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg rcx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
- add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [rdi], xmmC
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [rdi], xmmH
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub rcx, byte SIZEOF_XMMWORD
jz near .endcolumn
jmp near .columnloop
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp rcx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [rdi], xmmA
- add rdi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [rdi], xmmD
- add rdi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+ add rdi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub rcx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp rcx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
add rdi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub rcx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp rcx, byte SIZEOF_XMMWORD/8
jb short .column_st7
- movq MMWORD [rdi], xmmA
+ movq XMM_MMWORD [rdi], xmmA
add rdi, byte SIZEOF_XMMWORD/8*4
sub rcx, byte SIZEOF_XMMWORD/8
psrldq xmmA, SIZEOF_XMMWORD/8*4
; space.
test rcx, rcx
jz short .endcolumn
- movd DWORD [rdi], xmmA
-%else
- cmp rcx, byte SIZEOF_XMMWORD/16
- jb near .endcolumn
- mov rax,rcx
- xor rcx, byte 0x03
- inc rcx
- shl rcx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov rcx,rdi
- and rcx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea rax, [rcx+rax*4] ; RGB_PIXELSIZE
- cmp rax, byte SIZEOF_XMMWORD
- ja short .adj0
- and rdi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl rcx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg rcx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+ movd XMM_DWORD [rdi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
;
; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2012 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
alignx 16,7
.column_st32:
- pcmpeqb xmmH,xmmH ; xmmH=(all 1's)
lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE
cmp ecx, byte 2*SIZEOF_XMMWORD
jb short .column_st16
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmF
sub ecx, byte 2*SIZEOF_XMMWORD
jmp short .column_st15
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD
jb short .column_st15
- maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store the lower 8 bytes of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_MMWORD
jb short .column_st7
- movq MMWORD [edi], xmmA
+ movq XMM_MMWORD [edi], xmmA
add edi, byte SIZEOF_MMWORD
sub ecx, byte SIZEOF_MMWORD
psrldq xmmA, SIZEOF_MMWORD
; space.
cmp ecx, byte SIZEOF_DWORD
jb short .column_st3
- movd DWORD [edi], xmmA
+ movd XMM_DWORD [edi], xmmA
add edi, byte SIZEOF_DWORD
sub ecx, byte SIZEOF_DWORD
psrldq xmmA, SIZEOF_DWORD
test ecx, ecx
jz short .endcolumn
mov BYTE [edi], al
-%else
- mov eax,ecx
- xor ecx, byte 0x0F
- shl ecx, 2
- movd xmmB,ecx
- psrlq xmmH,4
- pcmpeqb xmmE,xmmE
- psrlq xmmH,xmmB
- psrlq xmmE,xmmB
- punpcklbw xmmE,xmmH
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- add eax,ecx
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmG,xmmA
- movdqa xmmC,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmD,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmF,ecx
- psllq xmmA,xmmF
- psllq xmmE,xmmF
- jmp short .adj0
-.adj1: neg ecx
- movd xmmF,ecx
- psrlq xmmA,xmmF
- psrlq xmmE,xmmF
- psllq xmmG,xmmD
- psllq xmmC,xmmD
- por xmmA,xmmG
- por xmmE,xmmC
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
%else ; RGB_PIXELSIZE == 4 ; -----------
movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
- add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
jmp short .out0
.out1: ; --(unaligned)-----------------
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ movdqu XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+ movdqu XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
.out0:
+ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr
sub ecx, byte SIZEOF_XMMWORD
jz near .endcolumn
alignx 16,7
.column_st32:
- pcmpeqb xmmE,xmmE ; xmmE=(all 1's)
cmp ecx, byte SIZEOF_XMMWORD/2
jb short .column_st16
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD ; outptr
- maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD
- add edi, byte SIZEOF_XMMWORD ; outptr
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+ movdqu XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+ add edi, byte 2*SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmC
movdqa xmmD,xmmH
sub ecx, byte SIZEOF_XMMWORD/2
.column_st16:
cmp ecx, byte SIZEOF_XMMWORD/4
jb short .column_st15
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
+ movdqu XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
add edi, byte SIZEOF_XMMWORD ; outptr
movdqa xmmA,xmmD
sub ecx, byte SIZEOF_XMMWORD/4
.column_st15:
-%ifdef STRICT_MEMORY_ACCESS
; Store two pixels (8 bytes) of xmmA to the output when it has enough
; space.
cmp ecx, byte SIZEOF_XMMWORD/8
jb short .column_st7
- movq MMWORD [edi], xmmA
- add edi, byte SIZEOF_XMMWORD/2
+ movq XMM_MMWORD [edi], xmmA
+ add edi, byte SIZEOF_XMMWORD/8*4
sub ecx, byte SIZEOF_XMMWORD/8
- psrldq xmmA, 64
+ psrldq xmmA, SIZEOF_XMMWORD/8*4
.column_st7:
; Store one pixel (4 bytes) of xmmA to the output when it has enough
; space.
test ecx, ecx
jz short .endcolumn
- movd DWORD [edi], xmmA
-%else
- cmp ecx, byte SIZEOF_XMMWORD/16
- jb short .endcolumn
- mov eax,ecx
- xor ecx, byte 0x03
- inc ecx
- shl ecx, 4
- movd xmmF,ecx
- psrlq xmmE,xmmF
- punpcklbw xmmE,xmmE
- ; ----------------
- mov ecx,edi
- and ecx, byte SIZEOF_XMMWORD-1
- jz short .adj0
- lea eax, [ecx+eax*4] ; RGB_PIXELSIZE
- cmp eax, byte SIZEOF_XMMWORD
- ja short .adj0
- and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary
- shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx
- movdqa xmmB,xmmA
- movdqa xmmG,xmmE
- pslldq xmmA, SIZEOF_XMMWORD/2
- pslldq xmmE, SIZEOF_XMMWORD/2
- movd xmmC,ecx
- sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
- jb short .adj1
- movd xmmH,ecx
- psllq xmmA,xmmH
- psllq xmmE,xmmH
- jmp short .adj0
-.adj1: neg ecx
- movd xmmH,ecx
- psrlq xmmA,xmmH
- psrlq xmmE,xmmH
- psllq xmmB,xmmC
- psllq xmmG,xmmC
- por xmmA,xmmB
- por xmmE,xmmG
-.adj0: ; ----------------
- maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+ movd XMM_DWORD [edi], xmmA
%endif ; RGB_PIXELSIZE ; ---------------
JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_fancy_upsample_neon
+ JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
+ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
+
/* SIMD Sample Conversion */
EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
JDIMENSION start_col,
{
init_simd();
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
+ if (simd_support & JSIMD_ARM_NEON)
+ return 1;
+
return 0;
}
JSAMPARRAY input_data,
JSAMPARRAY * output_data_ptr)
{
+ if (simd_support & JSIMD_ARM_NEON)
+ jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data, output_data_ptr);
}
GLOBAL(int)
.unreq SHIFT
.unreq LOOP_COUNT
.endfunc
+
+/*****************************************************************************/
+
+/*
+ * GLOBAL(void)
+ * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
+ * JDIMENSION downsampled_width,
+ * JSAMPARRAY input_data,
+ * JSAMPARRAY * output_data_ptr);
+ *
+ * Note: the use of unaligned writes is the main remaining bottleneck in
+ * this code, which can be potentially solved to get up to tens
+ * of percents performance improvement on Cortex-A8/Cortex-A9.
+ */
+
+/*
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
+ * Register d28 is used for multiplication by 3. Register q15 is used
+ * for adding +1 bias.
+ */
+.macro upsample16 OUTPTR, INPTR
+ vld1.8 {q0}, [\INPTR]!
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ vmov q1, q0 /* backup source pixels to q1 */
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample 32 source pixels to 64 destination pixels. Compared to 'usample16'
+ * macro, the roles of q0 and q1 registers are reversed for even and odd
+ * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
+ * Also this unrolling allows to reorder loads and stores to compensate
+ * multiplication latency and reduce stalls.
+ */
+.macro upsample32 OUTPTR, INPTR
+ /* even 16 pixels group */
+ vld1.8 {q0}, [\INPTR]!
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ /* odd 16 pixels group */
+ vld1.8 {q1}, [\INPTR]!
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vmovl.u8 q8, d2
+ vext.8 q2, q0, q1, #15
+ vmovl.u8 q9, d3
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d2, d28
+ vmlal.u8 q11, d3, d28
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+ vrshrn.u16 d6, q8, #2
+ vrshrn.u16 d7, q9, #2
+ vshrn.u16 d8, q10, #2
+ vshrn.u16 d9, q11, #2
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
+.endm
+
+/*
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
+ */
+.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
+ /* special case for the first and last pixels */
+ sub \WIDTH, \WIDTH, #1
+ add \OUTPTR, \OUTPTR, #1
+ ldrb \TMP1, [\INPTR, \WIDTH]
+ strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
+ ldrb \TMP1, [\INPTR], #1
+ strb \TMP1, [\OUTPTR, #-1]
+ vmov.8 d3[7], \TMP1
+
+ subs \WIDTH, \WIDTH, #32
+ blt 5f
+0: /* process 32 pixels per iteration */
+ upsample32 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #32
+ bge 0b
+5:
+ adds \WIDTH, \WIDTH, #16
+ blt 1f
+0: /* process 16 pixels if needed */
+ upsample16 \OUTPTR, \INPTR
+ subs \WIDTH, \WIDTH, #16
+1:
+ adds \WIDTH, \WIDTH, #16
+ beq 9f
+
+ /* load the remaining 1-15 pixels */
+ add \INPTR, \INPTR, \WIDTH
+ tst \WIDTH, #1
+ beq 2f
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vext.8 d0, d0, d0, #6
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vrev64.32 d0, d0
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[3]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[2]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[1]}, [\INPTR]
+ sub \INPTR, \INPTR, #1
+ vld1.8 {d0[0]}, [\INPTR]
+2:
+ tst \WIDTH, #8
+ beq 2f
+ vmov d1, d0
+ sub \INPTR, \INPTR, #8
+ vld1.8 {d0}, [\INPTR]
+2: /* upsample the remaining pixels */
+ vmovl.u8 q8, d0
+ vext.8 q2, q1, q0, #15
+ vmovl.u8 q9, d1
+ vaddw.u8 q10, q15, d4
+ vaddw.u8 q11, q15, d5
+ vmlal.u8 q8, d4, d28
+ vmlal.u8 q9, d5, d28
+ vmlal.u8 q10, d0, d28
+ vmlal.u8 q11, d1, d28
+ vrshrn.u16 d10, q8, #2
+ vrshrn.u16 d12, q9, #2
+ vshrn.u16 d11, q10, #2
+ vshrn.u16 d13, q11, #2
+ vzip.8 d10, d11
+ vzip.8 d12, d13
+ /* store the remaining pixels */
+ tst \WIDTH, #8
+ beq 2f
+ vst1.8 {d10, d11}, [\OUTPTR]!
+ vmov q5, q6
+2:
+ tst \WIDTH, #4
+ beq 2f
+ vst1.8 {d10}, [\OUTPTR]!
+ vmov d10, d11
+2:
+ tst \WIDTH, #2
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]!
+ vst1.8 {d10[1]}, [\OUTPTR]!
+ vst1.8 {d10[2]}, [\OUTPTR]!
+ vst1.8 {d10[3]}, [\OUTPTR]!
+ vext.8 d10, d10, d10, #4
+2:
+ tst \WIDTH, #1
+ beq 2f
+ vst1.8 {d10[0]}, [\OUTPTR]!
+ vst1.8 {d10[1]}, [\OUTPTR]!
+2:
+9:
+.endm
+
+asm_function jsimd_h2v1_fancy_upsample_neon
+
+ MAX_V_SAMP_FACTOR .req r0
+ DOWNSAMPLED_WIDTH .req r1
+ INPUT_DATA .req r2
+ OUTPUT_DATA_PTR .req r3
+ OUTPUT_DATA .req OUTPUT_DATA_PTR
+
+ OUTPTR .req r4
+ INPTR .req r5
+ WIDTH .req ip
+ TMP .req lr
+
+ push {r4, r5, r6, lr}
+ vpush {d8-d15}
+
+ ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
+ cmp MAX_V_SAMP_FACTOR, #0
+ ble 99f
+
+ /* initialize constants */
+ vmov.u8 d28, #3
+ vmov.u16 q15, #1
+11:
+ ldr INPTR, [INPUT_DATA], #4
+ ldr OUTPTR, [OUTPUT_DATA], #4
+ mov WIDTH, DOWNSAMPLED_WIDTH
+ upsample_row OUTPTR, INPTR, WIDTH, TMP
+ subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+ bgt 11b
+
+99:
+ vpop {d8-d15}
+ pop {r4, r5, r6, pc}
+
+ .unreq MAX_V_SAMP_FACTOR
+ .unreq DOWNSAMPLED_WIDTH
+ .unreq INPUT_DATA
+ .unreq OUTPUT_DATA_PTR
+ .unreq OUTPUT_DATA
+
+ .unreq OUTPTR
+ .unreq INPTR
+ .unreq WIDTH
+ .unreq TMP
+
+.endfunc
+
+.purgem upsample16
+.purgem upsample32
+.purgem upsample_row
#include "../jmorecfg.h"
#include "jsimd.h"
-#define define(var) %define _cpp_protection_##var
-#define definev(var) %define _cpp_protection_##var var
-
;
; -- jpeglib.h
;
-definev(DCTSIZE)
-definev(DCTSIZE2)
+%define _cpp_protection_DCTSIZE DCTSIZE
+%define _cpp_protection_DCTSIZE2 DCTSIZE2
;
; -- jmorecfg.h
;
-definev(RGB_RED)
-definev(RGB_GREEN)
-definev(RGB_BLUE)
-definev(RGB_PIXELSIZE)
+%define _cpp_protection_RGB_RED RGB_RED
+%define _cpp_protection_RGB_GREEN RGB_GREEN
+%define _cpp_protection_RGB_BLUE RGB_BLUE
+%define _cpp_protection_RGB_PIXELSIZE RGB_PIXELSIZE
-definev(EXT_RGB_RED)
-definev(EXT_RGB_GREEN)
-definev(EXT_RGB_BLUE)
-definev(EXT_RGB_PIXELSIZE)
+%define _cpp_protection_EXT_RGB_RED EXT_RGB_RED
+%define _cpp_protection_EXT_RGB_GREEN EXT_RGB_GREEN
+%define _cpp_protection_EXT_RGB_BLUE EXT_RGB_BLUE
+%define _cpp_protection_EXT_RGB_PIXELSIZE EXT_RGB_PIXELSIZE
-definev(EXT_RGBX_RED)
-definev(EXT_RGBX_GREEN)
-definev(EXT_RGBX_BLUE)
-definev(EXT_RGBX_PIXELSIZE)
+%define _cpp_protection_EXT_RGBX_RED EXT_RGBX_RED
+%define _cpp_protection_EXT_RGBX_GREEN EXT_RGBX_GREEN
+%define _cpp_protection_EXT_RGBX_BLUE EXT_RGBX_BLUE
+%define _cpp_protection_EXT_RGBX_PIXELSIZE EXT_RGBX_PIXELSIZE
-definev(EXT_BGR_RED)
-definev(EXT_BGR_GREEN)
-definev(EXT_BGR_BLUE)
-definev(EXT_BGR_PIXELSIZE)
+%define _cpp_protection_EXT_BGR_RED EXT_BGR_RED
+%define _cpp_protection_EXT_BGR_GREEN EXT_BGR_GREEN
+%define _cpp_protection_EXT_BGR_BLUE EXT_BGR_BLUE
+%define _cpp_protection_EXT_BGR_PIXELSIZE EXT_BGR_PIXELSIZE
-definev(EXT_BGRX_RED)
-definev(EXT_BGRX_GREEN)
-definev(EXT_BGRX_BLUE)
-definev(EXT_BGRX_PIXELSIZE)
+%define _cpp_protection_EXT_BGRX_RED EXT_BGRX_RED
+%define _cpp_protection_EXT_BGRX_GREEN EXT_BGRX_GREEN
+%define _cpp_protection_EXT_BGRX_BLUE EXT_BGRX_BLUE
+%define _cpp_protection_EXT_BGRX_PIXELSIZE EXT_BGRX_PIXELSIZE
-definev(EXT_XBGR_RED)
-definev(EXT_XBGR_GREEN)
-definev(EXT_XBGR_BLUE)
-definev(EXT_XBGR_PIXELSIZE)
+%define _cpp_protection_EXT_XBGR_RED EXT_XBGR_RED
+%define _cpp_protection_EXT_XBGR_GREEN EXT_XBGR_GREEN
+%define _cpp_protection_EXT_XBGR_BLUE EXT_XBGR_BLUE
+%define _cpp_protection_EXT_XBGR_PIXELSIZE EXT_XBGR_PIXELSIZE
-definev(EXT_XRGB_RED)
-definev(EXT_XRGB_GREEN)
-definev(EXT_XRGB_BLUE)
-definev(EXT_XRGB_PIXELSIZE)
+%define _cpp_protection_EXT_XRGB_RED EXT_XRGB_RED
+%define _cpp_protection_EXT_XRGB_GREEN EXT_XRGB_GREEN
+%define _cpp_protection_EXT_XRGB_BLUE EXT_XRGB_BLUE
+%define _cpp_protection_EXT_XRGB_PIXELSIZE EXT_XRGB_PIXELSIZE
%define RGBX_FILLER_0XFF 1
%define JSAMPLE byte ; unsigned char
%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE)
-definev(CENTERJSAMPLE)
+%define _cpp_protection_CENTERJSAMPLE CENTERJSAMPLE
; Representation of a DCT frequency coefficient.
; On this SIMD implementation, this must be 'short'.
; -- jsimd.h
;
-definev(JSIMD_NONE)
-definev(JSIMD_MMX)
-definev(JSIMD_3DNOW)
-definev(JSIMD_SSE)
-definev(JSIMD_SSE2)
+%define _cpp_protection_JSIMD_NONE JSIMD_NONE
+%define _cpp_protection_JSIMD_MMX JSIMD_MMX
+%define _cpp_protection_JSIMD_3DNOW JSIMD_3DNOW
+%define _cpp_protection_JSIMD_SSE JSIMD_SSE
+%define _cpp_protection_JSIMD_SSE2 JSIMD_SSE2
; Short forms of external names for systems with brain-damaged linkers.
;
#ifdef NEED_SHORT_EXTERNAL_NAMES
-definev(jpeg_simd_cpu_support)
-definev(jsimd_rgb_ycc_convert_mmx)
-definev(jsimd_ycc_rgb_convert_mmx)
-definev(jconst_rgb_ycc_convert_sse2)
-definev(jsimd_rgb_ycc_convert_sse2)
-definev(jconst_ycc_rgb_convert_sse2)
-definev(jsimd_ycc_rgb_convert_sse2)
-definev(jsimd_h2v2_downsample_mmx)
-definev(jsimd_h2v1_downsample_mmx)
-definev(jsimd_h2v2_downsample_sse2)
-definev(jsimd_h2v1_downsample_sse2)
-definev(jsimd_h2v2_upsample_mmx)
-definev(jsimd_h2v1_upsample_mmx)
-definev(jsimd_h2v1_fancy_upsample_mmx)
-definev(jsimd_h2v2_fancy_upsample_mmx)
-definev(jsimd_h2v1_merged_upsample_mmx)
-definev(jsimd_h2v2_merged_upsample_mmx)
-definev(jsimd_h2v2_upsample_sse2)
-definev(jsimd_h2v1_upsample_sse2)
-definev(jconst_fancy_upsample_sse2)
-definev(jsimd_h2v1_fancy_upsample_sse2)
-definev(jsimd_h2v2_fancy_upsample_sse2)
-definev(jconst_merged_upsample_sse2)
-definev(jsimd_h2v1_merged_upsample_sse2)
-definev(jsimd_h2v2_merged_upsample_sse2)
-definev(jsimd_convsamp_mmx)
-definev(jsimd_convsamp_sse2)
-definev(jsimd_convsamp_float_3dnow)
-definev(jsimd_convsamp_float_sse)
-definev(jsimd_convsamp_float_sse2)
-definev(jsimd_fdct_islow_mmx)
-definev(jsimd_fdct_ifast_mmx)
-definev(jconst_fdct_islow_sse2)
-definev(jsimd_fdct_islow_sse2)
-definev(jconst_fdct_ifast_sse2)
-definev(jsimd_fdct_ifast_sse2)
-definev(jsimd_fdct_float_3dnow)
-definev(jconst_fdct_float_sse)
-definev(jsimd_fdct_float_sse)
-definev(jsimd_quantize_mmx)
-definev(jsimd_quantize_sse2)
-definev(jsimd_quantize_float_3dnow)
-definev(jsimd_quantize_float_sse)
-definev(jsimd_quantize_float_sse2)
-definev(jsimd_idct_2x2_mmx)
-definev(jsimd_idct_4x4_mmx)
-definev(jconst_idct_red_sse2)
-definev(jsimd_idct_2x2_sse2)
-definev(jsimd_idct_4x4_sse2)
-definev(jsimd_idct_islow_mmx)
-definev(jsimd_idct_ifast_mmx)
-definev(jconst_idct_islow_sse2)
-definev(jsimd_idct_islow_sse2)
-definev(jconst_idct_ifast_sse2)
-definev(jsimd_idct_ifast_sse2)
-definev(jsimd_idct_float_3dnow)
-definev(jconst_idct_float_sse)
-definev(jsimd_idct_float_sse)
-definev(jconst_idct_float_sse2)
-definev(jsimd_idct_float_sse2)
+%define _cpp_protection_jpeg_simd_cpu_support jpeg_simd_cpu_support
+%define _cpp_protection_jsimd_rgb_ycc_convert_mmx jsimd_rgb_ycc_convert_mmx
+%define _cpp_protection_jsimd_ycc_rgb_convert_mmx jsimd_ycc_rgb_convert_mmx
+%define _cpp_protection_jconst_rgb_ycc_convert_sse2 jconst_rgb_ycc_convert_sse2
+%define _cpp_protection_jsimd_rgb_ycc_convert_sse2 jsimd_rgb_ycc_convert_sse2
+%define _cpp_protection_jconst_ycc_rgb_convert_sse2 jconst_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_ycc_rgb_convert_sse2 jsimd_ycc_rgb_convert_sse2
+%define _cpp_protection_jsimd_h2v2_downsample_mmx jsimd_h2v2_downsample_mmx
+%define _cpp_protection_jsimd_h2v1_downsample_mmx jsimd_h2v1_downsample_mmx
+%define _cpp_protection_jsimd_h2v2_downsample_sse2 jsimd_h2v2_downsample_sse2
+%define _cpp_protection_jsimd_h2v1_downsample_sse2 jsimd_h2v1_downsample_sse2
+%define _cpp_protection_jsimd_h2v2_upsample_mmx jsimd_h2v2_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_upsample_mmx jsimd_h2v1_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_mmx jsimd_h2v1_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_mmx jsimd_h2v2_fancy_upsample_mmx
+%define _cpp_protection_jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_merged_upsample_mmx
+%define _cpp_protection_jsimd_h2v2_upsample_sse2 jsimd_h2v2_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_upsample_sse2 jsimd_h2v1_upsample_sse2
+%define _cpp_protection_jconst_fancy_upsample_sse2 jconst_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_fancy_upsample_sse2 jsimd_h2v1_fancy_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_fancy_upsample_sse2 jsimd_h2v2_fancy_upsample_sse2
+%define _cpp_protection_jconst_merged_upsample_sse2 jconst_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_merged_upsample_sse2
+%define _cpp_protection_jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_merged_upsample_sse2
+%define _cpp_protection_jsimd_convsamp_mmx jsimd_convsamp_mmx
+%define _cpp_protection_jsimd_convsamp_sse2 jsimd_convsamp_sse2
+%define _cpp_protection_jsimd_convsamp_float_3dnow jsimd_convsamp_float_3dnow
+%define _cpp_protection_jsimd_convsamp_float_sse jsimd_convsamp_float_sse
+%define _cpp_protection_jsimd_convsamp_float_sse2 jsimd_convsamp_float_sse2
+%define _cpp_protection_jsimd_fdct_islow_mmx jsimd_fdct_islow_mmx
+%define _cpp_protection_jsimd_fdct_ifast_mmx jsimd_fdct_ifast_mmx
+%define _cpp_protection_jconst_fdct_islow_sse2 jconst_fdct_islow_sse2
+%define _cpp_protection_jsimd_fdct_islow_sse2 jsimd_fdct_islow_sse2
+%define _cpp_protection_jconst_fdct_ifast_sse2 jconst_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_ifast_sse2 jsimd_fdct_ifast_sse2
+%define _cpp_protection_jsimd_fdct_float_3dnow jsimd_fdct_float_3dnow
+%define _cpp_protection_jconst_fdct_float_sse jconst_fdct_float_sse
+%define _cpp_protection_jsimd_fdct_float_sse jsimd_fdct_float_sse
+%define _cpp_protection_jsimd_quantize_mmx jsimd_quantize_mmx
+%define _cpp_protection_jsimd_quantize_sse2 jsimd_quantize_sse2
+%define _cpp_protection_jsimd_quantize_float_3dnow jsimd_quantize_float_3dnow
+%define _cpp_protection_jsimd_quantize_float_sse jsimd_quantize_float_sse
+%define _cpp_protection_jsimd_quantize_float_sse2 jsimd_quantize_float_sse2
+%define _cpp_protection_jsimd_idct_2x2_mmx jsimd_idct_2x2_mmx
+%define _cpp_protection_jsimd_idct_4x4_mmx jsimd_idct_4x4_mmx
+%define _cpp_protection_jconst_idct_red_sse2 jconst_idct_red_sse2
+%define _cpp_protection_jsimd_idct_2x2_sse2 jsimd_idct_2x2_sse2
+%define _cpp_protection_jsimd_idct_4x4_sse2 jsimd_idct_4x4_sse2
+%define _cpp_protection_jsimd_idct_islow_mmx jsimd_idct_islow_mmx
+%define _cpp_protection_jsimd_idct_ifast_mmx jsimd_idct_ifast_mmx
+%define _cpp_protection_jconst_idct_islow_sse2 jconst_idct_islow_sse2
+%define _cpp_protection_jsimd_idct_islow_sse2 jsimd_idct_islow_sse2
+%define _cpp_protection_jconst_idct_ifast_sse2 jconst_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_ifast_sse2 jsimd_idct_ifast_sse2
+%define _cpp_protection_jsimd_idct_float_3dnow jsimd_idct_float_3dnow
+%define _cpp_protection_jconst_idct_float_sse jconst_idct_float_sse
+%define _cpp_protection_jsimd_idct_float_sse jsimd_idct_float_sse
+%define _cpp_protection_jconst_idct_float_sse2 jconst_idct_float_sse2
+%define _cpp_protection_jsimd_idct_float_sse2 jsimd_idct_float_sse2
#endif /* NEED_SHORT_EXTERNAL_NAMES */
%define SEG_CONST .rodata progbits alloc noexec nowrite align=16
%endif
-%define STRICT_MEMORY_ACCESS 1
-
; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
push rsi
push rdi
sub rsp, SIZEOF_XMMWORD
- movlpd XMMWORD [rsp], xmm6
+ movaps XMMWORD [rsp], xmm6
sub rsp, SIZEOF_XMMWORD
- movlpd XMMWORD [rsp], xmm7
+ movaps XMMWORD [rsp], xmm7
%endmacro
%imacro uncollect_args 0
- movlpd xmm7, XMMWORD [rsp]
+ movaps xmm7, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
- movlpd xmm6, XMMWORD [rsp]
+ movaps xmm6, XMMWORD [rsp]
add rsp, SIZEOF_XMMWORD
pop rdi
pop rsi