From 42fa81275c67d7d1ad8d255120af0ffeeb46b963 Mon Sep 17 00:00:00 2001 From: Brian Paul Date: Sat, 7 May 2005 16:59:58 +0000 Subject: [PATCH] x86-64 transform optimizations (Mikko T.) --- Makefile | 1 + configs/linux-x86-64 | 6 +- configs/linux-x86-64-debug | 28 ++ src/mesa/Makefile | 2 + src/mesa/math/m_debug_util.h | 38 +++ src/mesa/math/m_debug_xform.c | 10 +- src/mesa/math/m_xform.c | 6 + src/mesa/sources | 6 +- src/mesa/x86-64/Makefile | 29 +++ src/mesa/x86-64/calling_convention.txt | 50 ++++ src/mesa/x86-64/matypes.h | 164 ++++++++++++ src/mesa/x86-64/x86-64.c | 115 +++++++++ src/mesa/x86-64/x86-64.h | 32 +++ src/mesa/x86-64/xform4.S | 458 +++++++++++++++++++++++++++++++++ src/mesa/x86/assyntax.h | 12 +- src/mesa/x86/gen_matypes.c | 4 +- src/mesa/x86/glapi_x86.S | 10 - 17 files changed, 947 insertions(+), 24 deletions(-) create mode 100644 configs/linux-x86-64-debug create mode 100644 src/mesa/x86-64/Makefile create mode 100644 src/mesa/x86-64/calling_convention.txt create mode 100644 src/mesa/x86-64/matypes.h create mode 100644 src/mesa/x86-64/x86-64.c create mode 100644 src/mesa/x86-64/x86-64.h create mode 100644 src/mesa/x86-64/xform4.S diff --git a/Makefile b/Makefile index 766e72d..7e043dc 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,7 @@ linux-x86 \ linux-x86-debug \ linux-x86-32 \ linux-x86-64 \ +linux-x86-64-debug \ linux-x86-64-static \ linux-x86-glide \ linux-x86-static \ diff --git a/configs/linux-x86-64 b/configs/linux-x86-64 index 9f03754..8e62f91 100644 --- a/configs/linux-x86-64 +++ b/configs/linux-x86-64 @@ -8,14 +8,14 @@ CONFIG_NAME = linux-x86-64 CC = gcc CXX = g++ -CFLAGS = -m64 -Wall -O3 -ansi -pedantic -fPIC -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include +CFLAGS = -m64 -Wall -O3 -std=c99 -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include -DUSE_X86_64_ASM -CXXFLAGS = -m64 
-Wall -O3 -ansi -pedantic -fPIC -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE +CXXFLAGS = -m64 -Wall -O3 -ansi -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE GLUT_CFLAGS = -fexceptions -#ASM_SOURCES = $(X86_SOURCES) +ASM_SOURCES = $(X86-64_SOURCES) LIB_DIR = $(TOP)/lib64 diff --git a/configs/linux-x86-64-debug b/configs/linux-x86-64-debug new file mode 100644 index 0000000..85a4e1f --- /dev/null +++ b/configs/linux-x86-64-debug @@ -0,0 +1,28 @@ +# Configuration for Linux for 64-bit X86 (Opteron) + +include $(TOP)/configs/default + +CONFIG_NAME = linux-x86-64-debug + +# Compiler and flags +CC = gcc +CXX = g++ + +CFLAGS = -g -m64 -Wall -O3 -std=c99 -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DUSE_XSHM -DPTHREADS -I/usr/X11R6/include -DUSE_X86_64_ASM -DDEBUG -DMESA_DEBUG -DRUN_DEBUG_BENCHMARK + +CXXFLAGS = -g -m64 -Wall -O3 -ansi -pedantic -fPIC -D_REENTRANT -D_POSIX_SOURCE -D_POSIX_C_SOURCE=199309L -D_SVID_SOURCE -D_BSD_SOURCE -DDEBUG -DMESA_DEBUG -DRUN_DEBUG_BENCHMARK + +GLUT_CFLAGS = -fexceptions + + +ASM_SOURCES = $(X86-64_SOURCES) + + +LIB_DIR = $(TOP)/lib64 + + +# Library/program dependencies +GL_LIB_DEPS = -L/usr/X11R6/lib64 -lX11 -lXext -lm -lpthread +GLUT_LIB_DEPS = -L$(LIB_DIR) -l$(GLU_LIB) -l$(GL_LIB) -L/usr/X11R6/lib64 -lX11 -lXmu -lXt -lXi -lm +GLW_LIB_DEPS = -L$(LIB_DIR) -l$(GL_LIB) -L/usr/X11R6/lib64 -lXt -lX11 +APP_LIB_DEPS = -L$(LIB_DIR) -l$(GLUT_LIB) -l$(GLU_LIB) -l$(GL_LIB) -lm diff --git a/src/mesa/Makefile b/src/mesa/Makefile index 49fd88f..3ff8da7 100644 --- a/src/mesa/Makefile +++ b/src/mesa/Makefile @@ -146,6 +146,7 @@ osmesa-only: depend subdirs $(LIB_DIR)/$(OSMESA_LIB_NAME) subdirs: @ (cd x86 ; $(MAKE)) + @ (cd x86-64 ; $(MAKE)) # Make the GL library $(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS) @@ -223,5 +224,6 @@ clean: -rm -f drivers/*/*.o (cd drivers/dri ; $(MAKE) clean) (cd x86 ; $(MAKE) clean) +
(cd x86-64 ; $(MAKE) clean) include depend diff --git a/src/mesa/math/m_debug_util.h b/src/mesa/math/m_debug_util.h index c07cdcf..765f54d 100644 --- a/src/mesa/math/m_debug_util.h +++ b/src/mesa/math/m_debug_util.h @@ -185,6 +185,44 @@ extern char *mesa_profile; #endif +#elif defined(__amd64__) + +#define rdtscll(val) do { \ + unsigned int a,d; \ + __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); \ + (val) = ((unsigned long)a) | (((unsigned long)d)<<32); \ +} while(0) + +/* Copied from i386 PIII version */ +#define INIT_COUNTER() \ + do { \ + int cycle_i; \ + counter_overhead = LONG_MAX; \ + for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) { \ + unsigned long cycle_tmp1, cycle_tmp2; \ + rdtscll(cycle_tmp1); \ + rdtscll(cycle_tmp2); \ + if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) { \ + counter_overhead = cycle_tmp2 - cycle_tmp1; \ + } \ + } \ + } while (0) + + +#define BEGIN_RACE(x) \ + x = LONG_MAX; \ + for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) { \ + unsigned long cycle_tmp1, cycle_tmp2; \ + rdtscll(cycle_tmp1); \ + +#define END_RACE(x) \ + rdtscll(cycle_tmp2); \ + if ( x > (cycle_tmp2 - cycle_tmp1) ) { \ + x = cycle_tmp2 - cycle_tmp1; \ + } \ + } \ + x -= counter_overhead; + #elif defined(__sparc__) #define INIT_COUNTER() \ diff --git a/src/mesa/math/m_debug_xform.c b/src/mesa/math/m_debug_xform.c index b634527..d8250f2 100644 --- a/src/mesa/math/m_debug_xform.c +++ b/src/mesa/math/m_debug_xform.c @@ -166,7 +166,7 @@ ALIGN16(static GLfloat, d[TEST_COUNT][4]); ALIGN16(static GLfloat, r[TEST_COUNT][4]); static int test_transform_function( transform_func func, int psize, - int mtype, long *cycles ) + int mtype, unsigned long *cycles ) { GLvector4f source[1], dest[1], ref[1]; GLmatrix mat[1]; @@ -187,7 +187,7 @@ static int test_transform_function( transform_func func, int psize, mat->type = mtypes[mtype]; m = mat->m; - ASSERT( ((GLuint)m & 15) == 0 ); + ASSERT( ((long)m & 15) == 0 ); init_matrix( m ); @@ -279,7 +279,7 @@ static int test_transform_function( 
transform_func func, int psize, void _math_test_all_transform_functions( char *description ) { int psize, mtype; - long benchmark_tab[4][7]; + unsigned long benchmark_tab[4][7]; static int first_time = 1; if ( first_time ) { @@ -291,7 +291,7 @@ void _math_test_all_transform_functions( char *description ) if ( mesa_profile ) { if ( !counter_overhead ) { INIT_COUNTER(); - _mesa_printf("counter overhead: %ld cycles\n\n", counter_overhead ); + _mesa_printf("counter overhead: %lu cycles\n\n", counter_overhead ); } _mesa_printf("transform results after hooking in %s functions:\n", description ); } @@ -310,7 +310,7 @@ void _math_test_all_transform_functions( char *description ) for ( mtype = 0 ; mtype < 7 ; mtype++ ) { for ( psize = 1 ; psize <= 4 ; psize++ ) { transform_func func = _mesa_transform_tab[psize][mtypes[mtype]]; - long *cycles = &(benchmark_tab[psize-1][mtype]); + unsigned long *cycles = &(benchmark_tab[psize-1][mtype]); if ( test_transform_function( func, psize, mtype, cycles ) == 0 ) { char buf[100]; diff --git a/src/mesa/math/m_xform.c b/src/mesa/math/m_xform.c index 66dc44d..5366e34 100644 --- a/src/mesa/math/m_xform.c +++ b/src/mesa/math/m_xform.c @@ -51,6 +51,10 @@ #include "x86/common_x86_asm.h" #endif +#ifdef USE_X86_64_ASM +#include "x86-64/x86-64.h" +#endif + #ifdef USE_SPARC_ASM #include "sparc/sparc.h" #endif @@ -212,6 +216,8 @@ _math_init_transformation( void ) _mesa_init_all_sparc_transform_asm(); #elif defined( USE_PPC_ASM ) _mesa_init_all_ppc_transform_asm(); +#elif defined( USE_X86_64_ASM ) + _mesa_init_all_x86_64_transform_asm(); #endif } diff --git a/src/mesa/sources b/src/mesa/sources index c4249a7..f2f3b6b 100644 --- a/src/mesa/sources +++ b/src/mesa/sources @@ -197,7 +197,8 @@ ASM_C_SOURCES = \ x86/3dnow.c \ x86/sse.c \ sparc/sparc.c \ - ppc/common_ppc.c + ppc/common_ppc.c \ + x86-64/x86-64.c X86_SOURCES = \ x86/common_x86_asm.S \ @@ -222,6 +223,9 @@ X86_SOURCES = \ X86_API = \ x86/glapi_x86.S +X86-64_SOURCES = \ + x86-64/xform4.S + 
SPARC_SOURCES = \ sparc/clip.S \ sparc/norm.S \ diff --git a/src/mesa/x86-64/Makefile b/src/mesa/x86-64/Makefile new file mode 100644 index 0000000..252218c --- /dev/null +++ b/src/mesa/x86-64/Makefile @@ -0,0 +1,29 @@ +# src/mesa/x86-64/Makefile + +TOP = ../../.. + +include $(TOP)/configs/current + + + +INCLUDE_DIRS = \ + -I$(TOP)/include/GL \ + -I$(TOP)/include \ + -I.. \ + -I../main \ + -I../math \ + -I../glapi \ + -I../tnl + + +default: matypes.h + +clean: + rm -f matypes.h + + +# need some special rules here, unfortunately +matypes.h: ../main/mtypes.h ../tnl/t_context.h ../x86/gen_matypes + ../x86/gen_matypes | grep -v '#include "assyntax.h' > matypes.h + +xform4.o: matypes.h diff --git a/src/mesa/x86-64/calling_convention.txt b/src/mesa/x86-64/calling_convention.txt new file mode 100644 index 0000000..4147f7e --- /dev/null +++ b/src/mesa/x86-64/calling_convention.txt @@ -0,0 +1,50 @@ +Register Usage +rax temporary register; with variable arguments passes information + about the number of SSE registers used; 1st return register + +rbx* callee-saved register; optionally used as base pointer + +rcx used to pass 4th integer argument to functions + +rdx used to pass 3rd argument to functions; 2nd return register + +rsp* stack pointer + +rbp* callee-saved register; optionally used as frame pointer + +rsi used to pass 2nd argument to functions + +rdi used to pass 1st argument to functions + +r8 used to pass 5th argument to functions + +r9 used to pass 6th argument to functions + +r10 temporary register, used for passing a function's static chain pointer + +r11 temporary register + +r12-15* callee-saved registers + +xmm0-1 used to pass and return floating point arguments + +xmm2-7 used to pass floating point arguments + +xmm8-15 temporary registers + +mmx0-7 temporary registers + +st0 temporary register; used to return long double arguments + +st1 temporary register; used to return long double arguments + +st2­7 temporary registers + +fs Reserved for system use (as
thread specific data register) + + + +*) must be preserved across function calls + +Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack +Floating point arguments from list: xmm0-xmm7 \ No newline at end of file diff --git a/src/mesa/x86-64/matypes.h b/src/mesa/x86-64/matypes.h new file mode 100644 index 0000000..cdface9 --- /dev/null +++ b/src/mesa/x86-64/matypes.h @@ -0,0 +1,164 @@ +/* + * This file is automatically generated from the Mesa internal type + * definitions. Do not edit directly. + */ + +#ifndef __ASM_TYPES_H__ +#define __ASM_TYPES_H__ + + + +/* ============================================================= + * Offsets for GLcontext + */ + +#define CTX_DRIVER_CTX 904 + +#define CTX_LIGHT_ENABLED 38592 +#define CTX_LIGHT_SHADE_MODEL 38596 +#define CTX_LIGHT_COLOR_MAT_FACE 38600 +#define CTX_LIGHT_COLOR_MAT_MODE 38604 +#define CTX_LIGHT_COLOR_MAT_MASK 38608 +#define CTX_LIGHT_COLOR_MAT_ENABLED 38612 +#define CTX_LIGHT_ENABLED_LIST 38616 +#define CTX_LIGHT_NEED_VERTS 42973 +#define CTX_LIGHT_FLAGS 42976 +#define CTX_LIGHT_BASE_COLOR 42980 + + +/* ============================================================= + * Offsets for struct vertex_buffer + */ + +#define VB_SIZE 0 +#define VB_COUNT 4 + +#define VB_ELTS 8 +#define VB_OBJ_PTR 12 +#define VB_EYE_PTR 16 +#define VB_CLIP_PTR 20 +#define VB_PROJ_CLIP_PTR 24 +#define VB_CLIP_OR_MASK 28 +#define VB_CLIP_MASK 32 +#define VB_NORMAL_PTR 36 +#define VB_EDGE_FLAG 44 +#define VB_TEX0_COORD_PTR 48 +#define VB_TEX1_COORD_PTR 52 +#define VB_TEX2_COORD_PTR 56 +#define VB_TEX3_COORD_PTR 60 +#define VB_INDEX_PTR 80 +#define VB_COLOR_PTR 88 +#define VB_SECONDARY_COLOR_PTR 96 +#define VB_FOG_COORD_PTR 108 +#define VB_POINT_SIZE_PTR 104 +#define VB_PRIMITIVE 112 + +#define VB_LAST_CLIPPED 244 + +/* + * Flags for struct vertex_buffer + */ + +#define VERT_BIT_OBJ 0x1 +#define VERT_BIT_NORM 0x4 +#define VERT_BIT_RGBA 0x8 +#define VERT_BIT_SPEC_RGB 0x10 +#define VERT_BIT_FOG_COORD 0x20 +#define VERT_BIT_TEX0 0x100 +#define 
VERT_BIT_TEX1 0x200 +#define VERT_BIT_TEX2 0x400 +#define VERT_BIT_TEX3 0x800 + + +/* ============================================================= + * Offsets for GLvector4f + */ + +#define V4F_DATA 0 +#define V4F_START 4 +#define V4F_COUNT 8 +#define V4F_STRIDE 12 +#define V4F_SIZE 16 +#define V4F_FLAGS 20 + +/* + * Flags for GLvector4f + */ + +#define VEC_MALLOC 0x10 +#define VEC_NOT_WRITEABLE 0x40 +#define VEC_BAD_STRIDE 0x100 + +#define VEC_SIZE_1 0x1 +#define VEC_SIZE_2 0x3 +#define VEC_SIZE_3 0x7 +#define VEC_SIZE_4 0xf + + +/* ============================================================= + * Offsets for GLmatrix + */ + +#define MATRIX_DATA 0 +#define MATRIX_INV 4 +#define MATRIX_FLAGS 8 +#define MATRIX_TYPE 12 + + +/* ============================================================= + * Offsets for struct gl_light + */ + +#define LIGHT_NEXT 0 +#define LIGHT_PREV 4 + +#define LIGHT_AMBIENT 8 +#define LIGHT_DIFFUSE 24 +#define LIGHT_SPECULAR 40 +#define LIGHT_EYE_POSITION 56 +#define LIGHT_EYE_DIRECTION 72 +#define LIGHT_SPOT_EXPONENT 88 +#define LIGHT_SPOT_CUTOFF 92 +#define LIGHT_COS_CUTOFF 96 +#define LIGHT_CONST_ATTEN 100 +#define LIGHT_LINEAR_ATTEN 104 +#define LIGHT_QUADRATIC_ATTEN 108 +#define LIGHT_ENABLED 112 + +#define LIGHT_FLAGS 116 + +#define LIGHT_POSITION 120 +#define LIGHT_VP_INF_NORM 136 +#define LIGHT_H_INF_NORM 148 +#define LIGHT_NORM_DIRECTION 160 +#define LIGHT_VP_INF_SPOT_ATTEN 176 + +#define LIGHT_SPOT_EXP_TABLE 180 +#define LIGHT_MAT_AMBIENT 4276 +#define LIGHT_MAT_DIFFUSE 4300 +#define LIGHT_MAT_SPECULAR 4324 + +#define SIZEOF_GL_LIGHT 4356 + +/* + * Flags for struct gl_light + */ + +#define LIGHT_SPOT 0x1 +#define LIGHT_LOCAL_VIEWER 0x2 +#define LIGHT_POSITIONAL 0x4 + +#define LIGHT_NEED_VERTICES 0x6 + + +/* ============================================================= + * Offsets for struct gl_lightmodel + */ + +#define LIGHT_MODEL_AMBIENT 0 +#define LIGHT_MODEL_LOCAL_VIEWER 16 +#define LIGHT_MODEL_TWO_SIDE 17 +#define 
LIGHT_MODEL_COLOR_CONTROL 20 + + +#endif /* __ASM_TYPES_H__ */ diff --git a/src/mesa/x86-64/x86-64.c b/src/mesa/x86-64/x86-64.c new file mode 100644 index 0000000..e70bc66 --- /dev/null +++ b/src/mesa/x86-64/x86-64.c @@ -0,0 +1,115 @@ +/* $Id: x86-64.c,v 1.1 2005/05/07 16:59:59 brianp Exp $ */ + +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2003 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* + * x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by + * Mikko Tiihonen + */ + +#ifdef USE_X86_64_ASM + +#include "glheader.h" +#include "context.h" +#include "math/m_xform.h" +#include "tnl/t_context.h" +#include "x86-64.h" +#include "../x86/common_x86_macros.h" + +#ifdef DEBUG +#include "math/m_debug.h" +#endif + +DECLARE_XFORM_GROUP( x86_64, 4 ) + +#endif + +/* +extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS ); +extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS ); +*/ + +#ifdef USE_X86_64_ASM +static void message( const char *msg ) +{ + GLboolean debug; +#ifdef DEBUG + debug = GL_TRUE; +#else + if ( _mesa_getenv( "MESA_DEBUG" ) ) { + debug = GL_TRUE; + } else { + debug = GL_FALSE; + } +#endif + if ( debug ) { + fprintf( stderr, "%s", msg ); + } +} +#endif + + +void _mesa_init_all_x86_64_transform_asm(void) +{ +#ifdef USE_X86_64_ASM + + if ( _mesa_getenv( "MESA_NO_ASM" ) ) { + return; + } + + message("Initializing x86-64 optimizations\n"); + + ASSIGN_XFORM_GROUP( x86_64, 4 ); + + /* + _mesa_transform_tab[4][MATRIX_GENERAL] = + _mesa_x86_64_transform_points4_general; + _mesa_transform_tab[4][MATRIX_IDENTITY] = + _mesa_x86_64_transform_points4_identity; + _mesa_transform_tab[4][MATRIX_3D] = + _mesa_x86_64_transform_points4_3d; + _mesa_transform_tab[4][MATRIX_3D_NO_ROT] = + _mesa_x86_64_transform_points4_3d_no_rot; + _mesa_transform_tab[4][MATRIX_PERSPECTIVE] = + _mesa_x86_64_transform_points4_perspective; + _mesa_transform_tab[4][MATRIX_2D_NO_ROT] = + _mesa_x86_64_transform_points4_2d_no_rot; + _mesa_transform_tab[4][MATRIX_2D] = + _mesa_x86_64_transform_points4_2d; + */ +
+#ifdef DEBUG + _math_test_all_transform_functions("x86_64"); + _math_test_all_cliptest_functions("x86_64"); + _math_test_all_normal_transform_functions("x86_64"); +#endif + +#endif +} diff --git a/src/mesa/x86-64/x86-64.h b/src/mesa/x86-64/x86-64.h new file mode 100644 index 0000000..fdbd154 --- /dev/null +++ b/src/mesa/x86-64/x86-64.h @@ -0,0 +1,32 @@ +/* $Id: x86-64.h,v 1.1 2005/05/07 16:59:59 brianp Exp $ */ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __X86_64_ASM_H__ +#define __X86_64_ASM_H__ + +extern void _mesa_init_all_x86_64_transform_asm( void ); + +#endif diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S new file mode 100644 index 0000000..622c3f0 --- /dev/null +++ b/src/mesa/x86-64/xform4.S @@ -0,0 +1,458 @@ +/* $Id: xform4.S,v 1.1 2005/05/07 16:59:59 brianp Exp $ */ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifdef USE_X86_64_ASM + +#include "matypes.h" + +.text + +.align 16 + +.globl _mesa_x86_64_transform_points4_general +_mesa_x86_64_transform_points4_general: +/* + * rdi = dest + * rsi = matrix + * rdx = source + */ + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + testl %ecx, %ecx /* verify non-zero count */ + prefetchnta 64(%rsi) + jz p4_general_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + prefetch 16(%rdx) + + movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ + movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */ + movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */ + +p4_general_loop: + + movaps (%rdx), %xmm8 /* ox | oy | oz | ow */ + prefetchw 16(%rdi) + + pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ + addq %rax, %rdx /* advance src ptr by stride */ + pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */ + mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */ + mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ + pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */ + mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ + addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ + mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ + addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ + prefetch 16(%rdx) + addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ...
*/ + + movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ + addq $16, %rdi + + decl %ecx + jnz p4_general_loop + +p4_general_done: + .byte 0xf3 + ret + +.section .rodata + +.align 16 +p4_constants: +.byte 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff +.byte 0x00, 0x00, 0x00, 0x00 + +.byte 0x00, 0x00, 0x00, 0x00 +.byte 0x00, 0x00, 0x00, 0x00 +.byte 0x00, 0x00, 0x00, 0x00 +.float 0f+1.0 + +.text +.align 16 +.globl _mesa_x86_64_transform_points4_3d +/* + * this is slower than _mesa_x86_64_transform_points4_general + * because it ensures that the last matrix row (or is it column?) is 0,0,0,1 + */ +_mesa_x86_64_transform_points4_3d: + + leaq p4_constants(%rip), %rax + + prefetchnta 64(%rsi) + + movaps (%rax), %xmm9 + movaps 16(%rax), %xmm10 + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + testl %ecx, %ecx /* verify non-zero count */ + jz p4_3d_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + prefetch 16(%rdx) + + movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */ + movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */ + andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */ + movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */ + andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */ + movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */ + andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */ + andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */ + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */ + +p4_3d_loop: + + movaps (%rdx), %xmm8 /* ox | oy | oz | ow */ + prefetchw 16(%rdi) + + pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */ + addq %rax, %rdx + pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */ + mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */ + 
pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */ + mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */ + pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */ + mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */ + addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */ + mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */ + addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */ + prefetch 16(%rdx) + addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */ + + movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */ + addq $16, %rdi + + dec %ecx + jnz p4_3d_loop + +p4_3d_done: + .byte 0xf3 + ret + + +.align 16 +.globl _mesa_x86_64_transform_points4_identity +_mesa_x86_64_transform_points4_identity: + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + test %ecx, %ecx + jz p4_identity_done + + movq V4F_START(%rdx), %rsi /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + prefetch 64(%rsi) + prefetchw 64(%rdi) + + add %ecx, %ecx /* 2 quadwords (16 bytes) per vertex */ + + rep movsq /* NOTE(review): contiguous copy; the stride loaded into %eax is not used here */ + +p4_identity_done: + .byte 0xf3 + ret + + +.align 16 +.globl _mesa_x86_64_transform_points4_3d_no_rot +_mesa_x86_64_transform_points4_3d_no_rot: + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + test %ecx, %ecx + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + jz p4_3d_no_rot_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + prefetch (%rdx) + + movd (%rsi), %mm0 /* | m00 */ + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + punpckldq 20(%rsi), %mm0 /* m11 | m00 */ + + movd 40(%rsi),
%mm2 /* | m22 */ + movq 48(%rsi), %mm1 /* m31 | m30 */ + + punpckldq 56(%rsi), %mm2 /* m32 | m22 */ + +p4_3d_no_rot_loop: + + prefetchw 32(%rdi) + + movq (%rdx), %mm4 /* x1 | x0 */ + movq 8(%rdx), %mm5 /* x3 | x2 */ + movd 12(%rdx), %mm7 /* | x3 */ + + movq %mm5, %mm6 /* x3 | x2 */ + pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ + + punpckhdq %mm6, %mm6 /* x3 | x3 */ + pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */ + + pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ + pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */ + + pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ + + addq %rax, %rdx /* advance src ptr by stride */ + movq %mm4, (%rdi) /* write r0, r1 */ + movq %mm5, 8(%rdi) /* write r2, r3 */ + + addq $16, %rdi + + decl %ecx + prefetch 32(%rdx) + jnz p4_3d_no_rot_loop + +p4_3d_no_rot_done: + femms + ret + + +.align 16 +.globl _mesa_x86_64_transform_points4_perspective +_mesa_x86_64_transform_points4_perspective: + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + test %ecx, %ecx + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + jz p4_perspective_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + movd (%rsi), %mm0 /* | m00 */ + pxor %mm7, %mm7 /* 0 | 0 */ + punpckldq 20(%rsi), %mm0 /* m11 | m00 */ + + movq 32(%rsi), %mm2 /* m21 | m20 */ + prefetch (%rdx) + + movd 40(%rsi), %mm1 /* | m22 */ + + .byte 0x66, 0x66, 0x90 /* manual align += 3 */ + punpckldq 56(%rsi), %mm1 /* m32 | m22 */ + + +p4_perspective_loop: + + prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + + movq (%rdx), %mm4 /* x1 | x0 */ + movq 8(%rdx), %mm5 /* x3 | x2 */ + movd 8(%rdx), %mm3 /* | x2 */ + + movq %mm5, %mm6 /* x3 | x2 */ + pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ + + punpckldq %mm5, %mm5 /* x2 | x2 */ + + pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */ + pfsubr %mm7, %mm3 /* | -x2 */ +
pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */ + pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */ + + pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */ + + movq %mm5, (%rdi) /* write r0, r1 */ + addq %rax, %rdx + movq %mm6, 8(%rdi) /* write r2, r3 */ + + addq $16, %rdi + + decl %ecx + prefetch 32(%rdx) /* hopefully stride is zero */ + jnz p4_perspective_loop + +p4_perspective_done: + femms + ret + +.align 16 +.globl _mesa_x86_64_transform_points4_2d_no_rot +_mesa_x86_64_transform_points4_2d_no_rot: + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + test %ecx, %ecx + .byte 0x90 /* manual align += 1 */ + jz p4_2d_no_rot_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + movd (%rsi), %mm0 /* | m00 */ + prefetch (%rdx) + punpckldq 20(%rsi), %mm0 /* m11 | m00 */ + + movq 48(%rsi), %mm1 /* m31 | m30 */ + +p4_2d_no_rot_loop: + + prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + + movq (%rdx), %mm4 /* x1 | x0 */ + movq 8(%rdx), %mm5 /* x3 | x2 */ + + pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */ + movq %mm5, %mm6 /* x3 | x2 */ + + punpckhdq %mm6, %mm6 /* x3 | x3 */ + + addq %rax, %rdx + pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */ + + prefetch 32(%rdx) /* hopefully stride is zero */ + pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */ + + movq %mm6, (%rdi) /* write r0, r1 */ + movq %mm5, 8(%rdi) /* write r2, r3 */ + + addq $16, %rdi + + decl %ecx + jnz p4_2d_no_rot_loop + +p4_2d_no_rot_done: + femms + ret + + +.align 16 +.globl _mesa_x86_64_transform_points4_2d +_mesa_x86_64_transform_points4_2d: + + movl V4F_COUNT(%rdx), %ecx /* count */ + movzx V4F_STRIDE(%rdx), %eax /* stride */ + + movl %ecx, V4F_COUNT(%rdi) /* set dest count */ + movl $4, V4F_SIZE(%rdi) /* set dest size */ + .byte 0x66, 0x66, 0x90 /* manual align += 4 
*/ + orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */ + + test %ecx, %ecx + .byte 0x66, 0x66, 0x90 /* manual align += 4 */ + jz p4_2d_done + + movq V4F_START(%rdx), %rdx /* ptr to first src vertex */ + movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */ + + movd (%rsi), %mm0 /* | m00 */ + movd 4(%rsi), %mm1 /* | m01 */ + + prefetch (%rdx) + + punpckldq 16(%rsi), %mm0 /* m10 | m00 */ + .byte 0x66, 0x66, 0x90 /* manual align += 4 */ + punpckldq 20(%rsi), %mm1 /* m11 | m01 */ + + movq 48(%rsi), %mm2 /* m31 | m30 */ + +p4_2d_loop: + + prefetchw 32(%rdi) /* prefetch 2 vertices ahead */ + + movq (%rdx), %mm3 /* x1 | x0 */ + movq 8(%rdx), %mm5 /* x3 | x2 */ + + movq %mm3, %mm4 /* x1 | x0 */ + movq %mm5, %mm6 /* x3 | x2 */ + + pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */ + punpckhdq %mm6, %mm6 /* x3 | x3 */ + + pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */ + + addq %rax, %rdx + pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */ + + pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */ + prefetch 32(%rdx) /* hopefully stride is zero */ + + pfadd %mm6, %mm3 /* r1 | r0 */ + + movq %mm3, (%rdi) /* write r0, r1 */ + movq %mm5, 8(%rdi) /* write r2, r3 */ + + addq $16, %rdi + + decl %ecx + jnz p4_2d_loop + +p4_2d_done: + femms + ret + +#endif diff --git a/src/mesa/x86/assyntax.h b/src/mesa/x86/assyntax.h index f89cc6c..4b7317b 100644 --- a/src/mesa/x86/assyntax.h +++ b/src/mesa/x86/assyntax.h @@ -1730,11 +1730,17 @@ SECTION _DATA public align=16 class=DATA use32 flat #define TLBL(a) CONCAT(a,$) #endif -/* hidden symbol visibility support */ +/* Hidden symbol visibility support. + * If we build with gcc's -fvisibility=hidden flag, we'll need to change + * the symbol visibility mode to 'default'. 
+ */ #if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__) -#define HIDDEN(a) .hidden a +# define HIDDEN(x) .hidden x +#elif defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303 +# pragma GCC visibility push(default) +# define HIDDEN(x) .hidden x #else -#define HIDDEN(a) +# define HIDDEN(x) #endif #endif /* __ASSYNTAX_H__ */ diff --git a/src/mesa/x86/gen_matypes.c b/src/mesa/x86/gen_matypes.c index 30642e4..d5cee53 100644 --- a/src/mesa/x86/gen_matypes.c +++ b/src/mesa/x86/gen_matypes.c @@ -61,7 +61,7 @@ do { \ printf( "\n" ); \ } while (0) -#if defined(__BEOS__) +#if defined(__BEOS__) || defined(_LP64) #define OFFSET( s, t, m ) \ printf( "#define %s\t%ld\n", s, offsetof( t, m ) ); #else @@ -69,7 +69,7 @@ do { \ printf( "#define %s\t%d\n", s, offsetof( t, m ) ); #endif -#if defined(__BEOS__) +#if defined(__BEOS__) || defined(_LP64) #define SIZEOF( s, t ) \ printf( "#define %s\t%ld\n", s, sizeof(t) ); #else diff --git a/src/mesa/x86/glapi_x86.S b/src/mesa/x86/glapi_x86.S index c0a971b..6e8f32e 100644 --- a/src/mesa/x86/glapi_x86.S +++ b/src/mesa/x86/glapi_x86.S @@ -29,16 +29,6 @@ #include "assyntax.h" #include "glapioffsets.h" -/* If we build with gcc's -fvisibility=hidden flag, we'll need to change -* the symbol visibility mode to 'default'. -*/ -#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303 -# pragma GCC visibility push(default) -# define HIDDEN(x) .hidden x -#else -# define HIDDEN(x) -#endif - #ifndef __WIN32__ #if defined(STDCALL_API) -- 2.7.4