check in Brady's second altivec-related patch that hooks up the asm routines and...
author Josh Coalson <jcoalson@users.sourceforce.net>
Tue, 27 Jul 2004 01:13:16 +0000 (01:13 +0000)
committer Josh Coalson <jcoalson@users.sourceforce.net>
Tue, 27 Jul 2004 01:13:16 +0000 (01:13 +0000)
configure.in
src/libFLAC/cpu.c
src/libFLAC/include/private/cpu.h
src/libFLAC/stream_decoder.c

index 37163ab..607e382 100644 (file)
@@ -208,6 +208,18 @@ if test x$use_3dnow = xtrue ; then
 AC_DEFINE(FLAC__USE_3DNOW)
 fi
 
+AC_ARG_ENABLE(altivec,
+[  --disable-altivec              Disable Altivec optimizations],
+[case "${enableval}" in
+       yes) use_altivec=true ;;
+       no)  use_altivec=false ;;
+       *) AC_MSG_ERROR(bad value ${enableval} for --enable-altivec) ;;
+esac],[use_altivec=true])
+AM_CONDITIONAL(FLaC__USE_ALTIVEC, test x$use_altivec = xtrue)
+if test x$use_altivec = xtrue ; then
+AC_DEFINE(FLAC__USE_ALTIVEC)
+fi
+
 AC_ARG_ENABLE(local-xmms-plugin,
 [  --enable-local-xmms-plugin     Install XMMS plugin to ~/.xmms/Plugins instead of system location],
 [case "${enableval}" in
@@ -330,6 +342,7 @@ AH_TEMPLATE(FLAC__HAS_OGG,  [define if you have the ogg library])
 AH_TEMPLATE(FLAC__NO_ASM,  [define to disable use of assembly code])
 AH_TEMPLATE(FLAC__SSE_OS,  [define if your operating system supports SSE instructions])
 AH_TEMPLATE(FLAC__USE_3DNOW,  [define to enable use of 3Dnow! instructions])
+AH_TEMPLATE(FLAC__USE_ALTIVEC,  [define to enable use of Altivec instructions])
 AH_TEMPLATE(ID3LIB_MAJOR,  [define to major version number of id3lib])
 AH_TEMPLATE(ID3LIB_MINOR,  [define to minor version number of id3lib])
 AH_TEMPLATE(ID3LIB_PATCH,  [define to patch level of id3lib])
@@ -339,6 +352,7 @@ AC_OUTPUT( \
        src/Makefile \
        src/libFLAC/Makefile \
        src/libFLAC/ia32/Makefile \
+       src/libFLAC/ppc/Makefile \
        src/libFLAC/include/Makefile \
        src/libFLAC/include/private/Makefile \
        src/libFLAC/include/protected/Makefile \
index 61f52c7..63ce05e 100644 (file)
 #include <config.h>
 #endif
 
+#if defined FLAC__CPU_PPC
+#if !defined FLAC__NO_ASM
+#if defined __APPLE__ && defined __MACH__
+#include <sys/sysctl.h>
+#endif /* __APPLE__ && __MACH__ */
+#endif /* FLAC__NO_ASM */
+#endif /* FLAC__CPU_PPC */
+
 const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV = 0x00008000;
 const unsigned FLAC__CPUINFO_IA32_CPUID_MMX = 0x00800000;
 const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR = 0x01000000;
@@ -78,6 +86,30 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
 #else
        info->use_asm = false;
 #endif
+#elif defined FLAC__CPU_PPC
+       info->type = FLAC__CPUINFO_TYPE_PPC;
+#if !defined FLAC__NO_ASM
+       info->use_asm = true;
+#ifdef FLAC__USE_ALTIVEC
+#if defined __APPLE__ && defined __MACH__
+       {
+               int selectors[2] = { CTL_HW, HW_VECTORUNIT };
+               int result = 0;
+               size_t length = sizeof(result);
+               int error = sysctl(selectors, 2, &result, &length, 0, 0);
+
+               info->data.ppc.altivec = error==0 ? result!=0 : 0;
+       }
+#else /* __APPLE__ && __MACH__ */
+       /* don't know of any other thread-safe way to check */
+       info->data.ppc.altivec = 0;
+#endif /* __APPLE__ && __MACH__ */
+#else /* FLAC__USE_ALTIVEC */
+       info->data.ppc.altivec = 0;
+#endif /* FLAC__USE_ALTIVEC */
+#else /* FLAC__NO_ASM */
+       info->use_asm = false;
+#endif /* FLAC__NO_ASM */
 #else
        info->type = FLAC__CPUINFO_TYPE_UNKNOWN;
        info->use_asm = false;
index 0c4ab35..b8c001a 100644 (file)
@@ -40,6 +40,7 @@
 
 typedef enum {
        FLAC__CPUINFO_TYPE_IA32,
+       FLAC__CPUINFO_TYPE_PPC,
        FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
@@ -54,6 +55,10 @@ typedef struct {
        FLAC__bool extmmx;
 } FLAC__CPUInfo_IA32;
 
+typedef struct {
+       FLAC__bool altivec;
+} FLAC__CPUInfo_PPC;
+
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
@@ -69,6 +74,7 @@ typedef struct {
        FLAC__CPUInfo_Type type;
        union {
                FLAC__CPUInfo_IA32 ia32;
+               FLAC__CPUInfo_PPC ppc;
        } data;
 } FLAC__CPUInfo;
 
index 4369d6d..fb6cdbc 100644 (file)
@@ -41,6 +41,7 @@
 #include "private/fixed.h"
 #include "private/format.h"
 #include "private/lpc.h"
+#include "private/memory.h"
 
 #ifdef HAVE_CONFIG_H
 #include <config.h>
@@ -98,13 +99,18 @@ typedef struct FLAC__StreamDecoderPrivate {
        FLAC__StreamDecoderWriteCallback write_callback;
        FLAC__StreamDecoderMetadataCallback metadata_callback;
        FLAC__StreamDecoderErrorCallback error_callback;
+       /* generic 32-bit datapath: */
        void (*local_lpc_restore_signal)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+       /* generic 64-bit datapath: */
        void (*local_lpc_restore_signal_64bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+       /* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit): */
        void (*local_lpc_restore_signal_16bit)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+       /* for use when the signal is <= 16 bits-per-sample, or <= 15 bits-per-sample on a side channel (which requires 1 extra bit), AND order <= 8: */
+       void (*local_lpc_restore_signal_16bit_order8)(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
        void *client_data;
        FLAC__BitBuffer *input;
        FLAC__int32 *output[FLAC__MAX_CHANNELS];
-       FLAC__int32 *residual[FLAC__MAX_CHANNELS];
+       FLAC__int32 *residual[FLAC__MAX_CHANNELS]; /* WATCHOUT: these are the aligned pointers; the real pointers that should be free()'d are residual_unaligned[] below */
        FLAC__EntropyCodingMethod_PartitionedRiceContents partitioned_rice_contents[FLAC__MAX_CHANNELS];
        unsigned output_capacity, output_channels;
        FLAC__uint32 last_frame_number;
@@ -120,6 +126,8 @@ typedef struct FLAC__StreamDecoderPrivate {
        FLAC__CPUInfo cpuinfo;
        FLAC__byte header_warmup[2]; /* contains the sync code and reserved bits */
        FLAC__byte lookahead; /* temp storage when we need to look ahead one byte in the stream */
+       /* unaligned (original) pointers to allocated data */
+       FLAC__int32 *residual_unaligned[FLAC__MAX_CHANNELS];
 } FLAC__StreamDecoderPrivate;
 
 /***********************************************************************
@@ -208,7 +216,7 @@ FLAC_API FLAC__StreamDecoder *FLAC__stream_decoder_new()
 
        for(i = 0; i < FLAC__MAX_CHANNELS; i++) {
                decoder->private_->output[i] = 0;
-               decoder->private_->residual[i] = 0;
+               decoder->private_->residual_unaligned[i] = decoder->private_->residual[i] = 0;
        }
 
        decoder->private_->output_capacity = 0;
@@ -281,6 +289,7 @@ FLAC_API FLAC__StreamDecoderState FLAC__stream_decoder_init(FLAC__StreamDecoder
        decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal;
        decoder->private_->local_lpc_restore_signal_64bit = FLAC__lpc_restore_signal_wide;
        decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal;
+       decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal;
        /* now override with asm where appropriate */
 #ifndef FLAC__NO_ASM
        if(decoder->private_->cpuinfo.use_asm) {
@@ -290,12 +299,20 @@ FLAC_API FLAC__StreamDecoderState FLAC__stream_decoder_init(FLAC__StreamDecoder
                if(decoder->private_->cpuinfo.data.ia32.mmx) {
                        decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
                        decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32_mmx;
+                       decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32_mmx;
                }
                else {
                        decoder->private_->local_lpc_restore_signal = FLAC__lpc_restore_signal_asm_ia32;
                        decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ia32;
+                       decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ia32;
                }
 #endif
+#elif defined FLAC__CPU_PPC
+               FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_PPC);
+               if(decoder->private_->cpuinfo.data.ppc.altivec) {
+                       decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ppc_altivec_16;
+                       decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8;
+               }
 #endif
        }
 #endif
@@ -329,9 +346,9 @@ FLAC_API void FLAC__stream_decoder_finish(FLAC__StreamDecoder *decoder)
                        free(decoder->private_->output[i]-4);
                        decoder->private_->output[i] = 0;
                }
-               if(0 != decoder->private_->residual[i]) {
-                       free(decoder->private_->residual[i]);
-                       decoder->private_->residual[i] = 0;
+               if(0 != decoder->private_->residual_unaligned[i]) {
+                       free(decoder->private_->residual_unaligned[i]);
+                       decoder->private_->residual_unaligned[i] = decoder->private_->residual[i] = 0;
                }
        }
        decoder->private_->output_capacity = 0;
@@ -763,9 +780,9 @@ FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigne
                        free(decoder->private_->output[i]-4);
                        decoder->private_->output[i] = 0;
                }
-               if(0 != decoder->private_->residual[i]) {
-                       free(decoder->private_->residual[i]);
-                       decoder->private_->residual[i] = 0;
+               if(0 != decoder->private_->residual_unaligned[i]) {
+                       free(decoder->private_->residual_unaligned[i]);
+                       decoder->private_->residual_unaligned[i] = decoder->private_->residual[i] = 0;
                }
        }
 
@@ -784,12 +801,13 @@ FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigne
                memset(tmp, 0, sizeof(FLAC__int32)*4);
                decoder->private_->output[i] = tmp + 4;
 
-               tmp = (FLAC__int32*)malloc(sizeof(FLAC__int32)*size);
-               if(tmp == 0) {
+               /* WATCHOUT:
+                * minimum of quadword alignment for PPC vector optimizations is REQUIRED:
+                */
+               if(!FLAC__memory_alloc_aligned_int32_array(size, &decoder->private_->residual_unaligned[i], &decoder->private_->residual[i])) {
                        decoder->protected_->state = FLAC__STREAM_DECODER_MEMORY_ALLOCATION_ERROR;
                        return false;
                }
-               decoder->private_->residual[i] = tmp;
        }
 
        decoder->private_->output_capacity = size;
@@ -1974,8 +1992,12 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, unsigned channel, un
        if(do_full_decode) {
                memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
                if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
-                       if(bps <= 16 && subframe->qlp_coeff_precision <= 16)
-                               decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+                       if(bps <= 16 && subframe->qlp_coeff_precision <= 16) {
+                               if(order <= 8)
+                                       decoder->private_->local_lpc_restore_signal_16bit_order8(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+                               else
+                                       decoder->private_->local_lpc_restore_signal_16bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+                       }
                        else
                                decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
                else