Extras/CDTestFramework/Opcode/Ice/_IceRevisitedRadix.cpp

   1 /*
   2  *      ICE / OPCODE - Optimized Collision Detection
   3  * http://www.codercorner.com/Opcode.htm
   4  *
   5  * Copyright (c) 2001-2008 Pierre Terdiman,  pierre@codercorner.com
   6
   7 This software is provided 'as-is', without any express or implied warranty.
   8 In no event will the authors be held liable for any damages arising from the use of this software.
   9 Permission is granted to anyone to use this software for any purpose,
  10 including commercial applications, and to alter it and redistribute it freely,
  11 subject to the following restrictions:
  12
  13 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  14 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  15 3. This notice may not be removed or altered from any source distribution.
  16 */
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  18 /**
  19  *      Contains source code from the article "Radix Sort Revisited".
  20  *      \file           IceRevisitedRadix.cpp
  21  *      \author         Pierre Terdiman
  22  *      \date           April, 4, 2000
  23  */
  24 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  25
  26 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  27 /**
  28  *      Revisited Radix Sort.
  29  *      This is my new radix routine:
  30  *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
  31  *  - it creates all the histograms in one run instead of four
  32  *  - it sorts words faster than dwords and bytes faster than words
  33  *  - it correctly sorts negative floating-point values by patching the offsets
  34  *  - it automatically takes advantage of temporal coherence
  35  *  - multiple keys support is a side effect of temporal coherence
  36  *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
  37  *
  38  *      History:
  39  *      - 08.15.98: very first version
  40  *      - 04.04.00: recoded for the radix article
  41  *      - 12.xx.00: code lifting
  42  *      - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
  43  *      - 10.11.01: added local ram support
  44  *      - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
  45  *      - 01.02.02:     - "mIndices" renamed => "mRanks". That's a rank sorter after all.
  46  *                              - ranks are not "reset" anymore, but implicit on first calls
  47  *      - 07.05.02:     - offsets rewritten with one less indirection.
  48  *      - 11.03.02:     - "bool" replaced with RadixHint enum
  49  *
  50  *      \class          RadixSort
  51  *      \author         Pierre Terdiman
  52  *      \version        1.4
  53  *      \date           August, 15, 1998
  54  */
  55 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  56
  57 /*
  58 To do:
  59         - add an offset parameter between two input values (avoid some data recopy sometimes)
  60         - unroll ? asm ?
  61         - 11 bits trick & 3 passes as Michael did
  62         - prefetch stuff the day I have a P3
  63         - make a version with 16-bits indices ?
  64 */
  65
  66 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  67 // Precompiled Header
  68 #include "Stdafx.h"
  69
  70 using namespace IceCore;
  71
  72 #define INVALIDATE_RANKS        mCurrentSize|=0x80000000
  73 #define VALIDATE_RANKS          mCurrentSize&=0x7fffffff
  74 #define CURRENT_SIZE            (mCurrentSize&0x7fffffff)
  75 #define INVALID_RANKS           (mCurrentSize&0x80000000)
  76
  77 #define CHECK_RESIZE(n)                                                                                                                                         \
  78         if(n!=mPreviousSize)                                                                                                                                    \
  79         {                                                                                                                                                                               \
  80                                 if(n>mCurrentSize)      Resize(n);                                                                                              \
  81                 else                                            ResetRanks();                                                                                   \
  82                 mPreviousSize = n;                                                                                                                                      \
  83         }
  84
  85 #define CREATE_HISTOGRAMS(type, buffer)                                                                                                         \
  86         /* Clear counters/histograms */                                                                                                                 \
  87         ZeroMemory(mHistogram, 256*4*sizeof(udword));                                                                                   \
  88                                                                                                                                                                                         \
  89         /* Prepare to count */                                                                                                                                  \
  90         ubyte* p = (ubyte*)input;                                                                                                                               \
  91         ubyte* pe = &p[nb*4];                                                                                                                                   \
  92         udword* h0= &mHistogram[0];             /* Histogram for first pass (LSB)       */                                      \
  93         udword* h1= &mHistogram[256];   /* Histogram for second pass            */                                      \
  94         udword* h2= &mHistogram[512];   /* Histogram for third pass                     */                                      \
  95         udword* h3= &mHistogram[768];   /* Histogram for last pass (MSB)        */                                      \
  96                                                                                                                                                                                         \
  97         bool AlreadySorted = true;      /* Optimism... */                                                                                       \
  98                                                                                                                                                                                         \
  99         if(INVALID_RANKS)                                                                                                                                               \
 100         {                                                                                                                                                                               \
 101                 /* Prepare for temporal coherence */                                                                                            \
 102                 type* Running = (type*)buffer;                                                                                                          \
 103                 type PrevVal = *Running;                                                                                                                        \
 104                                                                                                                                                                                         \
 105                 while(p!=pe)                                                                                                                                            \
 106                 {                                                                                                                                                                       \
 107                         /* Read input buffer in previous sorted order */                                                                \
 108                         type Val = *Running++;                                                                                                                  \
 109                         /* Check whether already sorted or not */                                                                               \
 110                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
 111                         /* Update for next iteration */                                                                                                 \
 112                         PrevVal = Val;                                                                                                                                  \
 113                                                                                                                                                                                         \
 114                         /* Create histograms */                                                                                                                 \
 115                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
 116                 }                                                                                                                                                                       \
 117                                                                                                                                                                                         \
 118                 /* If all input values are already sorted, we just have to return and leave the */      \
 119                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
 120                 /* coherence, for example when used to sort transparent faces.                                  */      \
 121                 if(AlreadySorted)                                                                                                                                       \
 122                 {                                                                                                                                                                       \
 123                         mNbHits++;                                                                                                                                              \
 124                         for(udword i=0;i<nb;i++)        mRanks[i] = i;                                                                          \
 125                         return *this;                                                                                                                                   \
 126                 }                                                                                                                                                                       \
 127         }                                                                                                                                                                               \
 128         else                                                                                                                                                                    \
 129         {                                                                                                                                                                               \
 130                 /* Prepare for temporal coherence */                                                                                            \
 131                 udword* Indices = mRanks;                                                                                                                       \
 132                 type PrevVal = (type)buffer[*Indices];                                                                                          \
 133                                                                                                                                                                                         \
 134                 while(p!=pe)                                                                                                                                            \
 135                 {                                                                                                                                                                       \
 136                         /* Read input buffer in previous sorted order */                                                                \
 137                         type Val = (type)buffer[*Indices++];                                                                                    \
 138                         /* Check whether already sorted or not */                                                                               \
 139                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
 140                         /* Update for next iteration */                                                                                                 \
 141                         PrevVal = Val;                                                                                                                                  \
 142                                                                                                                                                                                         \
 143                         /* Create histograms */                                                                                                                 \
 144                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
 145                 }                                                                                                                                                                       \
 146                                                                                                                                                                                         \
 147                 /* If all input values are already sorted, we just have to return and leave the */      \
 148                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
 149                 /* coherence, for example when used to sort transparent faces.                                  */      \
 150                 if(AlreadySorted)       { mNbHits++; return *this;      }                                                                       \
 151         }                                                                                                                                                                               \
 152                                                                                                                                                                                         \
 153         /* Else there has been an early out and we must finish computing the histograms */              \
 154         while(p!=pe)                                                                                                                                                    \
 155         {                                                                                                                                                                               \
 156                 /* Create histograms without the previous overhead */                                                           \
 157                 h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                             \
 158         }
 159
 160 #define CHECK_PASS_VALIDITY(pass)                                                                                                                       \
 161         /* Shortcut to current counters */                                                                                                              \
 162         udword* CurCount = &mHistogram[pass<<8];                                                                                                \
 163                                                                                                                                                                                         \
 164         /* Reset flag. The sorting pass is supposed to be performed. (default) */                               \
 165         bool PerformPass = true;                                                                                                                                \
 166                                                                                                                                                                                         \
 167         /* Check pass validity */                                                                                                                               \
 168                                                                                                                                                                                         \
 169         /* If all values have the same byte, sorting is useless. */                                                             \
 170         /* It may happen when sorting bytes or words instead of dwords. */                                              \
 171         /* This routine actually sorts words faster than dwords, and bytes */                                   \
 172         /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */                             \
 173         /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
 174                                                                                                                                                                                         \
 175         /* Get first byte */                                                                                                                                    \
 176         ubyte UniqueVal = *(((ubyte*)input)+pass);                                                                                              \
 177                                                                                                                                                                                         \
 178         /* Check that byte's counter */                                                                                                                 \
 179         if(CurCount[UniqueVal]==nb)     PerformPass=false;
 180
 181 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 182 /**
 183  *      Constructor.
 184  */
 185 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 186 RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
 187 {
 188 #ifndef RADIX_LOCAL_RAM
 189         // Allocate input-independent ram
 190         mHistogram      = new udword[256*4];
 191         mOffset         = new udword[256];
 192 #endif
 193         // Initialize indices
 194         INVALIDATE_RANKS;
 195 }
 196
 197 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 198 /**
 199  *      Destructor.
 200  */
 201 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 202 RadixSort::~RadixSort()
 203 {
 204         // Release everything
 205 #ifndef RADIX_LOCAL_RAM
 206         DELETEARRAY(mOffset);
 207         DELETEARRAY(mHistogram);
 208 #endif
 209         DELETEARRAY(mRanks2);
 210         DELETEARRAY(mRanks);
 211 }
 212
 213 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 214 /**
 215  *      Resizes the inner lists.
 216  *      \param          nb      [in] new size (number of dwords)
 217  *      \return         true if success
 218  */
 219 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 220 bool RadixSort::Resize(udword nb)
 221 {
 222         // Free previously used ram
 223         DELETEARRAY(mRanks2);
 224         DELETEARRAY(mRanks);
 225
 226         // Get some fresh one
 227         mRanks  = new udword[nb];       CHECKALLOC(mRanks);
 228         mRanks2 = new udword[nb];       CHECKALLOC(mRanks2);
 229
 230         return true;
 231 }
 232
 233 inline_ void RadixSort::CheckResize(udword nb)
 234 {
 235         udword CurSize = CURRENT_SIZE;
 236         if(nb!=CurSize)
 237         {
 238                 if(nb>CurSize)  Resize(nb);
 239                 mCurrentSize = nb;
 240                 INVALIDATE_RANKS;
 241         }
 242 }
 243
 244 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 245 /**
 246  *      Main sort routine.
 247  *      This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
 248  *      \param          input   [in] a list of integer values to sort
 249  *      \param          nb              [in] number of values to sort, must be < 2^31
 250  *      \param          hint    [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
 251  *      \return         Self-Reference
 252  */
 253 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 254 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
 255 {
 256         // Checkings
 257         if(!input || !nb || nb&0x80000000)      return *this;
 258
 259         // Stats
 260         mTotalCalls++;
 261
 262         // Resize lists if needed
 263         CheckResize(nb);
 264
 265 #ifdef RADIX_LOCAL_RAM
 266         // Allocate histograms & offsets on the stack
 267         udword mHistogram[256*4];
 268 //      udword mOffset[256];
 269         udword* mLink[256];
 270 #endif
 271
 272         // Create histograms (counters). Counters for all passes are created in one run.
 273         // Pros:        read input buffer once instead of four times
 274         // Cons:        mHistogram is 4Kb instead of 1Kb
 275         // We must take care of signed/unsigned values for temporal coherence.... I just
 276         // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
 277         if(hint==RADIX_UNSIGNED)        { CREATE_HISTOGRAMS(udword, input);     }
 278         else                                            { CREATE_HISTOGRAMS(sdword, input);     }
 279
 280         // Compute #negative values involved if needed
 281         udword NbNegativeValues = 0;
 282         if(hint==RADIX_SIGNED)
 283         {
 284                 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 285                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 286                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 287                 udword* h3= &mHistogram[768];
 288                 for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
 289         }
 290
 291         // Radix sort, j is the pass number (0=LSB, 3=MSB)
 292         for(udword j=0;j<4;j++)
 293         {
 294                 CHECK_PASS_VALIDITY(j);
 295
 296                 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
 297                 // not a problem, numbers are correctly sorted anyway.
 298                 if(PerformPass)
 299                 {
 300                         // Should we care about negative values?
 301                         if(j!=3 || hint==RADIX_UNSIGNED)
 302                         {
 303                                 // Here we deal with positive values only
 304
 305                                 // Create offsets
 306 //                              mOffset[0] = 0;
 307 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
 308                                 mLink[0] = mRanks2;
 309                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
 310                         }
 311                         else
 312                         {
 313                                 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
 314
 315                                 // Create biased offsets, in order for negative numbers to be sorted as well
 316 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
 317                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 318 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
 319                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 320
 321                                 // Fixing the wrong place for negative values
 322 //                              mOffset[128] = 0;
 323                                 mLink[128] = mRanks2;
 324 //                              for(i=129;i<256;i++)                    mOffset[i] = mOffset[i-1] + CurCount[i-1];
 325                                 for(udword i=129;i<256;i++)             mLink[i] = mLink[i-1] + CurCount[i-1];
 326                         }
 327
 328                         // Perform Radix Sort
 329                         ubyte* InputBytes       = (ubyte*)input;
 330                         InputBytes += j;
 331                         if(INVALID_RANKS)
 332                         {
 333 //                              for(udword i=0;i<nb;i++)        mRanks2[mOffset[InputBytes[i<<2]]++] = i;
 334                                 for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
 335                                 VALIDATE_RANKS;
 336                         }
 337                         else
 338                         {
 339                                 udword* Indices         = mRanks;
 340                                 udword* IndicesEnd      = &mRanks[nb];
 341                                 while(Indices!=IndicesEnd)
 342                                 {
 343                                         udword id = *Indices++;
 344 //                                      mRanks2[mOffset[InputBytes[id<<2]]++] = id;
 345                                         *mLink[InputBytes[id<<2]]++ = id;
 346                                 }
 347                         }
 348
 349                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 350                         udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
 351                 }
 352         }
 353         return *this;
 354 }
 355
 356 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 357 /**
 358  *      Main sort routine.
 359  *      This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
 360  *      \param          input                   [in] a list of floating-point values to sort
 361  *      \param          nb                              [in] number of values to sort, must be < 2^31
 362  *      \return         Self-Reference
 363  *      \warning        only sorts IEEE floating-point values
 364  */
 365 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 366 RadixSort& RadixSort::Sort(const float* input2, udword nb)
 367 {
 368         // Checkings
 369         if(!input2 || !nb || nb&0x80000000)     return *this;
 370
 371         // Stats
 372         mTotalCalls++;
 373
 374         udword* input = (udword*)input2;
 375
 376         // Resize lists if needed
 377         CheckResize(nb);
 378
 379 #ifdef RADIX_LOCAL_RAM
 380         // Allocate histograms & offsets on the stack
 381         udword mHistogram[256*4];
 382 //      udword mOffset[256];
 383         udword* mLink[256];
 384 #endif
 385
 386         // Create histograms (counters). Counters for all passes are created in one run.
 387         // Pros:        read input buffer once instead of four times
 388         // Cons:        mHistogram is 4Kb instead of 1Kb
 389         // Floating-point values are always supposed to be signed values, so there's only one code path there.
 390         // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
 391         // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
 392         // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
 393         // wouldn't work with mixed positive/negative values....
 394         { CREATE_HISTOGRAMS(float, input2); }
 395
 396         // Compute #negative values involved if needed
 397         udword NbNegativeValues = 0;
 398         // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 399         // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 400         // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 401         udword* h3= &mHistogram[768];
 402         for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
 403
 404         // Radix sort, j is the pass number (0=LSB, 3=MSB)
 405         for(udword j=0;j<4;j++)
 406         {
 407                 // Should we care about negative values?
 408                 if(j!=3)
 409                 {
 410                         // Here we deal with positive values only
 411                         CHECK_PASS_VALIDITY(j);
 412
 413                         if(PerformPass)
 414                         {
 415                                 // Create offsets
 416 //                              mOffset[0] = 0;
 417                                 mLink[0] = mRanks2;
 418 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
 419                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
 420
 421                                 // Perform Radix Sort
 422                                 ubyte* InputBytes = (ubyte*)input;
 423                                 InputBytes += j;
 424                                 if(INVALID_RANKS)
 425                                 {
 426 //                                      for(i=0;i<nb;i++)       mRanks2[mOffset[InputBytes[i<<2]]++] = i;
 427                                         for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
 428                                         VALIDATE_RANKS;
 429                                 }
 430                                 else
 431                                 {
 432                                         udword* Indices         = mRanks;
 433                                         udword* IndicesEnd      = &mRanks[nb];
 434                                         while(Indices!=IndicesEnd)
 435                                         {
 436                                                 udword id = *Indices++;
 437 //                                              mRanks2[mOffset[InputBytes[id<<2]]++] = id;
 438                                                 *mLink[InputBytes[id<<2]]++ = id;
 439                                         }
 440                                 }
 441
 442                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 443                                 udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
 444                         }
 445                 }
 446                 else
 447                 {
 448                         // This is a special case to correctly handle negative values
 449                         CHECK_PASS_VALIDITY(j);
 450
 451                         if(PerformPass)
 452                         {
 453                                 // Create biased offsets, in order for negative numbers to be sorted as well
 454 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
 455                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 456 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
 457                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 458
 459                                 // We must reverse the sorting order for negative numbers!
 460 //                              mOffset[255] = 0;
 461                                 mLink[255] = mRanks2;
 462 //                              for(i=0;i<127;i++)              mOffset[254-i] = mOffset[255-i] + CurCount[255-i];      // Fixing the wrong order for negative values
 463                                 for(udword i=0;i<127;i++)       mLink[254-i] = mLink[255-i] + CurCount[255-i];          // Fixing the wrong order for negative values
 464 //                              for(i=128;i<256;i++)    mOffset[i] += CurCount[i];                                                      // Fixing the wrong place for negative values
 465                                 for(udword i=128;i<256;i++)     mLink[i] += CurCount[i];                                                        // Fixing the wrong place for negative values
 466
 467                                 // Perform Radix Sort
 468                                 if(INVALID_RANKS)
 469                                 {
 470                                         for(udword i=0;i<nb;i++)
 471                                         {
 472                                                 udword Radix = input[i]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
 473                                                 // ### cmp to be killed. Not good. Later.
 474 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = i;          // Number is positive, same as above
 475 //                                              else                            mRanks2[--mOffset[Radix]] = i;          // Number is negative, flip the sorting order
 476                                                 if(Radix<128)           *mLink[Radix]++ = i;            // Number is positive, same as above
 477                                                 else                            *(--mLink[Radix]) = i;          // Number is negative, flip the sorting order
 478                                         }
 479                                         VALIDATE_RANKS;
 480                                 }
 481                                 else
 482                                 {
 483                                         for(udword i=0;i<nb;i++)
 484                                         {
 485                                                 udword Radix = input[mRanks[i]]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
 486                                                 // ### cmp to be killed. Not good. Later.
 487 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = mRanks[i];          // Number is positive, same as above
 488 //                                              else                            mRanks2[--mOffset[Radix]] = mRanks[i];          // Number is negative, flip the sorting order
 489                                                 if(Radix<128)           *mLink[Radix]++ = mRanks[i];            // Number is positive, same as above
 490                                                 else                            *(--mLink[Radix]) = mRanks[i];          // Number is negative, flip the sorting order
 491                                         }
 492                                 }
 493                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 494                                 udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
 495                         }
 496                         else
 497                         {
 498                                 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
 499                                 if(UniqueVal>=128)
 500                                 {
 501                                         if(INVALID_RANKS)
 502                                         {
 503                                                 // ###Possible?
 504                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = nb-i-1;
 505                                                 VALIDATE_RANKS;
 506                                         }
 507                                         else
 508                                         {
 509                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = mRanks[nb-i-1];
 510                                         }
 511
 512                                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 513                                         udword* Tmp     = mRanks;       mRanks = mRanks2; mRanks2 = Tmp;
 514                                 }
 515                         }
 516                 }
 517         }
 518         return *this;
 519 }
 520
 521 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 522 /**
 523  *      Gets the ram used.
 524  *      \return         memory used in bytes
 525  */
 526 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 527 udword RadixSort::GetUsedRam() const
 528 {
 529         udword UsedRam = sizeof(RadixSort);
 530 #ifndef RADIX_LOCAL_RAM
 531         UsedRam += 256*4*sizeof(udword);                        // Histograms
 532         UsedRam += 256*sizeof(udword);                          // Offsets
 533 #endif
 534         UsedRam += 2*CURRENT_SIZE*sizeof(udword);       // 2 lists of indices
 535         return UsedRam;
 536 }