Extras/CDTestFramework/Opcode/Ice/IceRevisitedRadix.cpp

   1 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   2 /**
   3  *      Contains source code from the article "Radix Sort Revisited".
   4  *      \file           IceRevisitedRadix.cpp
   5  *      \author         Pierre Terdiman
   6  *      \date           April, 4, 2000
   7  */
   8 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
   9
  10 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  11 /**
  12  *      Revisited Radix Sort.
  13  *      This is my new radix routine:
  14  *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
  15  *  - it creates all the histograms in one run instead of four
  16  *  - it sorts words faster than dwords and bytes faster than words
  17  *  - it correctly sorts negative floating-point values by patching the offsets
  18  *  - it automatically takes advantage of temporal coherence
  19  *  - multiple keys support is a side effect of temporal coherence
  20  *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
  21  *
  22  *      History:
  23  *      - 08.15.98: very first version
  24  *      - 04.04.00: recoded for the radix article
  25  *      - 12.xx.00: code lifting
  26  *      - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
  27  *      - 10.11.01: added local ram support
  28  *      - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
  29  *      - 01.02.02:     - "mIndices" renamed => "mRanks". That's a rank sorter after all.
  30  *                              - ranks are not "reset" anymore, but implicit on first calls
  31  *      - 07.05.02:     offsets rewritten with one less indirection.
  32  *      - 11.03.02:     "bool" replaced with RadixHint enum
  33  *      - 07.15.04:     stack-based radix added
  34  *                              - we want to use the radix sort but without making it static, and without allocating anything.
  35  *                              - we internally allocate two arrays of ranks. Each of them has N udwords to sort N values.
  36  *                              - 1Mb/2/sizeof(udword) = 131072 values max, at the same time.
  37  *      - 09.22.04:     - adapted to MacOS by Chris Lamb
  38  *      - 01.12.06:     - added optimizations suggested by Kyle Hubert
  39  *
  40  *      \class          RadixSort
  41  *      \author         Pierre Terdiman
  42  *      \version        1.5
  43  *      \date           August, 15, 1998
  44  */
  45 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  46
  47 /*
  48 To do:
  49         - add an offset parameter between two input values (avoid some data recopy sometimes)
  50         - unroll ? asm ?
  51         - prefetch stuff the day I have a P3
  52         - make a version with 16-bits indices ?
  53 */
  54
  55 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
  56 // Precompiled Header
  57 #include "StdAfx.h"
  58
  59 using namespace Opcode;
  60
  61 #define INVALIDATE_RANKS        mCurrentSize|=0x80000000
  62 #define VALIDATE_RANKS          mCurrentSize&=0x7fffffff
  63 #define CURRENT_SIZE            (mCurrentSize&0x7fffffff)
  64 #define INVALID_RANKS           (mCurrentSize&0x80000000)
  65
  66 #define CHECK_RESIZE(n)                                                                                                                                         \
  67         if(n!=mPreviousSize)                                                                                                                                    \
  68         {                                                                                                                                                                               \
  69                                 if(n>mCurrentSize)      Resize(n);                                                                                              \
  70                 else                                            ResetRanks();                                                                                   \
  71                 mPreviousSize = n;                                                                                                                                      \
  72         }
  73
  74 #if defined(__APPLE__) || defined(_XBOX)
  75         #define H0_OFFSET       768
  76         #define H1_OFFSET       512
  77         #define H2_OFFSET       256
  78         #define H3_OFFSET       0
  79         #define BYTES_INC       (3-j)
  80 #else
  81         #define H0_OFFSET       0
  82         #define H1_OFFSET       256
  83         #define H2_OFFSET       512
  84         #define H3_OFFSET       768
  85         #define BYTES_INC       j
  86 #endif
  87
  88 #define CREATE_HISTOGRAMS(type, buffer)                                                                                                         \
  89         /* Clear counters/histograms */                                                                                                                 \
  90         ZeroMemory(mHistogram, 256*4*sizeof(udword));                                                                                   \
  91                                                                                                                                                                                         \
  92         /* Prepare to count */                                                                                                                                  \
  93         const ubyte* p = (const ubyte*)input;                                                                                                   \
  94         const ubyte* pe = &p[nb*4];                                                                                                                             \
  95         udword* h0= &mHistogram[H0_OFFSET];     /* Histogram for first pass (LSB)       */                              \
  96         udword* h1= &mHistogram[H1_OFFSET];     /* Histogram for second pass            */                              \
  97         udword* h2= &mHistogram[H2_OFFSET];     /* Histogram for third pass                     */                              \
  98         udword* h3= &mHistogram[H3_OFFSET];     /* Histogram for last pass (MSB)        */                              \
  99                                                                                                                                                                                         \
 100         bool AlreadySorted = true;      /* Optimism... */                                                                                       \
 101                                                                                                                                                                                         \
 102         if(INVALID_RANKS)                                                                                                                                               \
 103         {                                                                                                                                                                               \
 104                 /* Prepare for temporal coherence */                                                                                            \
 105                 type* Running = (type*)buffer;                                                                                                          \
 106                 type PrevVal = *Running;                                                                                                                        \
 107                                                                                                                                                                                         \
 108                 while(p!=pe)                                                                                                                                            \
 109                 {                                                                                                                                                                       \
 110                         /* Read input buffer in previous sorted order */                                                                \
 111                         type Val = *Running++;                                                                                                                  \
 112                         /* Check whether already sorted or not */                                                                               \
 113                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
 114                         /* Update for next iteration */                                                                                                 \
 115                         PrevVal = Val;                                                                                                                                  \
 116                                                                                                                                                                                         \
 117                         /* Create histograms */                                                                                                                 \
 118                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
 119                 }                                                                                                                                                                       \
 120                                                                                                                                                                                         \
 121                 /* If all input values are already sorted, we just have to return and leave the */      \
 122                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
 123                 /* coherence, for example when used to sort transparent faces.                                  */      \
 124                 if(AlreadySorted)                                                                                                                                       \
 125                 {                                                                                                                                                                       \
 126                         mNbHits++;                                                                                                                                              \
 127                         for(udword i=0;i<nb;i++)        mRanks[i] = i;                                                                          \
 128                         return *this;                                                                                                                                   \
 129                 }                                                                                                                                                                       \
 130         }                                                                                                                                                                               \
 131         else                                                                                                                                                                    \
 132         {                                                                                                                                                                               \
 133                 /* Prepare for temporal coherence */                                                                                            \
 134                 const udword* Indices = mRanks;                                                                                                         \
 135                 type PrevVal = (type)buffer[*Indices];                                                                                          \
 136                                                                                                                                                                                         \
 137                 while(p!=pe)                                                                                                                                            \
 138                 {                                                                                                                                                                       \
 139                         /* Read input buffer in previous sorted order */                                                                \
 140                         type Val = (type)buffer[*Indices++];                                                                                    \
 141                         /* Check whether already sorted or not */                                                                               \
 142                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
 143                         /* Update for next iteration */                                                                                                 \
 144                         PrevVal = Val;                                                                                                                                  \
 145                                                                                                                                                                                         \
 146                         /* Create histograms */                                                                                                                 \
 147                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
 148                 }                                                                                                                                                                       \
 149                                                                                                                                                                                         \
 150                 /* If all input values are already sorted, we just have to return and leave the */      \
 151                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
 152                 /* coherence, for example when used to sort transparent faces.                                  */      \
 153                 if(AlreadySorted)       { mNbHits++; return *this;      }                                                                       \
 154         }                                                                                                                                                                               \
 155                                                                                                                                                                                         \
 156         /* Else there has been an early out and we must finish computing the histograms */              \
 157         while(p!=pe)                                                                                                                                                    \
 158         {                                                                                                                                                                               \
 159                 /* Create histograms without the previous overhead */                                                           \
 160                 h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                             \
 161         }
 162
 163 #define CHECK_PASS_VALIDITY(pass)                                                                                                                       \
 164         /* Shortcut to current counters */                                                                                                              \
 165         const udword* CurCount = &mHistogram[pass<<8];                                                                                  \
 166                                                                                                                                                                                         \
 167         /* Reset flag. The sorting pass is supposed to be performed. (default) */                               \
 168         bool PerformPass = true;                                                                                                                                \
 169                                                                                                                                                                                         \
 170         /* Check pass validity */                                                                                                                               \
 171                                                                                                                                                                                         \
 172         /* If all values have the same byte, sorting is useless. */                                                             \
 173         /* It may happen when sorting bytes or words instead of dwords. */                                              \
 174         /* This routine actually sorts words faster than dwords, and bytes */                                   \
 175         /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */                             \
 176         /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
 177                                                                                                                                                                                         \
 178         /* Get first byte */                                                                                                                                    \
 179         ubyte UniqueVal = *(((ubyte*)input)+pass);                                                                                              \
 180                                                                                                                                                                                         \
 181         /* Check that byte's counter */                                                                                                                 \
 182         if(CurCount[UniqueVal]==nb)     PerformPass=false;
 183
 184 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 185 /**
 186  *      Constructor.
 187  */
 188 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 189 RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
 190 {
 191 #ifndef RADIX_LOCAL_RAM
 192         // Allocate input-independent ram
 193         mHistogram      = ICE_ALLOC(sizeof(udword)*256*4);
 194         mOffset         = ICE_ALLOC(sizeof(udword)*256);
 195 #endif
 196         // Initialize indices
 197         INVALIDATE_RANKS;
 198 }
 199
 200 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 201 /**
 202  *      Destructor.
 203  */
 204 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 205 RadixSort::~RadixSort()
 206 {
 207         // Release everything
 208 #ifndef RADIX_LOCAL_RAM
 209         ICE_FREE(mOffset);
 210         ICE_FREE(mHistogram);
 211 #endif
 212         if(mDeleteRanks)
 213         {
 214                 ICE_FREE(mRanks2);
 215                 ICE_FREE(mRanks);
 216         }
 217 }
 218
 219 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 220 /**
 221  *      Resizes the inner lists.
 222  *      \param          nb      [in] new size (number of dwords)
 223  *      \return         true if success
 224  */
 225 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 226 bool RadixSort::Resize(udword nb)
 227 {
 228         if(mDeleteRanks)
 229         {
 230                 // Free previously used ram
 231                 ICE_FREE(mRanks2);
 232                 ICE_FREE(mRanks);
 233
 234                 // Get some fresh one
 235                 mRanks  = (udword*)ICE_ALLOC(sizeof(udword)*nb);        CHECKALLOC(mRanks);
 236                 mRanks2 = (udword*)ICE_ALLOC(sizeof(udword)*nb);        CHECKALLOC(mRanks2);
 237         }
 238         return true;
 239 }
 240
 241 inline_ void RadixSort::CheckResize(udword nb)
 242 {
 243         udword CurSize = CURRENT_SIZE;
 244         if(nb!=CurSize)
 245         {
 246                 if(nb>CurSize)  Resize(nb);
 247                 mCurrentSize = nb;
 248                 INVALIDATE_RANKS;
 249         }
 250 }
 251
 252 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 253 /**
 254  *      Main sort routine.
 255  *      This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
 256  *      \param          input   [in] a list of integer values to sort
 257  *      \param          nb              [in] number of values to sort, must be < 2^31
 258  *      \param          hint    [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
 259  *      \return         Self-Reference
 260  */
 261 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 262 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
 263 {
 264         // Checkings
 265         if(!input || !nb || nb&0x80000000)      return *this;
 266
 267         // Stats
 268         mTotalCalls++;
 269
 270         // Resize lists if needed
 271         CheckResize(nb);
 272
 273 #ifdef RADIX_LOCAL_RAM
 274         // Allocate histograms & offsets on the stack
 275         udword mHistogram[256*4];
 276 //      udword mOffset[256];
 277         udword* mLink[256];
 278 #endif
 279
 280         // Create histograms (counters). Counters for all passes are created in one run.
 281         // Pros:        read input buffer once instead of four times
 282         // Cons:        mHistogram is 4Kb instead of 1Kb
 283         // We must take care of signed/unsigned values for temporal coherence.... I just
 284         // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
 285         if(hint==RADIX_UNSIGNED)        { CREATE_HISTOGRAMS(udword, input);     }
 286         else                                            { CREATE_HISTOGRAMS(sdword, input);     }
 287 /*
 288         // Compute #negative values involved if needed
 289         udword NbNegativeValues = 0;
 290         if(hint==RADIX_SIGNED)
 291         {
 292                 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 293                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 294                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 295                 udword* h3= &mHistogram[768];
 296                 for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
 297         }
 298 */
 299         // Radix sort, j is the pass number (0=LSB, 3=MSB)
 300         for(udword j=0;j<4;j++)
 301         {
 302                 CHECK_PASS_VALIDITY(j);
 303
 304                 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
 305                 // not a problem, numbers are correctly sorted anyway.
 306                 if(PerformPass)
 307                 {
 308                         // Should we care about negative values?
 309                         if(j!=3 || hint==RADIX_UNSIGNED)
 310                         {
 311                                 // Here we deal with positive values only
 312
 313                                 // Create offsets
 314 //                              mOffset[0] = 0;
 315 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
 316                                 mLink[0] = mRanks2;
 317                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
 318                         }
 319                         else
 320                         {
 321                                 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
 322 /*
 323                                 // Create biased offsets, in order for negative numbers to be sorted as well
 324 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
 325 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
 326                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 327                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 328
 329                                 // Fixing the wrong place for negative values
 330 //                              mOffset[128] = 0;
 331 //                              for(i=129;i<256;i++)                    mOffset[i] = mOffset[i-1] + CurCount[i-1];
 332                                 mLink[128] = mRanks2;
 333                                 for(udword i=129;i<256;i++)             mLink[i] = mLink[i-1] + CurCount[i-1];
 334 */
 335
 336 // From Kyle Hubert:
 337
 338 //mOffset[128] = 0;
 339 mLink[128] = mRanks2;
 340 //for(i=129;i<256;i++)  mOffset[i] = mOffset[i-1] + CurCount[i-1];
 341 for(udword i=129;i<256;i++)     mLink[i] = mLink[i-1] + CurCount[i-1];
 342
 343 //mOffset[0] = mOffset[255] + CurCount[255];
 344 mLink[0] = mLink[255] + CurCount[255];
 345 //for(i=1;i<128;i++)    mOffset[i] = mOffset[i-1] + CurCount[i-1];
 346 for(udword i=1;i<128;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];
 347
 348
 349                         }
 350
 351                         // Perform Radix Sort
 352                         const ubyte* InputBytes = (const ubyte*)input;
 353                         InputBytes += BYTES_INC;
 354                         if(INVALID_RANKS)
 355                         {
 356 //                              for(udword i=0;i<nb;i++)        mRanks2[mOffset[InputBytes[i<<2]]++] = i;
 357                                 for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
 358                                 VALIDATE_RANKS;
 359                         }
 360                         else
 361                         {
 362                                 const udword* Indices           = mRanks;
 363                                 const udword* IndicesEnd        = &mRanks[nb];
 364                                 while(Indices!=IndicesEnd)
 365                                 {
 366                                         udword id = *Indices++;
 367 //                                      mRanks2[mOffset[InputBytes[id<<2]]++] = id;
 368                                         *mLink[InputBytes[id<<2]]++ = id;
 369                                 }
 370                         }
 371
 372                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 373                         udword* Tmp = mRanks;
 374                         mRanks = mRanks2;
 375                         mRanks2 = Tmp;
 376                 }
 377         }
 378         return *this;
 379 }
 380
 381 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 382 /**
 383  *      Main sort routine.
 384  *      This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
 385  *      \param          input                   [in] a list of floating-point values to sort
 386  *      \param          nb                              [in] number of values to sort, must be < 2^31
 387  *      \return         Self-Reference
 388  *      \warning        only sorts IEEE floating-point values
 389  */
 390 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 391 RadixSort& RadixSort::Sort(const float* input2, udword nb)
 392 {
 393         // Checkings
 394         if(!input2 || !nb || nb&0x80000000)     return *this;
 395
 396         // Stats
 397         mTotalCalls++;
 398
 399         const udword* input = (const udword*)input2;
 400
 401         // Resize lists if needed
 402         CheckResize(nb);
 403
 404 #ifdef RADIX_LOCAL_RAM
 405         // Allocate histograms & offsets on the stack
 406         udword mHistogram[256*4];
 407 //      udword mOffset[256];
 408         udword* mLink[256];
 409 #endif
 410
 411         // Create histograms (counters). Counters for all passes are created in one run.
 412         // Pros:        read input buffer once instead of four times
 413         // Cons:        mHistogram is 4Kb instead of 1Kb
 414         // Floating-point values are always supposed to be signed values, so there's only one code path there.
 415         // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
 416         // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
 417         // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
 418         // wouldn't work with mixed positive/negative values....
 419         { CREATE_HISTOGRAMS(float, input2); }
 420 /*
 421         // Compute #negative values involved if needed
 422         udword NbNegativeValues = 0;
 423         // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
 424         // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
 425         // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
 426         // ### is that ok on Apple ?!
 427         udword* h3= &mHistogram[768];
 428         for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
 429 */
 430         // Radix sort, j is the pass number (0=LSB, 3=MSB)
 431         for(udword j=0;j<4;j++)
 432         {
 433                 // Should we care about negative values?
 434                 if(j!=3)
 435                 {
 436                         // Here we deal with positive values only
 437                         CHECK_PASS_VALIDITY(j);
 438
 439                         if(PerformPass)
 440                         {
 441                                 // Create offsets
 442 //                              mOffset[0] = 0;
 443                                 mLink[0] = mRanks2;
 444 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
 445                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
 446
 447                                 // Perform Radix Sort
 448                                 const ubyte* InputBytes = (const ubyte*)input;
 449                                 InputBytes += BYTES_INC;
 450                                 if(INVALID_RANKS)
 451                                 {
 452 //                                      for(i=0;i<nb;i++)       mRanks2[mOffset[InputBytes[i<<2]]++] = i;
 453                                         for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
 454                                         VALIDATE_RANKS;
 455                                 }
 456                                 else
 457                                 {
 458                                         const udword* Indices           = mRanks;
 459                                         const udword* IndicesEnd        = &mRanks[nb];
 460                                         while(Indices!=IndicesEnd)
 461                                         {
 462                                                 udword id = *Indices++;
 463 //                                              mRanks2[mOffset[InputBytes[id<<2]]++] = id;
 464                                                 *mLink[InputBytes[id<<2]]++ = id;
 465                                         }
 466                                 }
 467
 468                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 469                                 udword* Tmp = mRanks;
 470                                 mRanks = mRanks2;
 471                                 mRanks2 = Tmp;
 472                         }
 473                 }
 474                 else
 475                 {
 476                         // This is a special case to correctly handle negative values
 477                         CHECK_PASS_VALIDITY(j);
 478
 479                         if(PerformPass)
 480                         {
 481 /*
 482                                 // Create biased offsets, in order for negative numbers to be sorted as well
 483 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
 484 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
 485                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
 486                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
 487
 488                                 // We must reverse the sorting order for negative numbers!
 489 //                              mOffset[255] = 0;
 490 //                              for(i=0;i<127;i++)              mOffset[254-i] = mOffset[255-i] + CurCount[255-i];      // Fixing the wrong order for negative values
 491 //                              for(i=128;i<256;i++)    mOffset[i] += CurCount[i];                                                      // Fixing the wrong place for negative values
 492                                 mLink[255] = mRanks2;
 493                                 for(udword i=0;i<127;i++)       mLink[254-i] = mLink[255-i] + CurCount[255-i];          // Fixing the wrong order for negative values
 494                                 for(udword i=128;i<256;i++)     mLink[i] += CurCount[i];                                                        // Fixing the wrong place for negative values
 495 */
 496
 497 // From Kyle Hubert:
 498
 499 //mOffset[255] = CurCount[255];
 500 mLink[255] = mRanks2 + CurCount[255];
 501 //for(udword i=254;i>127;i--)   mOffset[i] = mOffset[i+1] + CurCount[i];
 502 for(udword i=254;i>127;i--)     mLink[i] = mLink[i+1] + CurCount[i];
 503 //mOffset[0] = mOffset[128] + CurCount[128];
 504 mLink[0] = mLink[128] + CurCount[128];
 505 //for(udword i=1;i<128;i++)     mOffset[i] = mOffset[i-1] + CurCount[i-1];
 506 for(udword i=1;i<128;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];
 507
 508                                 // Perform Radix Sort
 509                                 if(INVALID_RANKS)
 510                                 {
 511                                         for(udword i=0;i<nb;i++)
 512                                         {
 513                                                 udword Radix = input[i]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
 514                                                 // ### cmp to be killed. Not good. Later.
 515 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = i;          // Number is positive, same as above
 516 //                                              else                            mRanks2[--mOffset[Radix]] = i;          // Number is negative, flip the sorting order
 517                                                 if(Radix<128)           *mLink[Radix]++ = i;            // Number is positive, same as above
 518                                                 else                            *(--mLink[Radix]) = i;          // Number is negative, flip the sorting order
 519                                         }
 520                                         VALIDATE_RANKS;
 521                                 }
 522                                 else
 523                                 {
 524                                         for(udword i=0;i<nb;i++)
 525                                         {
 526                                                 udword Radix = input[mRanks[i]]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
 527                                                 // ### cmp to be killed. Not good. Later.
 528 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = mRanks[i];          // Number is positive, same as above
 529 //                                              else                            mRanks2[--mOffset[Radix]] = mRanks[i];          // Number is negative, flip the sorting order
 530                                                 if(Radix<128)           *mLink[Radix]++ = mRanks[i];            // Number is positive, same as above
 531                                                 else                            *(--mLink[Radix]) = mRanks[i];          // Number is negative, flip the sorting order
 532                                         }
 533                                 }
 534                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 535                                 udword* Tmp = mRanks;
 536                                 mRanks = mRanks2;
 537                                 mRanks2 = Tmp;
 538                         }
 539                         else
 540                         {
 541                                 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
 542                                 if(UniqueVal>=128)
 543                                 {
 544                                         if(INVALID_RANKS)
 545                                         {
 546                                                 // ###Possible?
 547                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = nb-i-1;
 548                                                 VALIDATE_RANKS;
 549                                         }
 550                                         else
 551                                         {
 552                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = mRanks[nb-i-1];
 553                                         }
 554
 555                                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
 556                                         udword* Tmp = mRanks;
 557                                         mRanks = mRanks2;
 558                                         mRanks2 = Tmp;
 559                                 }
 560                         }
 561                 }
 562         }
 563         return *this;
 564 }
 565
 566 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 567 /**
 568  *      Gets the ram used.
 569  *      \return         memory used in bytes
 570  */
 571 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 572 udword RadixSort::GetUsedRam() const
 573 {
 574         udword UsedRam = sizeof(RadixSort);
 575 #ifndef RADIX_LOCAL_RAM
 576         UsedRam += 256*4*sizeof(udword);                        // Histograms
 577         UsedRam += 256*sizeof(udword);                          // Offsets
 578 #endif
 579         UsedRam += 2*CURRENT_SIZE*sizeof(udword);       // 2 lists of indices
 580         return UsedRam;
 581 }
 582
 583 bool RadixSort::SetRankBuffers(udword* ranks0, udword* ranks1)
 584 {
 585         if(!ranks0 || !ranks1)  return false;
 586
 587         mRanks                  = ranks0;
 588         mRanks2                 = ranks1;
 589         mDeleteRanks    = false;
 590
 591         return true;
 592 }