Tizen 2.1 base
[platform/upstream/libbullet.git] / Extras / CDTestFramework / Opcode / Ice / IceRevisitedRadix.cpp
1 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
2 /**
3  *      Contains source code from the article "Radix Sort Revisited".
4  *      \file           IceRevisitedRadix.cpp
5  *      \author         Pierre Terdiman
6  *      \date           April, 4, 2000
7  */
8 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
9
10 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
11 /**
12  *      Revisited Radix Sort.
13  *      This is my new radix routine:
14  *  - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15  *  - it creates all the histograms in one run instead of four
16  *  - it sorts words faster than dwords and bytes faster than words
17  *  - it correctly sorts negative floating-point values by patching the offsets
18  *  - it automatically takes advantage of temporal coherence
19  *  - multiple keys support is a side effect of temporal coherence
20  *  - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
21  *
22  *      History:
23  *      - 08.15.98: very first version
24  *      - 04.04.00: recoded for the radix article
25  *      - 12.xx.00: code lifting
26  *      - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27  *      - 10.11.01: added local ram support
28  *      - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29  *      - 01.02.02:     - "mIndices" renamed => "mRanks". That's a rank sorter after all.
30  *                              - ranks are not "reset" anymore, but implicit on first calls
31  *      - 07.05.02:     offsets rewritten with one less indirection.
32  *      - 11.03.02:     "bool" replaced with RadixHint enum
33  *      - 07.15.04:     stack-based radix added
34  *                              - we want to use the radix sort but without making it static, and without allocating anything.
35  *                              - we internally allocate two arrays of ranks. Each of them has N udwords to sort N values.
36  *                              - 1Mb/2/sizeof(udword) = 131072 values max, at the same time.
37  *      - 09.22.04:     - adapted to MacOS by Chris Lamb
38  *      - 01.12.06:     - added optimizations suggested by Kyle Hubert
39  *
40  *      \class          RadixSort
41  *      \author         Pierre Terdiman
42  *      \version        1.5
43  *      \date           August, 15, 1998
44  */
45 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
46
47 /*
48 To do:
49         - add an offset parameter between two input values (avoid some data recopy sometimes)
50         - unroll ? asm ?
51         - prefetch stuff the day I have a P3
52         - make a version with 16-bits indices ?
53 */
54
55 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
56 // Precompiled Header
57 #include "StdAfx.h"
58
59 using namespace Opcode;
60
61 #define INVALIDATE_RANKS        mCurrentSize|=0x80000000
62 #define VALIDATE_RANKS          mCurrentSize&=0x7fffffff
63 #define CURRENT_SIZE            (mCurrentSize&0x7fffffff)
64 #define INVALID_RANKS           (mCurrentSize&0x80000000)
65
66 #define CHECK_RESIZE(n)                                                                                                                                         \
67         if(n!=mPreviousSize)                                                                                                                                    \
68         {                                                                                                                                                                               \
69                                 if(n>mCurrentSize)      Resize(n);                                                                                              \
70                 else                                            ResetRanks();                                                                                   \
71                 mPreviousSize = n;                                                                                                                                      \
72         }
73
74 #if defined(__APPLE__) || defined(_XBOX)
75         #define H0_OFFSET       768
76         #define H1_OFFSET       512
77         #define H2_OFFSET       256
78         #define H3_OFFSET       0
79         #define BYTES_INC       (3-j)
80 #else 
81         #define H0_OFFSET       0
82         #define H1_OFFSET       256
83         #define H2_OFFSET       512
84         #define H3_OFFSET       768
85         #define BYTES_INC       j
86 #endif
87
88 #define CREATE_HISTOGRAMS(type, buffer)                                                                                                         \
89         /* Clear counters/histograms */                                                                                                                 \
90         ZeroMemory(mHistogram, 256*4*sizeof(udword));                                                                                   \
91                                                                                                                                                                                         \
92         /* Prepare to count */                                                                                                                                  \
93         const ubyte* p = (const ubyte*)input;                                                                                                   \
94         const ubyte* pe = &p[nb*4];                                                                                                                             \
95         udword* h0= &mHistogram[H0_OFFSET];     /* Histogram for first pass (LSB)       */                              \
96         udword* h1= &mHistogram[H1_OFFSET];     /* Histogram for second pass            */                              \
97         udword* h2= &mHistogram[H2_OFFSET];     /* Histogram for third pass                     */                              \
98         udword* h3= &mHistogram[H3_OFFSET];     /* Histogram for last pass (MSB)        */                              \
99                                                                                                                                                                                         \
100         bool AlreadySorted = true;      /* Optimism... */                                                                                       \
101                                                                                                                                                                                         \
102         if(INVALID_RANKS)                                                                                                                                               \
103         {                                                                                                                                                                               \
104                 /* Prepare for temporal coherence */                                                                                            \
105                 type* Running = (type*)buffer;                                                                                                          \
106                 type PrevVal = *Running;                                                                                                                        \
107                                                                                                                                                                                         \
108                 while(p!=pe)                                                                                                                                            \
109                 {                                                                                                                                                                       \
110                         /* Read input buffer in previous sorted order */                                                                \
111                         type Val = *Running++;                                                                                                                  \
112                         /* Check whether already sorted or not */                                                                               \
113                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
114                         /* Update for next iteration */                                                                                                 \
115                         PrevVal = Val;                                                                                                                                  \
116                                                                                                                                                                                         \
117                         /* Create histograms */                                                                                                                 \
118                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
119                 }                                                                                                                                                                       \
120                                                                                                                                                                                         \
121                 /* If all input values are already sorted, we just have to return and leave the */      \
122                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
123                 /* coherence, for example when used to sort transparent faces.                                  */      \
124                 if(AlreadySorted)                                                                                                                                       \
125                 {                                                                                                                                                                       \
126                         mNbHits++;                                                                                                                                              \
127                         for(udword i=0;i<nb;i++)        mRanks[i] = i;                                                                          \
128                         return *this;                                                                                                                                   \
129                 }                                                                                                                                                                       \
130         }                                                                                                                                                                               \
131         else                                                                                                                                                                    \
132         {                                                                                                                                                                               \
133                 /* Prepare for temporal coherence */                                                                                            \
134                 const udword* Indices = mRanks;                                                                                                         \
135                 type PrevVal = (type)buffer[*Indices];                                                                                          \
136                                                                                                                                                                                         \
137                 while(p!=pe)                                                                                                                                            \
138                 {                                                                                                                                                                       \
139                         /* Read input buffer in previous sorted order */                                                                \
140                         type Val = (type)buffer[*Indices++];                                                                                    \
141                         /* Check whether already sorted or not */                                                                               \
142                         if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */                               \
143                         /* Update for next iteration */                                                                                                 \
144                         PrevVal = Val;                                                                                                                                  \
145                                                                                                                                                                                         \
146                         /* Create histograms */                                                                                                                 \
147                         h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                     \
148                 }                                                                                                                                                                       \
149                                                                                                                                                                                         \
150                 /* If all input values are already sorted, we just have to return and leave the */      \
151                 /* previous list unchanged. That way the routine may take advantage of temporal */      \
152                 /* coherence, for example when used to sort transparent faces.                                  */      \
153                 if(AlreadySorted)       { mNbHits++; return *this;      }                                                                       \
154         }                                                                                                                                                                               \
155                                                                                                                                                                                         \
156         /* Else there has been an early out and we must finish computing the histograms */              \
157         while(p!=pe)                                                                                                                                                    \
158         {                                                                                                                                                                               \
159                 /* Create histograms without the previous overhead */                                                           \
160                 h0[*p++]++;     h1[*p++]++;     h2[*p++]++;     h3[*p++]++;                                                                             \
161         }
162
163 #define CHECK_PASS_VALIDITY(pass)                                                                                                                       \
164         /* Shortcut to current counters */                                                                                                              \
165         const udword* CurCount = &mHistogram[pass<<8];                                                                                  \
166                                                                                                                                                                                         \
167         /* Reset flag. The sorting pass is supposed to be performed. (default) */                               \
168         bool PerformPass = true;                                                                                                                                \
169                                                                                                                                                                                         \
170         /* Check pass validity */                                                                                                                               \
171                                                                                                                                                                                         \
172         /* If all values have the same byte, sorting is useless. */                                                             \
173         /* It may happen when sorting bytes or words instead of dwords. */                                              \
174         /* This routine actually sorts words faster than dwords, and bytes */                                   \
175         /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */                             \
176         /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
177                                                                                                                                                                                         \
178         /* Get first byte */                                                                                                                                    \
179         ubyte UniqueVal = *(((ubyte*)input)+pass);                                                                                              \
180                                                                                                                                                                                         \
181         /* Check that byte's counter */                                                                                                                 \
182         if(CurCount[UniqueVal]==nb)     PerformPass=false;
183
184 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
185 /**
186  *      Constructor.
187  */
188 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
189 RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
190 {
191 #ifndef RADIX_LOCAL_RAM
192         // Allocate input-independent ram
193         mHistogram      = ICE_ALLOC(sizeof(udword)*256*4);
194         mOffset         = ICE_ALLOC(sizeof(udword)*256);
195 #endif
196         // Initialize indices
197         INVALIDATE_RANKS;
198 }
199
200 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
201 /**
202  *      Destructor.
203  */
204 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
205 RadixSort::~RadixSort()
206 {
207         // Release everything
208 #ifndef RADIX_LOCAL_RAM
209         ICE_FREE(mOffset);
210         ICE_FREE(mHistogram);
211 #endif
212         if(mDeleteRanks)
213         {
214                 ICE_FREE(mRanks2);
215                 ICE_FREE(mRanks);
216         }
217 }
218
219 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
220 /**
221  *      Resizes the inner lists.
222  *      \param          nb      [in] new size (number of dwords)
223  *      \return         true if success
224  */
225 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
226 bool RadixSort::Resize(udword nb)
227 {
228         if(mDeleteRanks)
229         {
230                 // Free previously used ram
231                 ICE_FREE(mRanks2);
232                 ICE_FREE(mRanks);
233
234                 // Get some fresh one
235                 mRanks  = (udword*)ICE_ALLOC(sizeof(udword)*nb);        CHECKALLOC(mRanks);
236                 mRanks2 = (udword*)ICE_ALLOC(sizeof(udword)*nb);        CHECKALLOC(mRanks2);
237         }
238         return true;
239 }
240
241 inline_ void RadixSort::CheckResize(udword nb)
242 {
243         udword CurSize = CURRENT_SIZE;
244         if(nb!=CurSize)
245         {
246                 if(nb>CurSize)  Resize(nb);
247                 mCurrentSize = nb;
248                 INVALIDATE_RANKS;
249         }
250 }
251
252 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
253 /**
254  *      Main sort routine.
255  *      This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
256  *      \param          input   [in] a list of integer values to sort
257  *      \param          nb              [in] number of values to sort, must be < 2^31
258  *      \param          hint    [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
259  *      \return         Self-Reference
260  */
261 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
262 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
263 {
264         // Checkings
265         if(!input || !nb || nb&0x80000000)      return *this;
266
267         // Stats
268         mTotalCalls++;
269
270         // Resize lists if needed
271         CheckResize(nb);
272
273 #ifdef RADIX_LOCAL_RAM
274         // Allocate histograms & offsets on the stack
275         udword mHistogram[256*4];
276 //      udword mOffset[256];
277         udword* mLink[256];
278 #endif
279
280         // Create histograms (counters). Counters for all passes are created in one run.
281         // Pros:        read input buffer once instead of four times
282         // Cons:        mHistogram is 4Kb instead of 1Kb
283         // We must take care of signed/unsigned values for temporal coherence.... I just
284         // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
285         if(hint==RADIX_UNSIGNED)        { CREATE_HISTOGRAMS(udword, input);     }
286         else                                            { CREATE_HISTOGRAMS(sdword, input);     }
287 /*
288         // Compute #negative values involved if needed
289         udword NbNegativeValues = 0;
290         if(hint==RADIX_SIGNED)
291         {
292                 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
293                 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
294                 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
295                 udword* h3= &mHistogram[768];
296                 for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
297         }
298 */
299         // Radix sort, j is the pass number (0=LSB, 3=MSB)
300         for(udword j=0;j<4;j++)
301         {
302                 CHECK_PASS_VALIDITY(j);
303
304                 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
305                 // not a problem, numbers are correctly sorted anyway.
306                 if(PerformPass)
307                 {
308                         // Should we care about negative values?
309                         if(j!=3 || hint==RADIX_UNSIGNED)
310                         {
311                                 // Here we deal with positive values only
312
313                                 // Create offsets
314 //                              mOffset[0] = 0;
315 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
316                                 mLink[0] = mRanks2;
317                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
318                         }
319                         else
320                         {
321                                 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
322 /*
323                                 // Create biased offsets, in order for negative numbers to be sorted as well
324 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
325 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
326                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
327                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
328
329                                 // Fixing the wrong place for negative values
330 //                              mOffset[128] = 0;
331 //                              for(i=129;i<256;i++)                    mOffset[i] = mOffset[i-1] + CurCount[i-1];
332                                 mLink[128] = mRanks2;
333                                 for(udword i=129;i<256;i++)             mLink[i] = mLink[i-1] + CurCount[i-1];
334 */
335
336 // From Kyle Hubert:
337
338 //mOffset[128] = 0;
339 mLink[128] = mRanks2;
340 //for(i=129;i<256;i++)  mOffset[i] = mOffset[i-1] + CurCount[i-1];
341 for(udword i=129;i<256;i++)     mLink[i] = mLink[i-1] + CurCount[i-1];
342
343 //mOffset[0] = mOffset[255] + CurCount[255];
344 mLink[0] = mLink[255] + CurCount[255];
345 //for(i=1;i<128;i++)    mOffset[i] = mOffset[i-1] + CurCount[i-1];
346 for(udword i=1;i<128;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];
347
348
349                         }
350
351                         // Perform Radix Sort
352                         const ubyte* InputBytes = (const ubyte*)input;
353                         InputBytes += BYTES_INC;
354                         if(INVALID_RANKS)
355                         {
356 //                              for(udword i=0;i<nb;i++)        mRanks2[mOffset[InputBytes[i<<2]]++] = i;
357                                 for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
358                                 VALIDATE_RANKS;
359                         }
360                         else
361                         {
362                                 const udword* Indices           = mRanks;
363                                 const udword* IndicesEnd        = &mRanks[nb];
364                                 while(Indices!=IndicesEnd)
365                                 {
366                                         udword id = *Indices++;
367 //                                      mRanks2[mOffset[InputBytes[id<<2]]++] = id;
368                                         *mLink[InputBytes[id<<2]]++ = id;
369                                 }
370                         }
371
372                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
373                         udword* Tmp = mRanks;
374                         mRanks = mRanks2;
375                         mRanks2 = Tmp;
376                 }
377         }
378         return *this;
379 }
380
381 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
382 /**
383  *      Main sort routine.
384  *      This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
385  *      \param          input                   [in] a list of floating-point values to sort
386  *      \param          nb                              [in] number of values to sort, must be < 2^31
387  *      \return         Self-Reference
388  *      \warning        only sorts IEEE floating-point values
389  */
390 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
391 RadixSort& RadixSort::Sort(const float* input2, udword nb)
392 {
393         // Checkings
394         if(!input2 || !nb || nb&0x80000000)     return *this;
395
396         // Stats
397         mTotalCalls++;
398
399         const udword* input = (const udword*)input2;
400
401         // Resize lists if needed
402         CheckResize(nb);
403
404 #ifdef RADIX_LOCAL_RAM
405         // Allocate histograms & offsets on the stack
406         udword mHistogram[256*4];
407 //      udword mOffset[256];
408         udword* mLink[256];
409 #endif
410
411         // Create histograms (counters). Counters for all passes are created in one run.
412         // Pros:        read input buffer once instead of four times
413         // Cons:        mHistogram is 4Kb instead of 1Kb
414         // Floating-point values are always supposed to be signed values, so there's only one code path there.
415         // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
416         // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
417         // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
418         // wouldn't work with mixed positive/negative values....
419         { CREATE_HISTOGRAMS(float, input2); }
420 /*
421         // Compute #negative values involved if needed
422         udword NbNegativeValues = 0;
423         // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
424         // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
425         // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
426         // ### is that ok on Apple ?!
427         udword* h3= &mHistogram[768];
428         for(udword i=128;i<256;i++)     NbNegativeValues += h3[i];      // 768 for last histogram, 128 for negative part
429 */
430         // Radix sort, j is the pass number (0=LSB, 3=MSB)
431         for(udword j=0;j<4;j++)
432         {
433                 // Should we care about negative values?
434                 if(j!=3)
435                 {
436                         // Here we deal with positive values only
437                         CHECK_PASS_VALIDITY(j);
438
439                         if(PerformPass)
440                         {
441                                 // Create offsets
442 //                              mOffset[0] = 0;
443                                 mLink[0] = mRanks2;
444 //                              for(udword i=1;i<256;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];
445                                 for(udword i=1;i<256;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];
446
447                                 // Perform Radix Sort
448                                 const ubyte* InputBytes = (const ubyte*)input;
449                                 InputBytes += BYTES_INC;
450                                 if(INVALID_RANKS)
451                                 {
452 //                                      for(i=0;i<nb;i++)       mRanks2[mOffset[InputBytes[i<<2]]++] = i;
453                                         for(udword i=0;i<nb;i++)        *mLink[InputBytes[i<<2]]++ = i;
454                                         VALIDATE_RANKS;
455                                 }
456                                 else
457                                 {
458                                         const udword* Indices           = mRanks;
459                                         const udword* IndicesEnd        = &mRanks[nb];
460                                         while(Indices!=IndicesEnd)
461                                         {
462                                                 udword id = *Indices++;
463 //                                              mRanks2[mOffset[InputBytes[id<<2]]++] = id;
464                                                 *mLink[InputBytes[id<<2]]++ = id;
465                                         }
466                                 }
467
468                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
469                                 udword* Tmp = mRanks;
470                                 mRanks = mRanks2;
471                                 mRanks2 = Tmp;
472                         }
473                 }
474                 else
475                 {
476                         // This is a special case to correctly handle negative values
477                         CHECK_PASS_VALIDITY(j);
478
479                         if(PerformPass)
480                         {
481 /*
482                                 // Create biased offsets, in order for negative numbers to be sorted as well
483 //                              mOffset[0] = NbNegativeValues;                                                                                          // First positive number takes place after the negative ones
484 //                              for(udword i=1;i<128;i++)               mOffset[i] = mOffset[i-1] + CurCount[i-1];      // 1 to 128 for positive numbers
485                                 mLink[0] = &mRanks2[NbNegativeValues];                                                                          // First positive number takes place after the negative ones
486                                 for(udword i=1;i<128;i++)               mLink[i] = mLink[i-1] + CurCount[i-1];          // 1 to 128 for positive numbers
487
488                                 // We must reverse the sorting order for negative numbers!
489 //                              mOffset[255] = 0;
490 //                              for(i=0;i<127;i++)              mOffset[254-i] = mOffset[255-i] + CurCount[255-i];      // Fixing the wrong order for negative values
491 //                              for(i=128;i<256;i++)    mOffset[i] += CurCount[i];                                                      // Fixing the wrong place for negative values
492                                 mLink[255] = mRanks2;
493                                 for(udword i=0;i<127;i++)       mLink[254-i] = mLink[255-i] + CurCount[255-i];          // Fixing the wrong order for negative values
494                                 for(udword i=128;i<256;i++)     mLink[i] += CurCount[i];                                                        // Fixing the wrong place for negative values
495 */
496
497 // From Kyle Hubert:
498
499 //mOffset[255] = CurCount[255];
500 mLink[255] = mRanks2 + CurCount[255];
501 //for(udword i=254;i>127;i--)   mOffset[i] = mOffset[i+1] + CurCount[i];
502 for(udword i=254;i>127;i--)     mLink[i] = mLink[i+1] + CurCount[i];
503 //mOffset[0] = mOffset[128] + CurCount[128];
504 mLink[0] = mLink[128] + CurCount[128];
505 //for(udword i=1;i<128;i++)     mOffset[i] = mOffset[i-1] + CurCount[i-1];
506 for(udword i=1;i<128;i++)       mLink[i] = mLink[i-1] + CurCount[i-1];
507
508                                 // Perform Radix Sort
509                                 if(INVALID_RANKS)
510                                 {
511                                         for(udword i=0;i<nb;i++)
512                                         {
513                                                 udword Radix = input[i]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
514                                                 // ### cmp to be killed. Not good. Later.
515 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = i;          // Number is positive, same as above
516 //                                              else                            mRanks2[--mOffset[Radix]] = i;          // Number is negative, flip the sorting order
517                                                 if(Radix<128)           *mLink[Radix]++ = i;            // Number is positive, same as above
518                                                 else                            *(--mLink[Radix]) = i;          // Number is negative, flip the sorting order
519                                         }
520                                         VALIDATE_RANKS;
521                                 }
522                                 else
523                                 {
524                                         for(udword i=0;i<nb;i++)
525                                         {
526                                                 udword Radix = input[mRanks[i]]>>24;                                                    // Radix byte, same as above. AND is useless here (udword).
527                                                 // ### cmp to be killed. Not good. Later.
528 //                                              if(Radix<128)           mRanks2[mOffset[Radix]++] = mRanks[i];          // Number is positive, same as above
529 //                                              else                            mRanks2[--mOffset[Radix]] = mRanks[i];          // Number is negative, flip the sorting order
530                                                 if(Radix<128)           *mLink[Radix]++ = mRanks[i];            // Number is positive, same as above
531                                                 else                            *(--mLink[Radix]) = mRanks[i];          // Number is negative, flip the sorting order
532                                         }
533                                 }
534                                 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
535                                 udword* Tmp = mRanks;
536                                 mRanks = mRanks2;
537                                 mRanks2 = Tmp;
538                         }
539                         else
540                         {
541                                 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
542                                 if(UniqueVal>=128)
543                                 {
544                                         if(INVALID_RANKS)
545                                         {
546                                                 // ###Possible?
547                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = nb-i-1;
548                                                 VALIDATE_RANKS;
549                                         }
550                                         else
551                                         {
552                                                 for(udword i=0;i<nb;i++)        mRanks2[i] = mRanks[nb-i-1];
553                                         }
554
555                                         // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
556                                         udword* Tmp = mRanks;
557                                         mRanks = mRanks2;
558                                         mRanks2 = Tmp;
559                                 }
560                         }
561                 }
562         }
563         return *this;
564 }
565
566 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
567 /**
568  *      Gets the ram used.
569  *      \return         memory used in bytes
570  */
571 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
572 udword RadixSort::GetUsedRam() const
573 {
574         udword UsedRam = sizeof(RadixSort);
575 #ifndef RADIX_LOCAL_RAM
576         UsedRam += 256*4*sizeof(udword);                        // Histograms
577         UsedRam += 256*sizeof(udword);                          // Offsets
578 #endif
579         UsedRam += 2*CURRENT_SIZE*sizeof(udword);       // 2 lists of indices
580         return UsedRam;
581 }
582
583 bool RadixSort::SetRankBuffers(udword* ranks0, udword* ranks1)
584 {
585         if(!ranks0 || !ranks1)  return false;
586
587         mRanks                  = ranks0;
588         mRanks2                 = ranks1;
589         mDeleteRanks    = false;
590
591         return true;
592 }