1 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
3 * Contains source code from the article "Radix Sort Revisited".
4 * \file IceRevisitedRadix.cpp
5 * \author Pierre Terdiman
8 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
10 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
12 * Revisited Radix Sort.
13 * This is my new radix routine:
14 * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
15 * - it creates all the histograms in one run instead of four
16 * - it sorts words faster than dwords and bytes faster than words
17 * - it correctly sorts negative floating-point values by patching the offsets
18 * - it automatically takes advantage of temporal coherence
19 * - multiple keys support is a side effect of temporal coherence
20 * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
23 * - 08.15.98: very first version
24 * - 04.04.00: recoded for the radix article
25 * - 12.xx.00: code lifting
26 * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
27 * - 10.11.01: added local ram support
28 * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
29 * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
30 * - ranks are not "reset" anymore, but implicit on first calls
31 * - 07.05.02: offsets rewritten with one less indirection.
32 * - 11.03.02: "bool" replaced with RadixHint enum
33 * - 07.15.04: stack-based radix added
34 * - we want to use the radix sort but without making it static, and without allocating anything.
35 * - we internally allocate two arrays of ranks. Each of them has N udwords to sort N values.
36 * - 1Mb/2/sizeof(udword) = 131072 values max, at the same time.
37 * - 09.22.04: - adapted to MacOS by Chris Lamb
38 * - 01.12.06: - added optimizations suggested by Kyle Hubert
41 * \author Pierre Terdiman
43 * \date August, 15, 1998
45 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
49 - add an offset parameter between two input values (avoid some data recopy sometimes)
51 - prefetch stuff the day I have a P3
52 - make a version with 16-bits indices ?
55 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
59 using namespace Opcode;
61 #define INVALIDATE_RANKS mCurrentSize|=0x80000000
62 #define VALIDATE_RANKS mCurrentSize&=0x7fffffff
63 #define CURRENT_SIZE (mCurrentSize&0x7fffffff)
64 #define INVALID_RANKS (mCurrentSize&0x80000000)
66 #define CHECK_RESIZE(n) \
67 if(n!=mPreviousSize) \
69 if(n>mCurrentSize) Resize(n); \
74 #if defined(__APPLE__) || defined(_XBOX)
79 #define BYTES_INC (3-j)
88 #define CREATE_HISTOGRAMS(type, buffer) \
89 /* Clear counters/histograms */ \
90 ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
92 /* Prepare to count */ \
93 const ubyte* p = (const ubyte*)input; \
94 const ubyte* pe = &p[nb*4]; \
95 udword* h0= &mHistogram[H0_OFFSET]; /* Histogram for first pass (LSB) */ \
96 udword* h1= &mHistogram[H1_OFFSET]; /* Histogram for second pass */ \
97 udword* h2= &mHistogram[H2_OFFSET]; /* Histogram for third pass */ \
98 udword* h3= &mHistogram[H3_OFFSET]; /* Histogram for last pass (MSB) */ \
100 bool AlreadySorted = true; /* Optimism... */ \
104 /* Prepare for temporal coherence */ \
105 type* Running = (type*)buffer; \
106 type PrevVal = *Running; \
110 /* Read input buffer in previous sorted order */ \
111 type Val = *Running++; \
112 /* Check whether already sorted or not */ \
113 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
114 /* Update for next iteration */ \
117 /* Create histograms */ \
118 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
121 /* If all input values are already sorted, we just have to return and leave the */ \
122 /* previous list unchanged. That way the routine may take advantage of temporal */ \
123 /* coherence, for example when used to sort transparent faces. */ \
127 for(udword i=0;i<nb;i++) mRanks[i] = i; \
133 /* Prepare for temporal coherence */ \
134 const udword* Indices = mRanks; \
135 type PrevVal = (type)buffer[*Indices]; \
139 /* Read input buffer in previous sorted order */ \
140 type Val = (type)buffer[*Indices++]; \
141 /* Check whether already sorted or not */ \
142 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
143 /* Update for next iteration */ \
146 /* Create histograms */ \
147 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
150 /* If all input values are already sorted, we just have to return and leave the */ \
151 /* previous list unchanged. That way the routine may take advantage of temporal */ \
152 /* coherence, for example when used to sort transparent faces. */ \
153 if(AlreadySorted) { mNbHits++; return *this; } \
156 /* Else there has been an early out and we must finish computing the histograms */ \
159 /* Create histograms without the previous overhead */ \
160 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
163 #define CHECK_PASS_VALIDITY(pass) \
164 /* Shortcut to current counters */ \
165 const udword* CurCount = &mHistogram[pass<<8]; \
167 /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
168 bool PerformPass = true; \
170 /* Check pass validity */ \
172 /* If all values have the same byte, sorting is useless. */ \
173 /* It may happen when sorting bytes or words instead of dwords. */ \
174 /* This routine actually sorts words faster than dwords, and bytes */ \
175 /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
176 /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
178 /* Get first byte */ \
179 ubyte UniqueVal = *(((ubyte*)input)+pass); \
181 /* Check that byte's counter */ \
182 if(CurCount[UniqueVal]==nb) PerformPass=false;
184 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
188 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
189 RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0), mDeleteRanks(true)
191 #ifndef RADIX_LOCAL_RAM
192 // Allocate input-independent ram
193 mHistogram = ICE_ALLOC(sizeof(udword)*256*4);
194 mOffset = ICE_ALLOC(sizeof(udword)*256);
196 // Initialize indices
200 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
204 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
205 RadixSort::~RadixSort()
207 // Release everything
208 #ifndef RADIX_LOCAL_RAM
210 ICE_FREE(mHistogram);
219 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
221 * Resizes the inner lists.
222 * \param nb [in] new size (number of dwords)
223 * \return true if success
225 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
226 bool RadixSort::Resize(udword nb)
230 // Free previously used ram
234 // Get some fresh one
235 mRanks = (udword*)ICE_ALLOC(sizeof(udword)*nb); CHECKALLOC(mRanks);
236 mRanks2 = (udword*)ICE_ALLOC(sizeof(udword)*nb); CHECKALLOC(mRanks2);
241 inline_ void RadixSort::CheckResize(udword nb)
243 udword CurSize = CURRENT_SIZE;
246 if(nb>CurSize) Resize(nb);
252 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
255 * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
256 * \param input [in] a list of integer values to sort
257 * \param nb [in] number of values to sort, must be < 2^31
258 * \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
259 * \return Self-Reference
261 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
262 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
265 if(!input || !nb || nb&0x80000000) return *this;
270 // Resize lists if needed
273 #ifdef RADIX_LOCAL_RAM
274 // Allocate histograms & offsets on the stack
275 udword mHistogram[256*4];
276 // udword mOffset[256];
280 // Create histograms (counters). Counters for all passes are created in one run.
281 // Pros: read input buffer once instead of four times
282 // Cons: mHistogram is 4Kb instead of 1Kb
283 // We must take care of signed/unsigned values for temporal coherence.... I just
284 // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
285 if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
286 else { CREATE_HISTOGRAMS(sdword, input); }
288 // Compute #negative values involved if needed
289 udword NbNegativeValues = 0;
290 if(hint==RADIX_SIGNED)
292 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
293 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
294 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
295 udword* h3= &mHistogram[768];
296 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
299 // Radix sort, j is the pass number (0=LSB, 3=MSB)
300 for(udword j=0;j<4;j++)
302 CHECK_PASS_VALIDITY(j);
304 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
305 // not a problem, numbers are correctly sorted anyway.
308 // Should we care about negative values?
309 if(j!=3 || hint==RADIX_UNSIGNED)
311 // Here we deal with positive values only
315 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
317 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
321 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
323 // Create biased offsets, in order for negative numbers to be sorted as well
324 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
325 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
326 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
327 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
329 // Fixing the wrong place for negative values
331 // for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
332 mLink[128] = mRanks2;
333 for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
339 mLink[128] = mRanks2;
340 //for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
341 for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
343 //mOffset[0] = mOffset[255] + CurCount[255];
344 mLink[0] = mLink[255] + CurCount[255];
345 //for(i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
346 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
351 // Perform Radix Sort
352 const ubyte* InputBytes = (const ubyte*)input;
353 InputBytes += BYTES_INC;
356 // for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
357 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
362 const udword* Indices = mRanks;
363 const udword* IndicesEnd = &mRanks[nb];
364 while(Indices!=IndicesEnd)
366 udword id = *Indices++;
367 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
368 *mLink[InputBytes[id<<2]]++ = id;
372 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
373 udword* Tmp = mRanks;
381 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
384 * This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
385 * \param input [in] a list of floating-point values to sort
386 * \param nb [in] number of values to sort, must be < 2^31
387 * \return Self-Reference
388 * \warning only sorts IEEE floating-point values
390 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
391 RadixSort& RadixSort::Sort(const float* input2, udword nb)
394 if(!input2 || !nb || nb&0x80000000) return *this;
399 const udword* input = (const udword*)input2;
401 // Resize lists if needed
404 #ifdef RADIX_LOCAL_RAM
405 // Allocate histograms & offsets on the stack
406 udword mHistogram[256*4];
407 // udword mOffset[256];
411 // Create histograms (counters). Counters for all passes are created in one run.
412 // Pros: read input buffer once instead of four times
413 // Cons: mHistogram is 4Kb instead of 1Kb
414 // Floating-point values are always supposed to be signed values, so there's only one code path there.
415 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
416 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
417 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
418 // wouldn't work with mixed positive/negative values....
419 { CREATE_HISTOGRAMS(float, input2); }
421 // Compute #negative values involved if needed
422 udword NbNegativeValues = 0;
423 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
424 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
425 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
426 // ### is that ok on Apple ?!
427 udword* h3= &mHistogram[768];
428 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
430 // Radix sort, j is the pass number (0=LSB, 3=MSB)
431 for(udword j=0;j<4;j++)
433 // Should we care about negative values?
436 // Here we deal with positive values only
437 CHECK_PASS_VALIDITY(j);
444 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
445 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
447 // Perform Radix Sort
448 const ubyte* InputBytes = (const ubyte*)input;
449 InputBytes += BYTES_INC;
452 // for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
453 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
458 const udword* Indices = mRanks;
459 const udword* IndicesEnd = &mRanks[nb];
460 while(Indices!=IndicesEnd)
462 udword id = *Indices++;
463 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
464 *mLink[InputBytes[id<<2]]++ = id;
468 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
469 udword* Tmp = mRanks;
476 // This is a special case to correctly handle negative values
477 CHECK_PASS_VALIDITY(j);
482 // Create biased offsets, in order for negative numbers to be sorted as well
483 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
484 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
485 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
486 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
488 // We must reverse the sorting order for negative numbers!
490 // for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
491 // for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
492 mLink[255] = mRanks2;
493 for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
494 for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
499 //mOffset[255] = CurCount[255];
500 mLink[255] = mRanks2 + CurCount[255];
501 //for(udword i=254;i>127;i--) mOffset[i] = mOffset[i+1] + CurCount[i];
502 for(udword i=254;i>127;i--) mLink[i] = mLink[i+1] + CurCount[i];
503 //mOffset[0] = mOffset[128] + CurCount[128];
504 mLink[0] = mLink[128] + CurCount[128];
505 //for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
506 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
508 // Perform Radix Sort
511 for(udword i=0;i<nb;i++)
513 udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
514 // ### cmp to be killed. Not good. Later.
515 // if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
516 // else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
517 if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
518 else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
524 for(udword i=0;i<nb;i++)
526 udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
527 // ### cmp to be killed. Not good. Later.
528 // if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
529 // else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
530 if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
531 else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
534 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
535 udword* Tmp = mRanks;
541 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
547 for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
552 for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
555 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
556 udword* Tmp = mRanks;
566 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
569 * \return memory used in bytes
571 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
572 udword RadixSort::GetUsedRam() const
574 udword UsedRam = sizeof(RadixSort);
575 #ifndef RADIX_LOCAL_RAM
576 UsedRam += 256*4*sizeof(udword); // Histograms
577 UsedRam += 256*sizeof(udword); // Offsets
579 UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
583 bool RadixSort::SetRankBuffers(udword* ranks0, udword* ranks1)
585 if(!ranks0 || !ranks1) return false;
589 mDeleteRanks = false;