2 * ICE / OPCODE - Optimized Collision Detection
3 * http://www.codercorner.com/Opcode.htm
5 * Copyright (c) 2001-2008 Pierre Terdiman, pierre@codercorner.com
7 This software is provided 'as-is', without any express or implied warranty.
8 In no event will the authors be held liable for any damages arising from the use of this software.
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it freely,
11 subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
14 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
15 3. This notice may not be removed or altered from any source distribution.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
19 * Contains source code from the article "Radix Sort Revisited".
20 * \file IceRevisitedRadix.cpp
21 * \author Pierre Terdiman
22 * \date April, 4, 2000
24 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
26 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
28 * Revisited Radix Sort.
29 * This is my new radix routine:
30 * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
31 * - it creates all the histograms in one run instead of four
32 * - it sorts words faster than dwords and bytes faster than words
33 * - it correctly sorts negative floating-point values by patching the offsets
34 * - it automatically takes advantage of temporal coherence
35 * - multiple keys support is a side effect of temporal coherence
36 * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
39 * - 08.15.98: very first version
40 * - 04.04.00: recoded for the radix article
41 * - 12.xx.00: code lifting
42 * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
43 * - 10.11.01: added local ram support
44 * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
45 * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
46 * - ranks are not "reset" anymore, but implicit on first calls
47 * - 07.05.02: - offsets rewritten with one less indirection.
48 * - 11.03.02: - "bool" replaced with RadixHint enum
51 * \author Pierre Terdiman
53 * \date August, 15, 1998
55 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
59 - add an offset parameter between two input values (avoid some data recopy sometimes)
61 - 11 bits trick & 3 passes as Michael did
62 - prefetch stuff the day I have a P3
63 - make a version with 16-bits indices ?
66 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
70 using namespace IceCore;
72 #define INVALIDATE_RANKS mCurrentSize|=0x80000000
73 #define VALIDATE_RANKS mCurrentSize&=0x7fffffff
74 #define CURRENT_SIZE (mCurrentSize&0x7fffffff)
75 #define INVALID_RANKS (mCurrentSize&0x80000000)
77 #define CHECK_RESIZE(n) \
78 if(n!=mPreviousSize) \
80 if(n>mCurrentSize) Resize(n); \
85 #define CREATE_HISTOGRAMS(type, buffer) \
86 /* Clear counters/histograms */ \
87 ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
89 /* Prepare to count */ \
90 ubyte* p = (ubyte*)input; \
91 ubyte* pe = &p[nb*4]; \
92 udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
93 udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
94 udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
95 udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
97 bool AlreadySorted = true; /* Optimism... */ \
101 /* Prepare for temporal coherence */ \
102 type* Running = (type*)buffer; \
103 type PrevVal = *Running; \
107 /* Read input buffer in previous sorted order */ \
108 type Val = *Running++; \
109 /* Check whether already sorted or not */ \
110 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
111 /* Update for next iteration */ \
114 /* Create histograms */ \
115 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
118 /* If all input values are already sorted, we just have to return and leave the */ \
119 /* previous list unchanged. That way the routine may take advantage of temporal */ \
120 /* coherence, for example when used to sort transparent faces. */ \
124 for(udword i=0;i<nb;i++) mRanks[i] = i; \
130 /* Prepare for temporal coherence */ \
131 udword* Indices = mRanks; \
132 type PrevVal = (type)buffer[*Indices]; \
136 /* Read input buffer in previous sorted order */ \
137 type Val = (type)buffer[*Indices++]; \
138 /* Check whether already sorted or not */ \
139 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
140 /* Update for next iteration */ \
143 /* Create histograms */ \
144 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
147 /* If all input values are already sorted, we just have to return and leave the */ \
148 /* previous list unchanged. That way the routine may take advantage of temporal */ \
149 /* coherence, for example when used to sort transparent faces. */ \
150 if(AlreadySorted) { mNbHits++; return *this; } \
153 /* Else there has been an early out and we must finish computing the histograms */ \
156 /* Create histograms without the previous overhead */ \
157 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
160 #define CHECK_PASS_VALIDITY(pass) \
161 /* Shortcut to current counters */ \
162 udword* CurCount = &mHistogram[pass<<8]; \
164 /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
165 bool PerformPass = true; \
167 /* Check pass validity */ \
169 /* If all values have the same byte, sorting is useless. */ \
170 /* It may happen when sorting bytes or words instead of dwords. */ \
171 /* This routine actually sorts words faster than dwords, and bytes */ \
172 /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
173 /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
175 /* Get first byte */ \
176 ubyte UniqueVal = *(((ubyte*)input)+pass); \
178 /* Check that byte's counter */ \
179 if(CurCount[UniqueVal]==nb) PerformPass=false;
181 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
185 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
186 RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
188 #ifndef RADIX_LOCAL_RAM
189 // Allocate input-independent ram
190 mHistogram = new udword[256*4];
191 mOffset = new udword[256];
193 // Initialize indices
197 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
201 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
202 RadixSort::~RadixSort()
204 // Release everything
205 #ifndef RADIX_LOCAL_RAM
206 DELETEARRAY(mOffset);
207 DELETEARRAY(mHistogram);
209 DELETEARRAY(mRanks2);
213 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
215 * Resizes the inner lists.
216 * \param nb [in] new size (number of dwords)
217 * \return true if success
219 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
220 bool RadixSort::Resize(udword nb)
222 // Free previously used ram
223 DELETEARRAY(mRanks2);
226 // Get some fresh one
227 mRanks = new udword[nb]; CHECKALLOC(mRanks);
228 mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
233 inline_ void RadixSort::CheckResize(udword nb)
235 udword CurSize = CURRENT_SIZE;
238 if(nb>CurSize) Resize(nb);
244 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
247 * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
248 * \param input [in] a list of integer values to sort
249 * \param nb [in] number of values to sort, must be < 2^31
250 * \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
251 * \return Self-Reference
253 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
254 RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
257 if(!input || !nb || nb&0x80000000) return *this;
262 // Resize lists if needed
265 #ifdef RADIX_LOCAL_RAM
266 // Allocate histograms & offsets on the stack
267 udword mHistogram[256*4];
268 // udword mOffset[256];
272 // Create histograms (counters). Counters for all passes are created in one run.
273 // Pros: read input buffer once instead of four times
274 // Cons: mHistogram is 4Kb instead of 1Kb
275 // We must take care of signed/unsigned values for temporal coherence.... I just
276 // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
277 if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
278 else { CREATE_HISTOGRAMS(sdword, input); }
280 // Compute #negative values involved if needed
281 udword NbNegativeValues = 0;
282 if(hint==RADIX_SIGNED)
284 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
285 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
286 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
287 udword* h3= &mHistogram[768];
288 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
291 // Radix sort, j is the pass number (0=LSB, 3=MSB)
292 for(udword j=0;j<4;j++)
294 CHECK_PASS_VALIDITY(j);
296 // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
297 // not a problem, numbers are correctly sorted anyway.
300 // Should we care about negative values?
301 if(j!=3 || hint==RADIX_UNSIGNED)
303 // Here we deal with positive values only
307 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
309 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
313 // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
315 // Create biased offsets, in order for negative numbers to be sorted as well
316 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
317 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
318 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
319 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
321 // Fixing the wrong place for negative values
323 mLink[128] = mRanks2;
324 // for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
325 for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
328 // Perform Radix Sort
329 ubyte* InputBytes = (ubyte*)input;
333 // for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
334 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
339 udword* Indices = mRanks;
340 udword* IndicesEnd = &mRanks[nb];
341 while(Indices!=IndicesEnd)
343 udword id = *Indices++;
344 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
345 *mLink[InputBytes[id<<2]]++ = id;
349 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
350 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
356 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
359 * This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
360 * \param input [in] a list of floating-point values to sort
361 * \param nb [in] number of values to sort, must be < 2^31
362 * \return Self-Reference
363 * \warning only sorts IEEE floating-point values
365 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
366 RadixSort& RadixSort::Sort(const float* input2, udword nb)
369 if(!input2 || !nb || nb&0x80000000) return *this;
374 udword* input = (udword*)input2;
376 // Resize lists if needed
379 #ifdef RADIX_LOCAL_RAM
380 // Allocate histograms & offsets on the stack
381 udword mHistogram[256*4];
382 // udword mOffset[256];
386 // Create histograms (counters). Counters for all passes are created in one run.
387 // Pros: read input buffer once instead of four times
388 // Cons: mHistogram is 4Kb instead of 1Kb
389 // Floating-point values are always supposed to be signed values, so there's only one code path there.
390 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
391 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
392 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
393 // wouldn't work with mixed positive/negative values....
394 { CREATE_HISTOGRAMS(float, input2); }
396 // Compute #negative values involved if needed
397 udword NbNegativeValues = 0;
398 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
399 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
400 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
401 udword* h3= &mHistogram[768];
402 for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
404 // Radix sort, j is the pass number (0=LSB, 3=MSB)
405 for(udword j=0;j<4;j++)
407 // Should we care about negative values?
410 // Here we deal with positive values only
411 CHECK_PASS_VALIDITY(j);
418 // for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
419 for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
421 // Perform Radix Sort
422 ubyte* InputBytes = (ubyte*)input;
426 // for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
427 for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
432 udword* Indices = mRanks;
433 udword* IndicesEnd = &mRanks[nb];
434 while(Indices!=IndicesEnd)
436 udword id = *Indices++;
437 // mRanks2[mOffset[InputBytes[id<<2]]++] = id;
438 *mLink[InputBytes[id<<2]]++ = id;
442 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
443 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
448 // This is a special case to correctly handle negative values
449 CHECK_PASS_VALIDITY(j);
453 // Create biased offsets, in order for negative numbers to be sorted as well
454 // mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
455 mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
456 // for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
457 for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
459 // We must reverse the sorting order for negative numbers!
461 mLink[255] = mRanks2;
462 // for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
463 for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
464 // for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
465 for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
467 // Perform Radix Sort
470 for(udword i=0;i<nb;i++)
472 udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
473 // ### cmp to be killed. Not good. Later.
474 // if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
475 // else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
476 if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
477 else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
483 for(udword i=0;i<nb;i++)
485 udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
486 // ### cmp to be killed. Not good. Later.
487 // if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
488 // else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
489 if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
490 else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
493 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
494 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
498 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
504 for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
509 for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
512 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
513 udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
521 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
524 * \return memory used in bytes
526 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
527 udword RadixSort::GetUsedRam() const
529 udword UsedRam = sizeof(RadixSort);
530 #ifndef RADIX_LOCAL_RAM
531 UsedRam += 256*4*sizeof(udword); // Histograms
532 UsedRam += 256*sizeof(udword); // Offsets
534 UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices