25 #include "ndCoreStdafx.h"
27 #include "ndProfiler.h"
28 #include "ndThreadPool.h"
// ndSort: in-place comparison sort of the first `elements` entries of `array`.
//
// Template parameters:
//   T           - element type; it is copied (pivot, tmp) and exchanged with ndSwap.
//   dCompareKey - default-constructed comparator; Compare(a, b, context) is read as
//                 negative / zero / positive for a before / equal to / after b.
// Parameters:
//   array    - buffer that is reordered in place.
//   elements - number of entries to sort.
//   context  - opaque pointer handed unchanged to every Compare call.
//
// Visible structure: ranges wider than batchSize are partitioned around a
// median-of-three pivot using an explicit stack of [lo, hi] pairs, then a
// shifting pass over the whole array puts the remaining short runs in order.
30 template <
class T,
class dCompareKey>
31 void ndSort(T*
const array, ndInt32 elements,
void*
const context)
// Ranges no wider than this are left for the final shifting pass.
34 const ndInt32 batchSize = 8;
// Pending [lo, hi] index pairs still to be partitioned; room for 128 pairs.
35 ndInt32 stack[128][2];
// Upper bound of the initial range.
// NOTE(review): stack[0][0] is presumably set to 0 right before this -- confirm.
38 stack[0][1] = elements - 1;
39 ndInt32 stackIndex = 1;
40 const dCompareKey comparator;
// Fetch the next pending range.
44 ndInt32 lo = stack[stackIndex][0];
45 ndInt32 hi = stack[stackIndex][1];
// Only ranges wider than batchSize are partitioned.
46 if ((hi - lo) > batchSize)
48 ndInt32 mid = (lo + hi) >> 1;
// Three compare-and-swap steps order array[lo], array[mid], array[hi],
// leaving the median of the three in the middle slot.
49 if (comparator.Compare(array[lo], array[mid], context) > 0)
51 ndSwap(array[lo], array[mid]);
53 if (comparator.Compare(array[mid], array[hi], context) > 0)
55 ndSwap(array[mid], array[hi]);
57 if (comparator.Compare(array[lo], array[mid], context) > 0)
59 ndSwap(array[lo], array[mid]);
// The pivot is held by value, so later swaps cannot change it.
63 const T pivot(array[mid]);
// Scan while array[i] compares below the pivot.
66 while (comparator.Compare(array[i], pivot, context) < 0)
// Scan while array[j] compares above the pivot.
70 while (comparator.Compare(array[j], pivot, context) > 0)
// Exchange the out-of-place pair found by the two scans.
77 ndSwap(array[i], array[j]);
// Queue the sub-range [i, hi] for a later pass.
85 stack[stackIndex][0] = i;
86 stack[stackIndex][1] = hi;
// Queue the sub-range [lo, j] for a later pass.
91 stack[stackIndex][0] = lo;
92 stack[stackIndex][1] = j;
// Guard against overflowing the fixed-size work stack (128 pairs).
95 ndAssert(stackIndex < ndInt32(
sizeof(stack) / (2 *
sizeof(stack[0][0]))));
// NOTE(review): stride is presumably clamped to `elements` for short arrays -- confirm.
99 ndInt32 stride = batchSize + 1;
100 if (elements < stride)
// Move the smallest of the first `stride` entries into array[0]. The shifting
// loop below has no lower bound on j, so this entry presumably acts as its
// stopping point -- confirm.
104 for (ndInt32 i = 1; i < stride; ++i)
106 if (comparator.Compare(array[0], array[i], context) > 0)
108 ndSwap(array[0], array[i]);
// Final pass over every entry after the first.
112 for (ndInt32 i = 1; i < elements; ++i)
// Entry being placed; held by value because its slot is overwritten below.
115 const T tmp(array[i]);
// Shift each entry that compares above tmp one slot to the right.
116 for (; comparator.Compare(array[j - 1], tmp, context) > 0; --j)
119 array[j] = array[j - 1];
// Verification pass: every adjacent pair must be in non-decreasing order.
126 for (ndInt32 i = 0; i < (elements - 1); ++i)
128 ndAssert(comparator.Compare(array[i], array[i + 1], context) <= 0);
// ndCountingSortInPlace (single-threaded): reorders `array` by integer key,
// using `scratchBuffer` as the temporary copy that the scatter pass reads from.
//
// Template parameters:
//   T             - element type, copied by assignment.
//   ndEvaluateKey - constructed from `context`; GetKey(entry) yields the bucket index.
//   keyBitSize    - key width in bits; there are (1 << keyBitSize) buckets.
// Parameters:
//   array         - input and output buffer of `size` entries.
//   scratchBuffer - caller-provided buffer; entries [0, size) are overwritten.
//   size          - number of entries.
//   prefixScanOut - receives (1 << keyBitSize) + 1 values copied from the scan table.
//   context       - opaque pointer forwarded to the ndEvaluateKey constructor.
133 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
134 void ndCountingSortInPlace(T*
const array, T*
const scratchBuffer, ndInt32 size, ndUnsigned32*
const prefixScanOut,
void*
const context)
137 ndAssert(keyBitSize > 0);
// One slot per bucket plus one extra trailing slot.
138 ndUnsigned32 scans[(1 << keyBitSize) + 1];
139 ndEvaluateKey evaluator(context);
// Pass over every bucket slot (presumably clearing the counters -- confirm).
140 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Histogram pass: copy each entry to scratchBuffer and evaluate its key.
144 for (ndInt32 i = 0; i < size; ++i)
146 const T& entry = array[i];
147 scratchBuffer[i] = entry;
148 const ndInt32 key = evaluator.GetKey(entry);
// Keys must fit in keyBitSize bits.
150 ndAssert(key < (1 << keyBitSize));
// Running total over the buckets (presumably turning counts into start offsets -- confirm).
154 ndUnsigned32 sum = 0;
155 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
157 ndUnsigned32 partialSum = scans[i];
// Publish the scan table, including the extra trailing slot, to the caller.
164 for (ndInt32 i = 0; i < ((1 << keyBitSize) + 1); ++i)
166 prefixScanOut[i] = scans[i];
// Scatter pass: read entries in their original order from scratchBuffer and
// write each one to the next free slot of its bucket in array.
170 for (ndInt32 i = 0; i < size; ++i)
172 const T& entry = scratchBuffer[i];
173 const ndInt32 key = evaluator.GetKey(entry);
175 ndAssert(key < (1 << keyBitSize));
176 const ndUnsigned32 index = scans[key];
177 array[index] = entry;
178 scans[key] = index + 1;
// Verification pass. The ordered result lives in `array`; `scratchBuffer` still
// holds the entries in their original order, so the check must read `array`.
183 for (ndInt32 i = size - 2; i >= 0; --i)
185 ndAssert(evaluator.GetKey(array[i]) <= evaluator.GetKey(array[i + 1]));
// ndCountingSortInPlace (thread-pool overload): reorders `array` by integer key,
// splitting both the histogram pass and the scatter pass across the pool.
//
// Template parameters:
//   T             - element type, copied by assignment.
//   ndEvaluateKey - constructed from `context`; GetKey(entry) yields the bucket index.
//   keyBitSize    - key width in bits; there are (1 << keyBitSize) buckets.
// Parameters:
//   threadPool    - pool that runs the two passes via ParallelExecute.
//   array         - input and output buffer of `size` entries.
//   scratchBuffer - caller-provided buffer; entries [0, size) are overwritten.
//   size          - number of entries.
//   prefixScanOut - receives (1 << keyBitSize) values from `sum`, followed by `size`.
//   context       - opaque pointer forwarded to the ndEvaluateKey constructor.
190 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
191 void ndCountingSortInPlace(
ndThreadPool& threadPool, T*
const array, T*
const scratchBuffer, ndInt32 size, ndUnsigned32*
const prefixScanOut,
void*
const context)
194 ndEvaluateKey evaluator(context);
195 const ndInt32 threadCount = threadPool.GetThreadCount();
// Per-bucket totals accumulated over all threads.
197 ndUnsigned32*
const sum = ndAlloca(ndUnsigned32, 1 << keyBitSize);
// One table of (1 << keyBitSize) slots per thread, stored back to back.
198 ndUnsigned32*
const scans = ndAlloca(ndUnsigned32, threadCount * (1 << keyBitSize));
// Histogram pass. The lambda's threadCount parameter shadows the local above.
200 auto ndBuildHistogram = ndMakeObject::ndFunction([&array, &scratchBuffer, size, &evaluator, &scans](ndInt32 threadIndex, ndInt32 threadCount)
202 D_TRACKTIME_NAMED(ndBuildHistogram);
// This thread's own slice of the shared table.
203 ndUnsigned32*
const scan = &scans[threadIndex * (1 << keyBitSize)];
// Pass over every slot of the slice (presumably clearing the counters -- confirm).
205 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Index range [m_start, m_end) handled by this thread.
210 ndStartEnd startEnd(size, threadIndex, threadCount);
211 for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
213 const T& entry = array[i];
214 const ndInt32 key = evaluator.GetKey(entry);
// Keys must fit in keyBitSize bits.
216 ndAssert(key < (1 << keyBitSize));
// Keep a copy of the entry for the scatter pass to read.
218 scratchBuffer[i] = entry;
// Scatter pass: each thread walks the same index range it used for the histogram.
222 auto ndShuffleArray = ndMakeObject::ndFunction([&array, &scratchBuffer, size, &evaluator, &scans, &sum](ndInt32 threadIndex, ndInt32 threadCount)
224 D_TRACKTIME_NAMED(ndShuffleArray);
225 ndUnsigned32*
const scan = &scans[threadIndex * (1 << keyBitSize)];
// Pass over every slot of the slice (presumably combining it with `sum`, which is captured -- confirm).
227 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
231 ndStartEnd startEnd(size, threadIndex, threadCount);
232 for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
234 const T& entry = scratchBuffer[i];
235 const ndInt32 key = evaluator.GetKey(entry);
237 ndAssert(key < (1 << keyBitSize));
// Write the entry to the next free slot of its bucket and advance that slot.
238 const ndUnsigned32 index = scan[key];
239 array[index] = entry;
240 scan[key] = index + 1;
244 threadPool.ParallelExecute(ndBuildHistogram);
246 ndInt32 bits = keyBitSize;
// Pass over every bucket of `sum` (presumably clearing it -- confirm).
249 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Fold every thread's table into the per-bucket totals.
253 for (ndInt32 j = 0; j < threadCount; ++j)
255 ndUnsigned32*
const scan = &scans[j * (1 << keyBitSize)];
256 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
258 ndUnsigned32 partialSum = scan[i];
260 sum[i] += partialSum;
// Running total across the buckets.
264 ndUnsigned32 accSum = 0;
265 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
267 ndUnsigned32 partialSum = sum[i];
269 accSum += partialSum;
// Publish the per-bucket values, then the element count in the final slot.
279 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
281 prefixScanOut[i] = sum[i];
283 prefixScanOut[1 << keyBitSize] = ndUnsigned32(size);
286 threadPool.ParallelExecute(ndShuffleArray);
// Verification pass: adjacent keys in the reordered array must be non-decreasing.
290 for (ndInt32 i = 1; i < size; ++i)
292 ndInt32 key0 = evaluator.GetKey(array[i - 1]);
293 ndInt32 key1 = evaluator.GetKey(array[i + 0]);
294 ndAssert(key0 <= key1);
// ndCountingSort (single-threaded): writes the entries of `srcArray` into
// `dstArray` grouped by integer key; `srcArray` is only read.
//
// Template parameters:
//   T             - element type, copied by assignment.
//   ndEvaluateKey - constructed from `context`; GetKey(entry) yields the bucket index.
//   keyBitSize    - key width in bits; there are (1 << keyBitSize) buckets.
// Parameters:
//   srcArray      - input buffer of `size` entries.
//   dstArray      - output buffer; entries are written at indices taken from the scan table.
//   size          - number of entries.
//   prefixScanOut - receives (1 << keyBitSize) + 1 values copied from the scan table.
//   context       - opaque pointer forwarded to the ndEvaluateKey constructor.
299 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
300 void ndCountingSort(
const T*
const srcArray, T*
const dstArray, ndInt32 size, ndUnsigned32*
const prefixScanOut,
void*
const context)
303 ndAssert(keyBitSize > 0);
// One slot per bucket plus one extra trailing slot.
304 ndUnsigned32 scans[(1 << keyBitSize) + 1];
305 ndEvaluateKey evaluator(context);
// Pass over every bucket slot (presumably clearing the counters -- confirm).
306 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Histogram pass: evaluate the key of every source entry.
310 for (ndInt32 i = 0; i < size; ++i)
312 const T& entry = srcArray[i];
313 const ndInt32 key = evaluator.GetKey(entry);
// Keys must fit in keyBitSize bits.
315 ndAssert(key < (1 << keyBitSize));
// Running total over the buckets (presumably turning counts into start offsets -- confirm).
319 ndUnsigned32 sum = 0;
320 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
322 ndUnsigned32 partialSum = scans[i];
// Publish the scan table, including the extra trailing slot, to the caller.
329 for (ndInt32 i = 0; i < ((1 << keyBitSize) + 1); ++i)
331 prefixScanOut[i] = scans[i];
// Scatter pass: read the source in order and write each entry to the next
// free slot of its bucket in dstArray.
335 for (ndInt32 i = 0; i < size; ++i)
337 const T& entry = srcArray[i];
338 const ndInt32 key = evaluator.GetKey(entry);
340 ndAssert(key < (1 << keyBitSize));
341 const ndUnsigned32 index = scans[key];
342 dstArray[index] = entry;
343 scans[key] = index + 1;
// Verification pass over the output. This function has no `scratchBuffer`;
// the ordered entries are in `dstArray`, so that is the buffer to check.
348 for (ndInt32 i = size - 2; i >= 0; --i)
350 ndAssert(evaluator.GetKey(dstArray[i]) <= evaluator.GetKey(dstArray[i + 1]));
// ndCountingSort (thread-pool overload): writes the entries of `srcArray` into
// `dstArray` grouped by integer key, splitting both the histogram pass and the
// scatter pass across the pool. `srcArray` is only read.
//
// Template parameters:
//   T             - element type, copied by assignment.
//   ndEvaluateKey - constructed from `context`; GetKey(entry) yields the bucket index.
//   keyBitSize    - key width in bits; there are (1 << keyBitSize) buckets.
// Parameters:
//   threadPool    - pool that runs the two passes via ParallelExecute.
//   srcArray      - input buffer of `size` entries.
//   dstArray      - output buffer.
//   size          - number of entries.
//   prefixScanOut - receives (1 << keyBitSize) values from `sum`, followed by `size`.
//   context       - opaque pointer forwarded to the ndEvaluateKey constructor.
355 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
356 void ndCountingSort(
ndThreadPool& threadPool,
const T*
const srcArray, T*
const dstArray, ndInt32 size, ndUnsigned32*
const prefixScanOut,
void*
const context)
359 ndEvaluateKey evaluator(context);
360 const ndInt32 threadCount = threadPool.GetThreadCount();
// Per-bucket totals accumulated over all threads.
362 ndUnsigned32*
const sum = ndAlloca(ndUnsigned32, 1 << keyBitSize);
// One table of (1 << keyBitSize) slots per thread, stored back to back.
363 ndUnsigned32*
const scans = ndAlloca(ndUnsigned32, threadCount * (1 << keyBitSize));
// Histogram pass. The lambda's threadCount parameter shadows the local above.
365 auto ndBuildHistogram = ndMakeObject::ndFunction([&srcArray, size, &evaluator, &scans](ndInt32 threadIndex, ndInt32 threadCount)
367 D_TRACKTIME_NAMED(ndBuildHistogram);
// This thread's own slice of the shared table.
368 ndUnsigned32*
const scan = &scans[threadIndex * (1 << keyBitSize)];
// Pass over every slot of the slice (presumably clearing the counters -- confirm).
370 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Index range [m_start, m_end) handled by this thread.
375 ndStartEnd startEnd(size, threadIndex, threadCount);
376 for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
378 const T& entry = srcArray[i];
379 const ndInt32 key = evaluator.GetKey(entry);
// Keys must fit in keyBitSize bits.
381 ndAssert(key < (1 << keyBitSize));
// Scatter pass: each thread walks the same index range it used for the histogram.
386 auto ndShuffleArray = ndMakeObject::ndFunction([&srcArray, &dstArray, size, &evaluator, &scans, &sum](ndInt32 threadIndex, ndInt32 threadCount)
388 D_TRACKTIME_NAMED(ndShuffleArray);
389 ndUnsigned32*
const scan = &scans[threadIndex * (1 << keyBitSize)];
// Pass over every slot of the slice (presumably combining it with `sum`, which is captured -- confirm).
391 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
396 ndStartEnd startEnd(size, threadIndex, threadCount);
397 for (ndInt32 i = startEnd.m_start; i < startEnd.m_end; ++i)
399 const T& entry = srcArray[i];
400 const ndInt32 key = evaluator.GetKey(entry);
402 ndAssert(key < (1 << keyBitSize));
// Write the entry to the next free slot of its bucket and advance that slot.
403 const ndUnsigned32 index = scan[key];
404 dstArray[index] = entry;
405 scan[key] = index + 1;
409 threadPool.ParallelExecute(ndBuildHistogram);
411 ndInt32 bits = keyBitSize;
// Pass over every bucket of `sum` (presumably clearing it -- confirm).
414 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
// Fold every thread's table into the per-bucket totals.
418 for (ndInt32 j = 0; j < threadCount; ++j)
420 ndUnsigned32*
const scan = &scans[j * (1 << keyBitSize)];
421 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
423 ndUnsigned32 partialSum = scan[i];
425 sum[i] += partialSum;
// Running total across the buckets.
429 ndUnsigned32 accSum = 0;
430 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
432 ndUnsigned32 partialSum = sum[i];
434 accSum += partialSum;
// Publish the per-bucket values, then the element count in the final slot.
444 for (ndInt32 i = 0; i < (1 << keyBitSize); ++i)
446 prefixScanOut[i] = sum[i];
448 prefixScanOut[1 << keyBitSize] = ndUnsigned32(size);
451 threadPool.ParallelExecute(ndShuffleArray);
// Verification pass over the output. This function has no `scratchBuffer`;
// the ordered entries are in `dstArray`, so that is the buffer to check.
455 for (ndInt32 i = 1; i < size; ++i)
457 ndInt32 key0 = evaluator.GetKey(dstArray[i - 1]);
458 ndInt32 key1 = evaluator.GetKey(dstArray[i + 0]);
459 ndAssert(key0 <= key1);
// Container-level convenience wrapper for the thread-pool counting sort.
// It sorts from `array` into `scratchBuffer` through the pointer-based
// ndCountingSort overload, then swaps the two containers so that `array`
// refers to the sorted entries.
// NOTE(review): the call writes array.GetCount() entries starting at
// &scratchBuffer[0]; presumably callers size scratchBuffer beforehand -- confirm.
464 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
468 ndCountingSort<T, ndEvaluateKey, keyBitSize>(threadPool, &array[0], &scratchBuffer[0], array.
GetCount(), prefixScanOut, context);
// Exchange the containers: the sorted data was produced in scratchBuffer.
469 array.
Swap(scratchBuffer);
// Container-level convenience wrapper for the single-threaded counting sort.
//
// Parameters:
//   array         - entries to sort; after the call it refers to the sorted data.
//   scratchBuffer - destination of the sort; swapped with `array` at the end.
//   prefixScanOut - forwarded to the pointer-based overload, which fills it.
//   context       - opaque pointer forwarded to the pointer-based overload.
// NOTE(review): the call writes array.GetCount() entries starting at
// &scratchBuffer[0]; presumably callers size scratchBuffer beforehand -- confirm.
472 template <
class T,
class ndEvaluateKey, ndInt32 keyBitSize>
473 void ndCountingSort(
ndArray<T>& array,
ndArray<T>& scratchBuffer, ndUnsigned32*
const prefixScanOut,
void*
const context)
// Sort from array into scratchBuffer using the raw-pointer overload.
476 ndCountingSort<T, ndEvaluateKey, keyBitSize>(&array[0], &scratchBuffer[0], array.
GetCount(), prefixScanOut, context);
// Exchange the containers: the sorted data was produced in scratchBuffer.
477 array.
Swap(scratchBuffer);