1 // Copyright 2014 The Native Client Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
11 // @EXEMPTION[include]
12 #include "./framework.h"
13 #include "./thread_pool.h"
16 using sdk_util::ThreadPool; // For sdk_util::ThreadPool
// Byte alignment requested for the cell buffers; the row stride is also
// rounded up to a multiple of this value.
20 const int kCellAlignment = 0x10;
// Width and height of the simulated field, in cells.
21 const int kWidth = 2048;
22 const int kHeight = 2048;
24 #if defined(HAVE_SIMD)
25 // 128-bit vector type: 16 lanes of uint8_t. aligned(1) lowers the type's
// alignment to one byte; the simulation stores through this type at an
// address that is one byte past the (16-byte aligned) start of a row.
26 typedef uint8_t u8x16_t __attribute__((vector_size(16)))
27 __attribute__((aligned(1)));
28 // TODO(dschuff): remove aligned(1) attribute above once nacl-clang has
29 // same vector alignment rules as pnacl.
31 // Helper function to broadcast x across all 16 lanes of a u8x16_t vector.
32 INLINE u8x16_t broadcast(uint8_t x) {
33 u8x16_t r = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
// Simulates row y of the field, reading cell_in_ and writing cell_out_.
45 void wSimulate(int y);
// Static adapter with the (int, void*) shape handed to the thread pool's
// Dispatch(); data is the Life instance on which wSimulate(y) is invoked.
46 static void wSimulateEntry(int y, void* data);
59 // Query system for number of processors via sysconf()
60 int num_threads = sysconf(_SC_NPROCESSORS_ONLN);
// With fewer than two processors reported (which also covers sysconf()'s -1
// error result) no pool is created and workers_ is set to NULL.
61 workers_ = num_threads < 2 ? NULL : new ThreadPool(num_threads);
// Round the row stride up to the next multiple of kCellAlignment.
62 cell_stride_ = (kWidth + kCellAlignment - 1) &
63 ~(kCellAlignment - 1);
// Total size in bytes of one cell buffer (one byte per cell).
64 size_ = cell_stride_ * kHeight;
66 // Allocate the input and output cell buffers.
67 void* in_buffer = NULL;
68 void* out_buffer = NULL;
69 // alloc buffers aligned on 16 bytes
// NOTE(review): the posix_memalign() return values are ignored, so an
// allocation failure is not detected here.
70 posix_memalign(&in_buffer, kCellAlignment, size_);
71 posix_memalign(&out_buffer, kCellAlignment, size_);
72 cell_in_ = (uint8_t*) in_buffer;
73 cell_out_ = (uint8_t*) out_buffer;
// Computes the next state of row y. Rows y - 1, y and y + 1 of cell_in_ are
// read and the new cells for row y are written to cell_out_. When HAVE_SIMD is
// defined the row is processed 16 cells at a time, and a scalar loop finishes
// whatever the SIMD loop did not cover.
82 void Life::wSimulate(int y) {
83 // Next-state table indexed by count = 2 * (live neighbors) + (center cell).
84 // Only 5 (2 neighbors, alive), 6 (3, dead) and 7 (3, alive) give a live cell.
85 const uint8_t kIsAlive[] = {
86 0, 0, 0, 0, 0, 1, 1, 1, 0,
87 0, 0, 0, 0, 0, 0, 0, 0, 0
90 // Don't run simulation on top and bottom borders
91 if (y < 1 || y >= kHeight - 1)
94 // Do neighbor summation; apply rules, output pixel color. Note that a 1 cell
95 // wide perimeter is excluded from the simulation update; only cells from
96 // x = 1 to x < width - 1 and y = 1 to y < height - 1 are updated.
// src0, src1 and src2 start at column 0 of the rows above, at and below y;
// dst starts one cell in because the left border column is never written.
97 uint8_t *src0 = (cell_in_ + (y - 1) * cell_stride_);
98 uint8_t *src1 = src0 + cell_stride_;
99 uint8_t *src2 = src1 + cell_stride_;
100 uint8_t *dst = (cell_out_ + y * cell_stride_) + 1;
103 #if defined(HAVE_SIMD)
// Constant vectors used by the lane-wise comparisons and the final mask.
104 const u8x16_t kOne = broadcast(1);
105 const u8x16_t kFour = broadcast(4);
106 const u8x16_t kEight = broadcast(8);
// Preload the first 32 cells (two 16-lane vectors) of each source row.
109 u8x16_t src00 = *reinterpret_cast<u8x16_t*>(&src0[0]);
110 u8x16_t src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
111 u8x16_t src10 = *reinterpret_cast<u8x16_t*>(&src1[0]);
112 u8x16_t src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
113 u8x16_t src20 = *reinterpret_cast<u8x16_t*>(&src2[0]);
114 u8x16_t src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
116 // This inner loop is SIMD - each loop iteration will process 16 cells.
117 for (; (x + 15) < (kWidth - 1); x += 16) {
118 // Construct jittered source temps, using __builtin_shufflevector(..) to
119 // extract a shifted 16 element vector from the 32 element concatenation
120 // of two source vectors. Suffix j0/j1/j2 is the shift in cells (0, 1, 2).
121 u8x16_t src0j0 = src00;
122 u8x16_t src0j1 = __builtin_shufflevector(src00, src01,
123 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
124 u8x16_t src0j2 = __builtin_shufflevector(src00, src01,
125 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
126 u8x16_t src1j0 = src10;
127 u8x16_t src1j1 = __builtin_shufflevector(src10, src11,
128 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
129 u8x16_t src1j2 = __builtin_shufflevector(src10, src11,
130 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
131 u8x16_t src2j0 = src20;
132 u8x16_t src2j1 = __builtin_shufflevector(src20, src21,
133 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
134 u8x16_t src2j2 = __builtin_shufflevector(src20, src21,
135 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
137 // Sum the jittered sources to construct neighbor count.
138 u8x16_t count = src0j0 + src0j1 + src0j2 +
140 src2j0 + src2j1 + src2j2;
141 // Double the sum and add the center cell (src1j1), lane by lane.
142 count = count + count + src1j1;
143 // If count > 4 and < 8, center cell will be alive in the next frame.
// Each vector comparison yields all-ones in a lane where it holds, else zero.
144 u8x16_t alive1 = count > kFour;
145 u8x16_t alive2 = count < kEight;
146 // Intersect the two comparisons from above.
147 u8x16_t alive = alive1 & alive2;
149 // Convert alive mask to 1 or 0 and store in destination cell array.
150 *reinterpret_cast<u8x16_t*>(dst) = alive & kOne;
152 // Increment pointers.
158 // Shift source over by 16 cells and read the next 16 cells.
160 src01 = *reinterpret_cast<u8x16_t*>(&src0[16]);
162 src11 = *reinterpret_cast<u8x16_t*>(&src1[16]);
164 src21 = *reinterpret_cast<u8x16_t*>(&src2[16]);
168 // The SIMD loop above does 16 cells at a time. The loop below is the
169 // regular version which processes one cell at a time. It is used to
170 // finish the remainder of the scanline not handled by the SIMD loop.
171 for (; x < (kWidth - 1); ++x) {
172 // Sum the eight neighbors. The lone unary '+' on the middle row marks the
// slot of the center cell src1[1], which is deliberately left out here.
173 int count = src0[0] + src0[1] + src0[2] +
174 src1[0] + + src1[2] +
175 src2[0] + src2[1] + src2[2];
176 // Double the neighbor sum and add the center cell to form the table index.
177 count = count + count + src1[1];
178 // Use table lookup indexed by count to determine pixel & alive state.
179 *dst++ = kIsAlive[count];
186 // Static entry point for worker threads: thiz is the Life instance and
// slice is the row index forwarded to its wSimulate(). SimulateFrame() also
// calls this directly when it simulates rows on its own thread.
187 void Life::wSimulateEntry(int slice, void* thiz) {
188 static_cast<Life*>(thiz)->wSimulate(slice);
// Advances the simulation by one frame: every row index from 0 to kHeight - 1
// is handed to wSimulateEntry(), then the buffers are swapped so the frame
// just written becomes the input of the next call.
191 void Life::SimulateFrame() {
193 // If multi-threading enabled, dispatch tasks to pool of worker threads.
194 workers_->Dispatch(kHeight, wSimulateEntry, this);
196 // Else manually simulate each line on this thread.
197 for (int y = 0; y < kHeight; y++) {
198 wSimulateEntry(y, this);
// Exchange the roles of the two buffers; no cell data is copied.
201 std::swap(cell_in_, cell_out_);
// Zero the output buffer, then set every byte of the input buffer (border
// cells included) to a pseudo-random 0 or 1 taken from the low bit of rand().
205 memset(cell_out_, 0, size_);
206 for (size_t index = 0; index < size_; index++) {
207 cell_in_[index] = rand() & 1;
212 // Wrap the Life simulation in the benchmark harness's Benchmark interface.
213 class BenchmarkLife : public Benchmark {
// Number of frames simulated by the loop below.
216 const int kFramesToBenchmark = 100;
218 for (int i = 0; i < kFramesToBenchmark; ++i)
219 life_.SimulateFrame();
220 // TODO(nfullagar): make simulation deterministic & compute a checksum on
221 // the last frame. Return success or failure based on the checksum.
// Name of this benchmark.
224 virtual const std::string Name() { return "Life"; }
// Note string telling which code path (SIMD or scalar) this build measures.
225 virtual const std::string Notes() {
226 #if defined(HAVE_SIMD)
227 return "SIMD version";
229 return "scalar version";
238 // Register an instance of BenchmarkLife in the list of benchmarks to be run.
239 RegisterBenchmark<BenchmarkLife> benchmark_life;