1 // Copyright 2013 Howling Moon Software. All rights reserved.
2 // See http://chipmunk2d.net/legal.php for more information.
7 //TODO: Move all the thread stuff to another file
//#include <sys/param.h>
12 #include <sys/sysctl.h>
17 #elif defined(__MINGW32__)
20 #ifndef WIN32_LEAN_AND_MEAN
21 #define WIN32_LEAN_AND_MEAN
28 #include <process.h> // _beginthreadex
35 // Simple pthread implementation for Windows
36 // Made from scratch to avoid the LGPL licence from pthread-win32
43 typedef HANDLE pthread_t;
46 // Based on http://www.cs.wustl.edu/~schmidt/win32-cv-1.html since Windows has no condition variable until NT6
48 // Count of the number of waiters.
50 CRITICAL_SECTION waiters_count_lock;
51 // Serialize access to <waiters_count_>.
53 HANDLE events[MAX_EVENTS];
55 typedef CRITICAL_SECTION pthread_mutex_t;
57 typedef struct {} pthread_condattr_t; // Dummy;
59 int pthread_cond_destroy(pthread_cond_t* cv)
61 CloseHandle(cv->events[BROADCAST]);
62 CloseHandle(cv->events[SIGNAL]);
64 DeleteCriticalSection(&cv->waiters_count_lock);
69 int pthread_cond_init(pthread_cond_t* cv, const pthread_condattr_t* attr)
71 // Initialize the count to 0.
72 cv->waiters_count = 0;
74 // Create an auto-reset event.
75 cv->events[SIGNAL] = CreateEvent(NULL, // no security
76 FALSE, // auto-reset event
77 FALSE, // non-signaled initially
80 // Create a manual-reset event.
81 cv->events[BROADCAST] = CreateEvent(NULL, // no security
83 FALSE, // non-signaled initially
86 InitializeCriticalSection(&cv->waiters_count_lock);
91 int pthread_cond_broadcast(pthread_cond_t *cv)
93 // Avoid race conditions.
94 EnterCriticalSection(&cv->waiters_count_lock);
95 int have_waiters = cv->waiters_count > 0;
96 LeaveCriticalSection(&cv->waiters_count_lock);
99 SetEvent(cv->events[BROADCAST]);
104 int pthread_cond_signal(pthread_cond_t* cv)
106 // Avoid race conditions.
107 EnterCriticalSection(&cv->waiters_count_lock);
108 int have_waiters = cv->waiters_count > 0;
109 LeaveCriticalSection(&cv->waiters_count_lock);
112 SetEvent(cv->events[SIGNAL]);
117 int pthread_cond_wait(pthread_cond_t* cv, pthread_mutex_t* external_mutex)
119 // Avoid race conditions.
120 EnterCriticalSection(&cv->waiters_count_lock);
122 LeaveCriticalSection(&cv->waiters_count_lock);
124 // It's ok to release the <external_mutex> here since Win32
125 // manual-reset events maintain state when used with
126 // <SetEvent>. This avoids the "lost wakeup" bug...
127 LeaveCriticalSection(external_mutex);
129 // Wait for either event to become signaled due to <pthread_cond_signal>
130 // being called or <pthread_cond_broadcast> being called.
131 int result = WaitForMultipleObjects(2, cv->events, FALSE, INFINITE);
133 EnterCriticalSection(&cv->waiters_count_lock);
136 result == WAIT_OBJECT_0 + BROADCAST
137 && cv->waiters_count == 0;
138 LeaveCriticalSection(&cv->waiters_count_lock);
140 // Some thread called <pthread_cond_broadcast>.
142 // We're the last waiter to be notified or to stop waiting, so
143 // reset the manual event.
144 ResetEvent(cv->events[BROADCAST]);
146 // Reacquire the <external_mutex>.
147 EnterCriticalSection(external_mutex);
149 return result == WAIT_TIMEOUT ? ETIMEDOUT : 0;
152 typedef struct {} pthread_mutexattr_t; //< Dummy
154 int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr)
156 InitializeCriticalSection(mutex);
160 int pthread_mutex_destroy(pthread_mutex_t* mutex)
162 DeleteCriticalSection(mutex);
166 int pthread_mutex_lock(pthread_mutex_t* mutex)
168 EnterCriticalSection(mutex);
172 int pthread_mutex_unlock(pthread_mutex_t* mutex)
174 LeaveCriticalSection(mutex);
typedef struct {} pthread_attr_t; ///< Dummy

// Trampoline data handed to ThreadProc(): the user's start routine and its
// argument. Heap-allocated by pthread_create() and freed by ThreadProc()
// once the routine returns.
typedef struct {
	void *(*start_routine) (void *);
	void *arg;
} pthread_internal_thread;
186 unsigned int __stdcall ThreadProc(void* userdata)
188 pthread_internal_thread* ud = (pthread_internal_thread*) userdata;
189 ud->start_routine(ud->arg);
196 int pthread_create(pthread_t* thread, const pthread_attr_t* attr, void *(*start_routine) (void *), void *arg)
198 pthread_internal_thread* ud = (pthread_internal_thread*) malloc(sizeof(pthread_internal_thread));
199 ud->start_routine = start_routine;
202 *thread = (HANDLE) (_beginthreadex(NULL, 0, &ThreadProc, ud, 0, NULL));
209 int pthread_join(pthread_t thread, void **value_ptr)
211 WaitForSingleObject(thread, INFINITE);
219 #include "chipmunk/chipmunk_private.h"
220 #include "chipmunk/cpHastySpace.h"
223 //MARK: ARM NEON Solver
226 #include <arm_neon.h>
228 // Tested and known to work fine with Clang 3.0 and GCC 4.2
229 // Doesn't work with Clang 1.6, and I have no idea why.
230 #if defined(__clang_major__) && __clang_major__ < 3
231 #error Compiler not supported.
236 #error Cannot use CP_USE_DOUBLES on 32 bit ARM.
239 typedef float64_t cpFloat_t;
240 typedef float64x2_t cpFloatx2_t;
241 #define vld vld1q_f64
242 #define vdup_n vdupq_n_f64
243 #define vst vst1q_f64
244 #define vst_lane vst1q_lane_f64
245 #define vadd vaddq_f64
246 #define vsub vsubq_f64
247 #define vpadd vpaddq_f64
248 #define vmul vmulq_f64
249 #define vmul_n vmulq_n_f64
250 #define vneg vnegq_f64
251 #define vget_lane vgetq_lane_f64
252 #define vset_lane vsetq_lane_f64
253 #define vmin vminq_f64
254 #define vmax vmaxq_f64
255 #define vrev(__a) __builtin_shufflevector(__a, __a, 1, 0)
257 typedef float32_t cpFloat_t;
258 typedef float32x2_t cpFloatx2_t;
260 #define vdup_n vdup_n_f32
262 #define vst_lane vst1_lane_f32
263 #define vadd vadd_f32
264 #define vsub vsub_f32
265 #define vpadd vpadd_f32
266 #define vmul vmul_f32
267 #define vmul_n vmul_n_f32
268 #define vneg vneg_f32
269 #define vget_lane vget_lane_f32
270 #define vset_lane vset_lane_f32
271 #define vmin vmin_f32
272 #define vmax vmax_f32
273 #define vrev vrev64_f32
276 // TODO could probably do better here, maybe using vcreate?
277 // especially for the constants
278 // Maybe use the {} notation for GCC/Clang?
279 static inline cpFloatx2_t
280 vmake(cpFloat_t x, cpFloat_t y)
282 // cpFloatx2_t v = {};
283 // v = vset_lane(x, v, 0);
284 // v = vset_lane(y, v, 1);
288 // This might not be super compatible, but all the NEON headers use it...
289 return (cpFloatx2_t){x, y};
// NEON-vectorized contact impulse solver for one arbiter: applies one solver
// iteration for every contact, updating the velocities and bias velocities
// of both bodies plus the impulses accumulated on each contact.
// Lanes pack either a 2D vector (x, y) or a per-body pair (body_a, body_b).
cpArbiterApplyImpulse_NEON(cpArbiter *arb)
	cpBody *a = arb->body_a;
	cpBody *b = arb->body_b;
	// NOTE(review): vld() reads cpVect fields through a cpFloat_t pointer —
	// assumes cpVect is exactly two tightly-packed cpFloats; confirm layout.
	cpFloatx2_t surface_vr = vld((cpFloat_t *)&arb->surface_vr);
	cpFloatx2_t n = vld((cpFloat_t *)&arb->n);
	cpFloat_t friction = arb->u;
	
	int numContacts = arb->count;
	struct cpContact *contacts = arb->contacts;
	for(int i=0; i<numContacts; i++){
		struct cpContact *con = contacts + i;
		// Contact offsets from each body's center.
		cpFloatx2_t r1 = vld((cpFloat_t *)&con->r1);
		cpFloatx2_t r2 = vld((cpFloat_t *)&con->r2);
		
		// vrev + (-1, 1) multiply computes perp(v) = (-v.y, v.x).
		cpFloatx2_t perp = vmake(-1.0, 1.0);
		cpFloatx2_t r1p = vmul(vrev(r1), perp);
		cpFloatx2_t r2p = vmul(vrev(r2), perp);
		
		// Relative bias velocity at the contact point.
		cpFloatx2_t vBias_a = vld((cpFloat_t *)&a->v_bias);
		cpFloatx2_t vBias_b = vld((cpFloat_t *)&b->v_bias);
		cpFloatx2_t wBias = vmake(a->w_bias, b->w_bias);
		
		cpFloatx2_t vb1 = vadd(vBias_a, vmul_n(r1p, vget_lane(wBias, 0)));
		cpFloatx2_t vb2 = vadd(vBias_b, vmul_n(r2p, vget_lane(wBias, 1)));
		cpFloatx2_t vbr = vsub(vb2, vb1);
		
		// Relative velocity at the contact point.
		cpFloatx2_t v_a = vld((cpFloat_t *)&a->v);
		cpFloatx2_t v_b = vld((cpFloat_t *)&b->v);
		cpFloatx2_t w = vmake(a->w, b->w);
		cpFloatx2_t v1 = vadd(v_a, vmul_n(r1p, vget_lane(w, 0)));
		cpFloatx2_t v2 = vadd(v_b, vmul_n(r2p, vget_lane(w, 1)));
		cpFloatx2_t vr = vsub(v2, v1);
		
		// Pairwise add yields (dot(vbr, n), dot(vr, n)) in one vector:
		// normal components of the bias and real relative velocities.
		cpFloatx2_t vbn_vrn = vpadd(vmul(vbr, n), vmul(vr, n));
		
		// Solve the bias (penetration) and normal impulses together,
		// clamping the accumulated impulses to be non-negative.
		cpFloatx2_t v_offset = vmake(con->bias, -con->bounce);
		cpFloatx2_t jOld = vmake(con->jBias, con->jnAcc);
		cpFloatx2_t jbn_jn = vmul_n(vsub(v_offset, vbn_vrn), con->nMass);
		jbn_jn = vmax(vadd(jOld, jbn_jn), vdup_n(0.0));
		cpFloatx2_t jApply = vsub(jbn_jn, jOld);
		
		// Tangent (friction) direction and relative tangential velocity
		// (including the arbiter's surface velocity).
		cpFloatx2_t t = vmul(vrev(n), perp);
		cpFloatx2_t vrt_tmp = vmul(vadd(vr, surface_vr), t);
		cpFloatx2_t vrt = vpadd(vrt_tmp, vrt_tmp);
		
		// Friction impulse, clamped to the friction cone [-jtMax, jtMax].
		cpFloatx2_t jtOld = {}; jtOld = vset_lane(con->jtAcc, jtOld, 0);
		cpFloatx2_t jtMax = vrev(vmul_n(jbn_jn, friction));
		cpFloatx2_t jt = vmul_n(vrt, -con->tMass);
		jt = vmax(vneg(jtMax), vmin(vadd(jtOld, jt), jtMax));
		cpFloatx2_t jtApply = vsub(jt, jtOld);
		
		// i_inv lane 0 is negated so a single add applies equal-and-opposite
		// angular impulses to both bodies.
		cpFloatx2_t i_inv = vmake(-a->i_inv, b->i_inv);
		cpFloatx2_t nperp = vmake(1.0, -1.0);
		
		// Apply the bias impulse delta to both bodies.
		cpFloatx2_t jBias = vmul_n(n, vget_lane(jApply, 0));
		cpFloatx2_t jBiasCross = vmul(vrev(jBias), nperp);
		cpFloatx2_t biasCrosses = vpadd(vmul(r1, jBiasCross), vmul(r2, jBiasCross));
		wBias = vadd(wBias, vmul(i_inv, biasCrosses));
		
		vBias_a = vsub(vBias_a, vmul_n(jBias, a->m_inv));
		vBias_b = vadd(vBias_b, vmul_n(jBias, b->m_inv));
		
		// Apply the combined normal + friction impulse delta.
		cpFloatx2_t j = vadd(vmul_n(n, vget_lane(jApply, 1)), vmul_n(t, vget_lane(jtApply, 0)));
		cpFloatx2_t jCross = vmul(vrev(j), nperp);
		cpFloatx2_t crosses = vpadd(vmul(r1, jCross), vmul(r2, jCross));
		w = vadd(w, vmul(i_inv, crosses));
		
		v_a = vsub(v_a, vmul_n(j, a->m_inv));
		v_b = vadd(v_b, vmul_n(j, b->m_inv));
		
		// TODO would moving these earlier help pipeline them better?
		// Write back updated body state...
		vst((cpFloat_t *)&a->v_bias, vBias_a);
		vst((cpFloat_t *)&b->v_bias, vBias_b);
		vst_lane((cpFloat_t *)&a->w_bias, wBias, 0);
		vst_lane((cpFloat_t *)&b->w_bias, wBias, 1);
		
		vst((cpFloat_t *)&a->v, v_a);
		vst((cpFloat_t *)&b->v, v_b);
		vst_lane((cpFloat_t *)&a->w, w, 0);
		vst_lane((cpFloat_t *)&b->w, w, 1);
		
		// ...and the accumulated impulses cached on the contact.
		vst_lane((cpFloat_t *)&con->jBias, jbn_jn, 0);
		vst_lane((cpFloat_t *)&con->jnAcc, jbn_jn, 1);
		vst_lane((cpFloat_t *)&con->jtAcc, jt, 0);
// Right now using more than 2 threads probably won't help your performance any.
// If you are using a ridiculous number of iterations it could help though.
#define MAX_THREADS 2

// Per-worker bookkeeping handed to WorkerThreadLoop().
struct ThreadContext {
	// 1-based worker index (0 is reserved for the main thread).
	unsigned long thread_num;

// Work function every worker (and the main thread) invokes each step.
typedef void (*cpHastySpaceWorkFunction)(cpSpace *space, unsigned long worker, unsigned long worker_count);

// A cpSpace variant that runs the impulse solver across worker threads.
struct cpHastySpace {
	// Number of worker threads (including the main thread)
	unsigned long num_threads;
	
	// Number of worker threads currently executing. (also including the main thread)
	unsigned long num_working;
	
	// Number of constraints (plus contacts) that must exist per step to start the worker threads.
	unsigned long constraint_count_threshold;
	
	// Guard the worker handshake state below.
	pthread_mutex_t mutex;
	// cond_work wakes the workers; cond_resume wakes the waiting main thread.
	pthread_cond_t cond_work, cond_resume;
	
	// Work function to invoke.
	cpHastySpaceWorkFunction work;
	
	// Contexts for the spawned workers (the main thread has no entry here).
	struct ThreadContext workers[MAX_THREADS - 1];
419 WorkerThreadLoop(struct ThreadContext *context)
421 cpHastySpace *hasty = context->space;
423 unsigned long thread = context->thread_num;
424 unsigned long num_threads = hasty->num_threads;
427 pthread_mutex_lock(&hasty->mutex); {
428 if(--hasty->num_working == 0){
429 pthread_cond_signal(&hasty->cond_resume);
432 pthread_cond_wait(&hasty->cond_work, &hasty->mutex);
433 } pthread_mutex_unlock(&hasty->mutex);
435 cpHastySpaceWorkFunction func = hasty->work;
437 hasty->work(&hasty->space, thread, num_threads);
447 RunWorkers(cpHastySpace *hasty, cpHastySpaceWorkFunction func)
449 hasty->num_working = hasty->num_threads - 1;
452 if(hasty->num_working > 0){
453 pthread_mutex_lock(&hasty->mutex); {
454 pthread_cond_broadcast(&hasty->cond_work);
455 } pthread_mutex_unlock(&hasty->mutex);
457 func((cpSpace *)hasty, 0, hasty->num_threads);
459 pthread_mutex_lock(&hasty->mutex); {
460 if(hasty->num_working > 0){
461 pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
463 } pthread_mutex_unlock(&hasty->mutex);
465 func((cpSpace *)hasty, 0, hasty->num_threads);
472 Solver(cpSpace *space, unsigned long worker, unsigned long worker_count)
474 cpArray *constraints = space->constraints;
475 cpArray *arbiters = space->arbiters;
477 cpFloat dt = space->curr_dt;
478 unsigned long iterations = (space->iterations + worker_count - 1)/worker_count;
480 for(unsigned long i=0; i<iterations; i++){
481 for(int j=0; j<arbiters->num; j++){
482 cpArbiter *arb = (cpArbiter *)arbiters->arr[j];
484 cpArbiterApplyImpulse_NEON(arb);
486 cpArbiterApplyImpulse(arb);
490 for(int j=0; j<constraints->num; j++){
491 cpConstraint *constraint = (cpConstraint *)constraints->arr[j];
492 constraint->klass->applyImpulse(constraint, dt);
497 //MARK: Thread Management Functions
500 HaltThreads(cpHastySpace *hasty)
502 pthread_mutex_t *mutex = &hasty->mutex;
503 pthread_mutex_lock(mutex); {
504 hasty->work = NULL; // NULL work function means break and exit
505 pthread_cond_broadcast(&hasty->cond_work);
506 } pthread_mutex_unlock(mutex);
508 for(unsigned long i=0; i<(hasty->num_threads-1); i++){
509 pthread_join(hasty->workers[i].thread, NULL);
514 cpHastySpaceSetThreads(cpSpace *space, unsigned long threads)
516 #if TARGET_IPHONE_SIMULATOR == 1
517 // Individual values appear to be written non-atomically when compiled as debug for the simulator.
518 // No idea why, so threads are disabled.
522 cpHastySpace *hasty = (cpHastySpace *)space;
527 size_t size = sizeof(threads);
528 sysctlbyname("hw.ncpu", &threads, &size, NULL, 0);
531 if(threads == 0) threads = 1;
534 hasty->num_threads = (threads < MAX_THREADS ? threads : MAX_THREADS);
535 hasty->num_working = hasty->num_threads - 1;
537 // Create the worker threads and wait for them to signal ready.
538 if(hasty->num_working > 0){
539 pthread_mutex_lock(&hasty->mutex);
540 for(unsigned long i=0; i<(hasty->num_threads-1); i++){
541 hasty->workers[i].space = hasty;
542 hasty->workers[i].thread_num = i + 1;
544 pthread_create(&hasty->workers[i].thread, NULL, (void*(*)(void*))WorkerThreadLoop, &hasty->workers[i]);
547 pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
548 pthread_mutex_unlock(&hasty->mutex);
553 cpHastySpaceGetThreads(cpSpace *space)
555 return ((cpHastySpace *)space)->num_threads;
558 //MARK: Overriden cpSpace Functions.
561 cpHastySpaceNew(void)
563 cpHastySpace *hasty = (cpHastySpace *)cpcalloc(1, sizeof(cpHastySpace));
564 cpSpaceInit((cpSpace *)hasty);
566 pthread_mutex_init(&hasty->mutex, NULL);
567 pthread_cond_init(&hasty->cond_work, NULL);
568 pthread_cond_init(&hasty->cond_resume, NULL);
570 // TODO magic number, should test this more thoroughly.
571 hasty->constraint_count_threshold = 50;
573 // Default to 1 thread for determinism.
574 hasty->num_threads = 1;
575 cpHastySpaceSetThreads((cpSpace *)hasty, 1);
577 return (cpSpace *)hasty;
581 cpHastySpaceFree(cpSpace *space)
583 cpHastySpace *hasty = (cpHastySpace *)space;
587 pthread_mutex_destroy(&hasty->mutex);
588 pthread_cond_destroy(&hasty->cond_work);
589 pthread_cond_destroy(&hasty->cond_resume);
// Threaded replacement for cpSpaceStep(): same pipeline (integrate
// positions, collide, pre-step, integrate velocities, apply cached
// impulses, solve, post-solve callbacks), but the impulse solver is handed
// to the worker pool when the arbiter+constraint count crosses
// constraint_count_threshold.
cpHastySpaceStep(cpSpace *space, cpFloat dt)
	// don't step if the timestep is 0!
	if(dt == 0.0f) return;
	
	// Keep the previous timestep for rescaling cached impulses below.
	cpFloat prev_dt = space->curr_dt;
	
	cpArray *bodies = space->dynamicBodies;
	cpArray *constraints = space->constraints;
	cpArray *arbiters = space->arbiters;
	
	// Reset and empty the arbiter list.
	for(int i=0; i<arbiters->num; i++){
		cpArbiter *arb = (cpArbiter *)arbiters->arr[i];
		arb->state = CP_ARBITER_STATE_NORMAL;
		
		// If both bodies are awake, unthread the arbiter from the contact graph.
		if(!cpBodyIsSleeping(arb->body_a) && !cpBodyIsSleeping(arb->body_b)){
			cpArbiterUnthread(arb);
		
	cpSpaceLock(space); {
		// Integrate positions
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->position_func(body, dt);
		
		// Find colliding pairs.
		cpSpacePushFreshContactBuffer(space);
		cpSpatialIndexEach(space->dynamicShapes, (cpSpatialIndexIteratorFunc)cpShapeUpdateFunc, NULL);
		cpSpatialIndexReindexQuery(space->dynamicShapes, (cpSpatialIndexQueryFunc)cpSpaceCollideShapes, space);
	} cpSpaceUnlock(space, cpFalse);
	
	// Rebuild the contact graph (and detect sleeping components if sleeping is enabled)
	cpSpaceProcessComponents(space, dt);
	
	cpSpaceLock(space); {
		// Clear out old cached arbiters and call separate callbacks
		cpHashSetFilter(space->cachedArbiters, (cpHashSetFilterFunc)cpSpaceArbiterSetFilter, space);
		
		// Prestep the arbiters and constraints.
		cpFloat slop = space->collisionSlop;
		cpFloat biasCoef = 1.0f - cpfpow(space->collisionBias, dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterPreStep((cpArbiter *)arbiters->arr[i], dt, slop, biasCoef);
		
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			// Run the user's pre-solve callback (if any) before pre-stepping.
			cpConstraintPreSolveFunc preSolve = constraint->preSolve;
			if(preSolve) preSolve(constraint, space);
			
			constraint->klass->preStep(constraint, dt);
		
		// Integrate velocities.
		cpFloat damping = cpfpow(space->damping, dt);
		cpVect gravity = space->gravity;
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->velocity_func(body, gravity, damping, dt);
		
		// Apply cached impulses
		// (rescaled by dt/prev_dt to handle variable timesteps; 0 on the
		// very first step since there is nothing cached yet).
		cpFloat dt_coef = (prev_dt == 0.0f ? 0.0f : dt/prev_dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterApplyCachedImpulse((cpArbiter *)arbiters->arr[i], dt_coef);
		
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			constraint->klass->applyCachedImpulse(constraint, dt_coef);
		
		// Run the impulse solver.
		// Only fire up the worker pool when there is enough solver work to
		// amortize the wake/sleep handshake.
		cpHastySpace *hasty = (cpHastySpace *)space;
		if((unsigned long)(arbiters->num + constraints->num) > hasty->constraint_count_threshold){
			RunWorkers(hasty, Solver);
		
		// Run the constraint post-solve callbacks
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			cpConstraintPostSolveFunc postSolve = constraint->postSolve;
			if(postSolve) postSolve(constraint, space);
		
		// run the post-solve callbacks
		for(int i=0; i<arbiters->num; i++){
			cpArbiter *arb = (cpArbiter *) arbiters->arr[i];
			
			cpCollisionHandler *handler = arb->handler;
			handler->postSolveFunc(arb, space, handler->userData);
	} cpSpaceUnlock(space, cpTrue);