1 // Copyright 2013 Howling Moon Software. All rights reserved.
2 // See http://chipmunk2d.net/legal.php for more information.
7 //TODO: Move all the thread stuff to another file
//#include <sys/param.h>
12 #include <sys/sysctl.h>
17 #elif defined(__MINGW32__)
20 #ifndef WIN32_LEAN_AND_MEAN
21 #define WIN32_LEAN_AND_MEAN
28 #include <process.h> // _beginthreadex
35 // Simple pthread implementation for Windows
36 // Made from scratch to avoid the LGPL licence from pthread-win32
43 typedef HANDLE pthread_t;
46 // Based on http://www.cs.wustl.edu/~schmidt/win32-cv-1.html since Windows has no condition variable until NT6
48 // Count of the number of waiters.
50 CRITICAL_SECTION waiters_count_lock;
51 // Serialize access to <waiters_count_>.
53 HANDLE events[MAX_EVENTS];
55 typedef CRITICAL_SECTION pthread_mutex_t;
57 typedef struct {} pthread_condattr_t; // Dummy;
59 int pthread_cond_destroy(pthread_cond_t* cv)
61 CloseHandle(cv->events[BROADCAST]);
62 CloseHandle(cv->events[SIGNAL]);
64 DeleteCriticalSection(&cv->waiters_count_lock);
69 int pthread_cond_init(pthread_cond_t* cv, const pthread_condattr_t* attr)
71 // Initialize the count to 0.
72 cv->waiters_count = 0;
74 // Create an auto-reset event.
75 cv->events[SIGNAL] = CreateEvent(NULL, // no security
76 FALSE, // auto-reset event
77 FALSE, // non-signaled initially
80 // Create a manual-reset event.
81 cv->events[BROADCAST] = CreateEvent(NULL, // no security
83 FALSE, // non-signaled initially
86 InitializeCriticalSection(&cv->waiters_count_lock);
91 int pthread_cond_broadcast(pthread_cond_t *cv)
93 // Avoid race conditions.
94 EnterCriticalSection(&cv->waiters_count_lock);
95 int have_waiters = cv->waiters_count > 0;
96 LeaveCriticalSection(&cv->waiters_count_lock);
99 SetEvent(cv->events[BROADCAST]);
104 int pthread_cond_signal(pthread_cond_t* cv)
106 // Avoid race conditions.
107 EnterCriticalSection(&cv->waiters_count_lock);
108 int have_waiters = cv->waiters_count > 0;
109 LeaveCriticalSection(&cv->waiters_count_lock);
112 SetEvent(cv->events[SIGNAL]);
117 int pthread_cond_wait(pthread_cond_t* cv, pthread_mutex_t* external_mutex)
119 // Avoid race conditions.
120 EnterCriticalSection(&cv->waiters_count_lock);
122 LeaveCriticalSection(&cv->waiters_count_lock);
124 // It's ok to release the <external_mutex> here since Win32
125 // manual-reset events maintain state when used with
126 // <SetEvent>. This avoids the "lost wakeup" bug...
127 LeaveCriticalSection(external_mutex);
129 // Wait for either event to become signaled due to <pthread_cond_signal>
130 // being called or <pthread_cond_broadcast> being called.
131 int result = WaitForMultipleObjects(2, cv->events, FALSE, INFINITE);
133 EnterCriticalSection(&cv->waiters_count_lock);
136 result == WAIT_OBJECT_0 + BROADCAST
137 && cv->waiters_count == 0;
138 LeaveCriticalSection(&cv->waiters_count_lock);
140 // Some thread called <pthread_cond_broadcast>.
142 // We're the last waiter to be notified or to stop waiting, so
143 // reset the manual event.
144 ResetEvent(cv->events[BROADCAST]);
146 // Reacquire the <external_mutex>.
147 EnterCriticalSection(external_mutex);
149 return result == WAIT_TIMEOUT ? ETIMEDOUT : 0;
152 typedef struct {} pthread_mutexattr_t; //< Dummy
154 int pthread_mutex_init(pthread_mutex_t* mutex, const pthread_mutexattr_t* attr)
156 InitializeCriticalSection(mutex);
160 int pthread_mutex_destroy(pthread_mutex_t* mutex)
162 DeleteCriticalSection(mutex);
166 int pthread_mutex_lock(pthread_mutex_t* mutex)
168 EnterCriticalSection(mutex);
172 int pthread_mutex_unlock(pthread_mutex_t* mutex)
174 LeaveCriticalSection(mutex);
typedef struct {} pthread_attr_t; ///< Dummy

// Trampoline data handed to ThreadProc(): the user's start routine and its
// argument. Heap-allocated by pthread_create() and freed by ThreadProc()
// once the routine returns.
typedef struct {
	void *(*start_routine) (void *);
	void *arg;
} pthread_internal_thread;
186 unsigned int __stdcall ThreadProc(void* userdata)
188 pthread_internal_thread* ud = (pthread_internal_thread*) userdata;
189 ud->start_routine(ud->arg);
196 int pthread_create(pthread_t* thread, const pthread_attr_t* attr, void *(*start_routine) (void *), void *arg)
198 pthread_internal_thread* ud = (pthread_internal_thread*) malloc(sizeof(pthread_internal_thread));
199 ud->start_routine = start_routine;
202 *thread = (HANDLE) (_beginthreadex(NULL, 0, &ThreadProc, ud, 0, NULL));
209 int pthread_join(pthread_t thread, void **value_ptr)
211 WaitForSingleObject(thread, INFINITE);
219 #include "chipmunk/chipmunk_private.h"
220 #include "chipmunk/cpHastySpace.h"
223 //MARK: ARM NEON Solver
226 #include <arm_neon.h>
228 // Tested and known to work fine with Clang 3.0 and GCC 4.2
229 // Doesn't work with Clang 1.6, and I have no idea why.
230 #if defined(__clang_major__) && __clang_major__ < 3
231 #error Compiler not supported.
236 #error Cannot use CP_USE_DOUBLES on 32 bit ARM.
239 typedef float64_t cpFloat_t;
240 typedef float64x2_t cpFloatx2_t;
241 #define vld vld1q_f64
242 #define vdup_n vdupq_n_f64
243 #define vst vst1q_f64
244 #define vst_lane vst1q_lane_f64
245 #define vadd vaddq_f64
246 #define vsub vsubq_f64
247 #define vpadd vpaddq_f64
248 #define vmul vmulq_f64
249 #define vmul_n vmulq_n_f64
250 #define vneg vnegq_f64
251 #define vget_lane vgetq_lane_f64
252 #define vset_lane vsetq_lane_f64
253 #define vmin vminq_f64
254 #define vmax vmaxq_f64
255 #define vrev(__a) __builtin_shufflevector(__a, __a, 1, 0)
257 typedef float32_t cpFloat_t;
258 typedef float32x2_t cpFloatx2_t;
260 #define vdup_n vdup_n_f32
262 #define vst_lane vst1_lane_f32
263 #define vadd vadd_f32
264 #define vsub vsub_f32
265 #define vpadd vpadd_f32
266 #define vmul vmul_f32
267 #define vmul_n vmul_n_f32
268 #define vneg vneg_f32
269 #define vget_lane vget_lane_f32
270 #define vset_lane vset_lane_f32
271 #define vmin vmin_f32
272 #define vmax vmax_f32
273 #define vrev vrev64_f32
276 // TODO could probably do better here, maybe using vcreate?
277 // especially for the constants
278 // Maybe use the {} notation for GCC/Clang?
279 static inline cpFloatx2_t
280 vmake(cpFloat_t x, cpFloat_t y)
282 // cpFloatx2_t v = {};
283 // v = vset_lane(x, v, 0);
284 // v = vset_lane(y, v, 1);
288 // This might not be super compatible, but all the NEON headers use it...
289 return (cpFloatx2_t){x, y};
// NEON-vectorized contact impulse solver for one arbiter: applies one solver
// iteration for every contact, updating the velocities and bias velocities
// of both bodies plus the impulses accumulated on each contact.
// Lanes pack either a 2D vector (x, y) or a per-body pair (body_a, body_b).
cpArbiterApplyImpulse_NEON(cpArbiter *arb)
	cpBody *a = arb->body_a;
	cpBody *b = arb->body_b;
	// NOTE(review): vld() reads cpVect fields through a cpFloat_t pointer —
	// assumes cpVect is exactly two tightly-packed cpFloats; confirm layout.
	cpFloatx2_t surface_vr = vld((cpFloat_t *)&arb->surface_vr);
	cpFloatx2_t n = vld((cpFloat_t *)&arb->n);
	cpFloat_t friction = arb->u;
	
	int numContacts = arb->count;
	struct cpContact *contacts = arb->contacts;
	for(int i=0; i<numContacts; i++){
		struct cpContact *con = contacts + i;
		// Contact offsets from each body's center.
		cpFloatx2_t r1 = vld((cpFloat_t *)&con->r1);
		cpFloatx2_t r2 = vld((cpFloat_t *)&con->r2);
		
		// vrev + (-1, 1) multiply computes perp(v) = (-v.y, v.x).
		cpFloatx2_t perp = vmake(-1.0, 1.0);
		cpFloatx2_t r1p = vmul(vrev(r1), perp);
		cpFloatx2_t r2p = vmul(vrev(r2), perp);
		
		// Relative bias velocity at the contact point.
		cpFloatx2_t vBias_a = vld((cpFloat_t *)&a->v_bias);
		cpFloatx2_t vBias_b = vld((cpFloat_t *)&b->v_bias);
		cpFloatx2_t wBias = vmake(a->w_bias, b->w_bias);
		
		cpFloatx2_t vb1 = vadd(vBias_a, vmul_n(r1p, vget_lane(wBias, 0)));
		cpFloatx2_t vb2 = vadd(vBias_b, vmul_n(r2p, vget_lane(wBias, 1)));
		cpFloatx2_t vbr = vsub(vb2, vb1);
		
		// Relative velocity at the contact point.
		cpFloatx2_t v_a = vld((cpFloat_t *)&a->v);
		cpFloatx2_t v_b = vld((cpFloat_t *)&b->v);
		cpFloatx2_t w = vmake(a->w, b->w);
		cpFloatx2_t v1 = vadd(v_a, vmul_n(r1p, vget_lane(w, 0)));
		cpFloatx2_t v2 = vadd(v_b, vmul_n(r2p, vget_lane(w, 1)));
		cpFloatx2_t vr = vsub(v2, v1);
		
		// Pairwise add yields (dot(vbr, n), dot(vr, n)) in one vector:
		// normal components of the bias and real relative velocities.
		cpFloatx2_t vbn_vrn = vpadd(vmul(vbr, n), vmul(vr, n));
		
		// Solve the bias (penetration) and normal impulses together,
		// clamping the accumulated impulses to be non-negative.
		cpFloatx2_t v_offset = vmake(con->bias, -con->bounce);
		cpFloatx2_t jOld = vmake(con->jBias, con->jnAcc);
		cpFloatx2_t jbn_jn = vmul_n(vsub(v_offset, vbn_vrn), con->nMass);
		jbn_jn = vmax(vadd(jOld, jbn_jn), vdup_n(0.0));
		cpFloatx2_t jApply = vsub(jbn_jn, jOld);
		
		// Tangent (friction) direction and relative tangential velocity
		// (including the arbiter's surface velocity).
		cpFloatx2_t t = vmul(vrev(n), perp);
		cpFloatx2_t vrt_tmp = vmul(vadd(vr, surface_vr), t);
		cpFloatx2_t vrt = vpadd(vrt_tmp, vrt_tmp);
		
		// Friction impulse, clamped to the friction cone [-jtMax, jtMax].
		cpFloatx2_t jtOld = {}; jtOld = vset_lane(con->jtAcc, jtOld, 0);
		cpFloatx2_t jtMax = vrev(vmul_n(jbn_jn, friction));
		cpFloatx2_t jt = vmul_n(vrt, -con->tMass);
		jt = vmax(vneg(jtMax), vmin(vadd(jtOld, jt), jtMax));
		cpFloatx2_t jtApply = vsub(jt, jtOld);
		
		// i_inv lane 0 is negated so a single add applies equal-and-opposite
		// angular impulses to both bodies.
		cpFloatx2_t i_inv = vmake(-a->i_inv, b->i_inv);
		cpFloatx2_t nperp = vmake(1.0, -1.0);
		
		// Apply the bias impulse delta to both bodies.
		cpFloatx2_t jBias = vmul_n(n, vget_lane(jApply, 0));
		cpFloatx2_t jBiasCross = vmul(vrev(jBias), nperp);
		cpFloatx2_t biasCrosses = vpadd(vmul(r1, jBiasCross), vmul(r2, jBiasCross));
		wBias = vadd(wBias, vmul(i_inv, biasCrosses));
		
		vBias_a = vsub(vBias_a, vmul_n(jBias, a->m_inv));
		vBias_b = vadd(vBias_b, vmul_n(jBias, b->m_inv));
		
		// Apply the combined normal + friction impulse delta.
		cpFloatx2_t j = vadd(vmul_n(n, vget_lane(jApply, 1)), vmul_n(t, vget_lane(jtApply, 0)));
		cpFloatx2_t jCross = vmul(vrev(j), nperp);
		cpFloatx2_t crosses = vpadd(vmul(r1, jCross), vmul(r2, jCross));
		w = vadd(w, vmul(i_inv, crosses));
		
		v_a = vsub(v_a, vmul_n(j, a->m_inv));
		v_b = vadd(v_b, vmul_n(j, b->m_inv));
		
		// TODO would moving these earlier help pipeline them better?
		// Write back updated body state...
		vst((cpFloat_t *)&a->v_bias, vBias_a);
		vst((cpFloat_t *)&b->v_bias, vBias_b);
		vst_lane((cpFloat_t *)&a->w_bias, wBias, 0);
		vst_lane((cpFloat_t *)&b->w_bias, wBias, 1);
		
		vst((cpFloat_t *)&a->v, v_a);
		vst((cpFloat_t *)&b->v, v_b);
		vst_lane((cpFloat_t *)&a->w, w, 0);
		vst_lane((cpFloat_t *)&b->w, w, 1);
		
		// ...and the accumulated impulses cached on the contact.
		vst_lane((cpFloat_t *)&con->jBias, jbn_jn, 0);
		vst_lane((cpFloat_t *)&con->jnAcc, jbn_jn, 1);
		vst_lane((cpFloat_t *)&con->jtAcc, jt, 0);
// Right now using more than 2 threads probably won't help your performance any.
// If you are using a ridiculous number of iterations it could help though.
#define MAX_THREADS 2

// Per-worker bookkeeping handed to WorkerThreadLoop().
struct ThreadContext {
	// 1-based worker index (0 is reserved for the main thread).
	unsigned long thread_num;

// Work function every worker (and the main thread) invokes each step.
typedef void (*cpHastySpaceWorkFunction)(cpSpace *space, unsigned long worker, unsigned long worker_count);

// A cpSpace variant that runs the impulse solver across worker threads.
struct cpHastySpace {
	// Number of worker threads (including the main thread)
	unsigned long num_threads;
	
	// Number of worker threads currently executing. (also including the main thread)
	unsigned long num_working;
	
	// Number of constraints (plus contacts) that must exist per step to start the worker threads.
	unsigned long constraint_count_threshold;
	
	// Guard the worker handshake state below.
	pthread_mutex_t mutex;
	// cond_work wakes the workers; cond_resume wakes the waiting main thread.
	pthread_cond_t cond_work, cond_resume;
	
	// Work function to invoke.
	cpHastySpaceWorkFunction work;
	
	// Contexts for the spawned workers (the main thread has no entry here).
	struct ThreadContext workers[MAX_THREADS - 1];
419 WorkerThreadLoop(struct ThreadContext *context)
421 cpHastySpace *hasty = context->space;
423 unsigned long thread = context->thread_num;
424 unsigned long num_threads = hasty->num_threads;
427 pthread_mutex_lock(&hasty->mutex); {
428 if(--hasty->num_working == 0){
429 pthread_cond_signal(&hasty->cond_resume);
432 pthread_cond_wait(&hasty->cond_work, &hasty->mutex);
433 } pthread_mutex_unlock(&hasty->mutex);
435 cpHastySpaceWorkFunction func = hasty->work;
437 hasty->work(&hasty->space, thread, num_threads);
447 RunWorkers(cpHastySpace *hasty, cpHastySpaceWorkFunction func)
449 hasty->num_working = hasty->num_threads - 1;
452 if(hasty->num_working > 0){
453 pthread_mutex_lock(&hasty->mutex); {
454 pthread_cond_broadcast(&hasty->cond_work);
455 } pthread_mutex_unlock(&hasty->mutex);
457 func((cpSpace *)hasty, 0, hasty->num_threads);
459 pthread_mutex_lock(&hasty->mutex); {
460 if(hasty->num_working > 0){
461 pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
463 } pthread_mutex_unlock(&hasty->mutex);
465 func((cpSpace *)hasty, 0, hasty->num_threads);
472 Solver(cpSpace *space, unsigned long worker, unsigned long worker_count)
474 cpArray *constraints = space->constraints;
475 cpArray *arbiters = space->arbiters;
477 cpFloat dt = space->curr_dt;
478 unsigned long iterations = (space->iterations + worker_count - 1)/worker_count;
480 for(unsigned long i=0; i<iterations; i++){
481 for(int j=0; j<arbiters->num; j++){
482 cpArbiter *arb = (cpArbiter *)arbiters->arr[j];
484 cpArbiterApplyImpulse_NEON(arb);
486 cpArbiterApplyImpulse(arb);
490 for(int j=0; j<constraints->num; j++){
491 cpConstraint *constraint = (cpConstraint *)constraints->arr[j];
492 constraint->klass->applyImpulse(constraint, dt);
497 //MARK: Thread Management Functions
500 HaltThreads(cpHastySpace *hasty)
502 pthread_mutex_t *mutex = &hasty->mutex;
503 pthread_mutex_lock(mutex); {
504 hasty->work = NULL; // NULL work function means break and exit
505 pthread_cond_broadcast(&hasty->cond_work);
506 } pthread_mutex_unlock(mutex);
508 for(unsigned long i=0; i<(hasty->num_threads-1); i++){
509 pthread_join(hasty->workers[i].thread, NULL);
514 cpHastySpaceSetThreads(cpSpace *space, unsigned long threads)
516 #if TARGET_IPHONE_SIMULATOR == 1
517 // Individual values appear to be written non-atomically when compiled as debug for the simulator.
518 // No idea why, so threads are disabled.
522 cpHastySpace *hasty = (cpHastySpace *)space;
527 size_t size = sizeof(threads);
528 sysctlbyname("hw.ncpu", &threads, &size, NULL, 0);
531 if(threads == 0) threads = 1;
534 hasty->num_threads = (threads < MAX_THREADS ? threads : MAX_THREADS);
535 hasty->num_working = hasty->num_threads - 1;
537 // Create the worker threads and wait for them to signal ready.
538 if(hasty->num_working > 0){
539 pthread_mutex_lock(&hasty->mutex);
540 for(unsigned long i=0; i<(hasty->num_threads-1); i++){
541 hasty->workers[i].space = hasty;
542 hasty->workers[i].thread_num = i + 1;
544 pthread_create(&hasty->workers[i].thread, NULL, (void*(*)(void*))WorkerThreadLoop, &hasty->workers[i]);
547 pthread_cond_wait(&hasty->cond_resume, &hasty->mutex);
548 pthread_mutex_unlock(&hasty->mutex);
553 cpHastySpaceGetThreads(cpSpace *space)
555 return ((cpHastySpace *)space)->num_threads;
558 //MARK: Overriden cpSpace Functions.
561 cpHastySpaceNew(void)
563 cpHastySpace *hasty = (cpHastySpace *)cpcalloc(1, sizeof(cpHastySpace));
564 cpSpaceInit((cpSpace *)hasty);
566 pthread_mutex_init(&hasty->mutex, NULL);
567 pthread_cond_init(&hasty->cond_work, NULL);
568 pthread_cond_init(&hasty->cond_resume, NULL);
570 // TODO magic number, should test this more thoroughly.
571 hasty->constraint_count_threshold = 50;
573 // Default to 1 thread for determinism.
574 hasty->num_threads = 1;
575 cpHastySpaceSetThreads((cpSpace *)hasty, 1);
577 return (cpSpace *)hasty;
581 cpHastySpaceFree(cpSpace *space)
583 cpHastySpace *hasty = (cpHastySpace *)space;
587 pthread_mutex_destroy(&hasty->mutex);
588 pthread_cond_destroy(&hasty->cond_work);
589 pthread_cond_destroy(&hasty->cond_resume);
// Threaded replacement for cpSpaceStep(): same pipeline (integrate
// positions, collide, pre-step, integrate velocities, apply cached
// impulses, solve, post-solve callbacks), but the impulse solver is handed
// to the worker pool when the arbiter+constraint count crosses
// constraint_count_threshold.
cpHastySpaceStep(cpSpace *space, cpFloat dt)
	// don't step if the timestep is 0!
	if(dt == 0.0f) return;
	
	// Keep the previous timestep for rescaling cached impulses below.
	cpFloat prev_dt = space->curr_dt;
	
	cpArray *bodies = space->dynamicBodies;
	cpArray *constraints = space->constraints;
	cpArray *arbiters = space->arbiters;
	
	// Reset and empty the arbiter list.
	for(int i=0; i<arbiters->num; i++){
		cpArbiter *arb = (cpArbiter *)arbiters->arr[i];
		arb->state = CP_ARBITER_STATE_NORMAL;
		
		// If both bodies are awake, unthread the arbiter from the contact graph.
		if(!cpBodyIsSleeping(arb->body_a) && !cpBodyIsSleeping(arb->body_b)){
			cpArbiterUnthread(arb);
		
	cpSpaceLock(space); {
		// Integrate positions
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->position_func(body, dt);
		
		// Find colliding pairs.
		cpSpacePushFreshContactBuffer(space);
		cpSpatialIndexEach(space->dynamicShapes, (cpSpatialIndexIteratorFunc)cpShapeUpdateFunc, NULL);
		cpSpatialIndexReindexQuery(space->dynamicShapes, (cpSpatialIndexQueryFunc)cpSpaceCollideShapes, space);
	} cpSpaceUnlock(space, cpFalse);
	
	// Rebuild the contact graph (and detect sleeping components if sleeping is enabled)
	cpSpaceProcessComponents(space, dt);
	
	cpSpaceLock(space); {
		// Clear out old cached arbiters and call separate callbacks
		cpHashSetFilter(space->cachedArbiters, (cpHashSetFilterFunc)cpSpaceArbiterSetFilter, space);
		
		// Prestep the arbiters and constraints.
		cpFloat slop = space->collisionSlop;
		cpFloat biasCoef = 1.0f - cpfpow(space->collisionBias, dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterPreStep((cpArbiter *)arbiters->arr[i], dt, slop, biasCoef);
		
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			// Run the user's pre-solve callback (if any) before pre-stepping.
			cpConstraintPreSolveFunc preSolve = constraint->preSolve;
			if(preSolve) preSolve(constraint, space);
			
			constraint->klass->preStep(constraint, dt);
		
		// Integrate velocities.
		cpFloat damping = cpfpow(space->damping, dt);
		cpVect gravity = space->gravity;
		for(int i=0; i<bodies->num; i++){
			cpBody *body = (cpBody *)bodies->arr[i];
			body->velocity_func(body, gravity, damping, dt);
		
		// Apply cached impulses
		// (rescaled by dt/prev_dt to handle variable timesteps; 0 on the
		// very first step since there is nothing cached yet).
		cpFloat dt_coef = (prev_dt == 0.0f ? 0.0f : dt/prev_dt);
		for(int i=0; i<arbiters->num; i++){
			cpArbiterApplyCachedImpulse((cpArbiter *)arbiters->arr[i], dt_coef);
		
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			constraint->klass->applyCachedImpulse(constraint, dt_coef);
		
		// Run the impulse solver.
		// Only fire up the worker pool when there is enough solver work to
		// amortize the wake/sleep handshake.
		cpHastySpace *hasty = (cpHastySpace *)space;
		if((unsigned long)(arbiters->num + constraints->num) > hasty->constraint_count_threshold){
			RunWorkers(hasty, Solver);
		
		// Run the constraint post-solve callbacks
		for(int i=0; i<constraints->num; i++){
			cpConstraint *constraint = (cpConstraint *)constraints->arr[i];
			
			cpConstraintPostSolveFunc postSolve = constraint->postSolve;
			if(postSolve) postSolve(constraint, space);
		
		// run the post-solve callbacks
		for(int i=0; i<arbiters->num; i++){
			cpArbiter *arb = (cpArbiter *) arbiters->arr[i];
			
			cpCollisionHandler *handler = arb->handler;
			handler->postSolveFunc(arb, space, handler->userData);
	} cpSpaceUnlock(space, cpTrue);