From: joshualitt <joshualitt@chromium.org>
Date: Thu, 28 Jan 2016 14:26:35 +0000 (-0800)
Subject: Add a background timing thread to kilobench
X-Git-Tag: accepted/tizen/5.0/unified/20181102.025319~129^2~2355
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b35c82dc943073e9945c0beea2d49925b45428dd;p=platform%2Fupstream%2FlibSkiaSharp.git

Add a background timing thread to kilobench

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1612513002

Review URL: https://codereview.chromium.org/1612513002
---

diff --git a/include/gpu/gl/SkGLContext.h b/include/gpu/gl/SkGLContext.h
index 992edf4..ddf5dc0 100644
--- a/include/gpu/gl/SkGLContext.h
+++ b/include/gpu/gl/SkGLContext.h
@@ -85,6 +85,11 @@ public:
 
     class GLFenceSync;  // SkGpuFenceSync implementation that uses the OpenGL functionality.
 
+    /*
+     * returns the fencesync object owned by this SkGLContext
+     */
+    SkGpuFenceSync* fenceSync() { return fFenceSync.get(); }
+
 protected:
     SkGLContext();
 
diff --git a/tools/kilobench/kilobench.cpp b/tools/kilobench/kilobench.cpp
index 8c844f4..1b9cb34 100644
--- a/tools/kilobench/kilobench.cpp
+++ b/tools/kilobench/kilobench.cpp
@@ -14,10 +14,15 @@
 #include "SkStream.h"
 #include "SkSurface.h"
 #include "SkTime.h"
+#include "SkTLList.h"
+#include "SkThreadUtils.h"
 #include "Stats.h"
 #include "Timer.h"
 #include "VisualSKPBench.h"
 #include "gl/GrGLDefines.h"
+#include "../private/SkMutex.h"
+#include "../private/SkSemaphore.h"
+#include "../private/SkGpuFenceSync.h"
 
 // posix only for now
 #include <unistd.h>
@@ -34,7 +39,6 @@
 #include "SkImageDecoder.h"
 __SK_FORCE_IMAGE_DECODER_LINKING;
 
-
 static const int kAutoTuneLoops = 0;
 
 static const int kDefaultLoops =
@@ -68,6 +72,8 @@ DEFINE_int32(maxLoops, 1000000, "Never run a bench more times than this.");
 DEFINE_int32(loops, kDefaultLoops, loops_help_txt().c_str());
 DEFINE_double(gpuMs, 5, "Target bench time in millseconds for GPU.");
 DEFINE_string2(writePath, w, "", "If set, write bitmaps here as .pngs.");
+DEFINE_bool(useBackgroundThread, true, "If false, kilobench will time cpu / gpu work together");
+DEFINE_bool(useMultiProcess, true, "If false, kilobench will run all tests in one process");
 
 static SkString humanize(double ms) {
     return HumanizeMs(ms);
@@ -146,25 +152,29 @@ private:
 
 struct GPUTarget {
     void setup() {
-        this->gl->makeCurrent();
+        fGL->makeCurrent();
         // Make sure we're done with whatever came before.
-        SK_GL(*this->gl, Finish());
+        SK_GL(*fGL, Finish());
     }
 
     SkCanvas* beginTiming(SkCanvas* canvas) { return canvas; }
 
-    void endTiming() {
-        if (this->gl) {
-            SK_GL(*this->gl, Flush());
-            this->gl->swapBuffers();
+    void endTiming(bool usePlatformSwapBuffers) {
+        if (fGL) {
+            SK_GL(*fGL, Flush());
+            if (usePlatformSwapBuffers) {
+                fGL->swapBuffers();
+            } else {
+                fGL->waitOnSyncOrSwap();
+            }
         }
     }
-    void fence() {
-        SK_GL(*this->gl, Finish());
+    void finish() {
+        SK_GL(*fGL, Finish());
     }
 
     bool needsFrameTiming(int* maxFrameLag) const {
-        if (!this->gl->getMaxGpuFrameLag(maxFrameLag)) {
+        if (!fGL->getMaxGpuFrameLag(maxFrameLag)) {
             // Frame lag is unknown.
             *maxFrameLag = FLAGS_gpuFrameLag;
         }
@@ -182,24 +192,24 @@ struct GPUTarget {
         uint32_t flags = useDfText ? SkSurfaceProps::kUseDeviceIndependentFonts_Flag :
                                                   0;
         SkSurfaceProps props(flags, SkSurfaceProps::kLegacyFontHost_InitType);
-        this->surface.reset(SkSurface::NewRenderTarget(context,
-                                                       SkSurface::kNo_Budgeted, info,
-                                                       numSamples, &props));
-        this->gl = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
-        if (!this->surface.get()) {
+        fSurface.reset(SkSurface::NewRenderTarget(context,
+                                                  SkSurface::kNo_Budgeted, info,
+                                                  numSamples, &props));
+        fGL = factory->getContextInfo(ctxType, ctxOptions).fGLContext;
+        if (!fSurface.get()) {
             return false;
         }
 
         // Kilobench should only be used on platforms with fence sync support
-        SkASSERT(this->gl->fenceSyncSupport());
+        SkASSERT(fGL->fenceSyncSupport());
         return true;
     }
 
     SkCanvas* getCanvas() const {
-        if (!surface.get()) {
+        if (!fSurface.get()) {
             return nullptr;
         }
-        return surface->getCanvas();
+        return fSurface->getCanvas();
     }
 
     bool capturePixels(SkBitmap* bmp) {
@@ -215,10 +225,11 @@ struct GPUTarget {
         return true;
     }
 
+    SkGLContext* gl() { return fGL; }
+
 private:
-    //const Config config;
-    SkGLContext* gl;
-    SkAutoTDelete<SkSurface> surface;
+    SkGLContext* fGL;
+    SkAutoTDelete<SkSurface> fSurface;
 };
 
 static bool write_canvas_png(GPUTarget* target, const SkString& filename) {
@@ -276,24 +287,159 @@ static int clamp_loops(int loops) {
 }
 
 static double now_ms() { return SkTime::GetNSecs() * 1e-6; }
-static double time(int loops, Benchmark* bench, GPUTarget* target) {
-    SkCanvas* canvas = target->getCanvas();
-    if (canvas) {
-        canvas->clear(SK_ColorWHITE);
+
+struct TimingThread {
+    TimingThread(SkGLContext* mainContext)
+        : fFenceSync(mainContext->fenceSync())
+        ,  fMainContext(mainContext)
+        ,  fDone(false) {}
+
+    static void Loop(void* data) {
+        TimingThread* timingThread = reinterpret_cast<TimingThread*>(data);
+        timingThread->timingLoop();
+    }
+
+    // To ensure waiting for the sync actually does something, we check that the elapsed time
+    // exceeds some small value
+    const double kMinElapsed = 1e-6;
+    bool sanity(double start) const {
+        double elapsed = now_ms() - start;
+        return elapsed > kMinElapsed;
+    }
+
+    void waitFence(SkPlatformGpuFence sync) {
+        SkDEBUGCODE(double start = now_ms());
+        fFenceSync->waitFence(sync, false);
+        SkASSERT(sanity(start));
+    }
+
+    void timingLoop() {
+        // Create a context which shares display lists with the main thread
+        SkAutoTDelete<SkGLContext> glContext(SkCreatePlatformGLContext(kNone_GrGLStandard,
+                                                                       fMainContext));
+        glContext->makeCurrent();
+
+        // Basic timing methodology is:
+        // 1) Wait on semaphore until main thread indicates it's time to start timing the frame
+        // 2) Wait on frame start sync, record time.  This is start of the frame.
+        // 3) Wait on semaphore until main thread indicates it's time to finish timing the frame
+        // 4) Wait on frame end sync, record time.  FrameEndTime - FrameStartTime = frame time
+        // 5) Wait on semaphore until main thread indicates we should time the next frame or quit
+        while (true) {
+            fSemaphore.wait();
+
+            // get start sync
+            SkPlatformGpuFence startSync = this->popStartSync();
+
+            // wait on sync
+            this->waitFence(startSync);
+            double start = kilobench::now_ms();
+
+            // do we want to sleep here?
+            // wait for end sync
+            fSemaphore.wait();
+
+            // get end sync
+            SkPlatformGpuFence endSync = this->popEndSync();
+
+            // wait on sync
+            this->waitFence(endSync);
+            double elapsed = kilobench::now_ms() - start;
+
+            // No mutex needed, client won't touch timings until we're done
+            fTimings.push_back(elapsed);
+
+            // clean up fences
+            fFenceSync->deleteFence(startSync);
+            fFenceSync->deleteFence(endSync);
+
+            fSemaphore.wait();
+            if (this->isDone()) {
+                break;
+            }
+        }
+    }
+
+    void pushStartSync() { this->pushSync(&fFrameStartSyncs, &fFrameStartSyncsMutex); }
+
+    SkPlatformGpuFence popStartSync() {
+        return this->popSync(&fFrameStartSyncs, &fFrameStartSyncsMutex);
+    }
+
+    void pushEndSync() { this->pushSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    SkPlatformGpuFence popEndSync() { return this->popSync(&fFrameEndSyncs, &fFrameEndSyncsMutex); }
+
+    void setDone() {
+        SkAutoMutexAcquire done(fDoneMutex);
+        fDone = true;
+        fSemaphore.signal();
+    }
+
+    typedef SkTLList<SkPlatformGpuFence, 1> SyncQueue;
+
+    void pushSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        *queue->addToHead() = fFenceSync->insertFence();
+        fSemaphore.signal();
+    }
+
+    SkPlatformGpuFence popSync(SyncQueue* queue, SkMutex* mutex) {
+        SkAutoMutexAcquire am(mutex);
+        SkPlatformGpuFence sync = *queue->head();
+        queue->popHead();
+        return sync;
+    }
+
+    bool isDone() {
+        SkAutoMutexAcquire am1(fFrameStartSyncsMutex);
+        SkAutoMutexAcquire done(fDoneMutex);
+        if (fDone && fFrameStartSyncs.isEmpty()) {
+            return true;
+        } else {
+            return false;
+        }
     }
+
+    const SkTArray<double>& timings() const { SkASSERT(fDone); return fTimings; }
+
+private:
+    SkGpuFenceSync* fFenceSync;
+    SkSemaphore fSemaphore;
+    SkMutex fFrameStartSyncsMutex;
+    SyncQueue fFrameStartSyncs;
+    SkMutex fFrameEndSyncsMutex;
+    SyncQueue fFrameEndSyncs;
+    SkTArray<double> fTimings;
+    SkMutex fDoneMutex;
+    SkGLContext* fMainContext;
+    bool fDone;
+};
+
+static double time(int loops, Benchmark* bench, GPUTarget* target, TimingThread* timingThread) {
+    SkCanvas* canvas = target->getCanvas();
+    canvas->clear(SK_ColorWHITE);
     bench->preDraw(canvas);
+
+    if (timingThread) {
+        timingThread->pushStartSync();
+    }
     double start = now_ms();
     canvas = target->beginTiming(canvas);
     bench->draw(loops, canvas);
-    if (canvas) {
-        canvas->flush();
-    }
-    target->endTiming();
+    canvas->flush();
+    target->endTiming(timingThread ? true : false);
+
     double elapsed = now_ms() - start;
+    if (timingThread) {
+        timingThread->pushEndSync();
+        timingThread->setDone();
+    }
     bench->postDraw(canvas);
     return elapsed;
 }
 
+// TODO For now we don't use the background timing thread to tune loops
 static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameLag) {
     // First, figure out how many loops it'll take to get a frame up to FLAGS_gpuMs.
     int loops = bench->calculateLoops(FLAGS_loops);
@@ -310,7 +456,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
             // If the GPU lets frames lag at all, we need to make sure we're timing
             // _this_ round, not still timing last round.
             for (int i = 0; i < maxGpuFrameLag; i++) {
-                elapsed = time(loops, bench, target);
+                elapsed = time(loops, bench, target, nullptr);
             }
         } while (elapsed < FLAGS_gpuMs);
 
@@ -319,7 +465,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
         loops = clamp_loops(loops);
 
         // Make sure we're not still timing our calibration.
-        target->fence();
+        target->finish();
     } else {
         loops = detect_forever_loops(loops);
     }
@@ -327,7 +473,7 @@ static int setup_gpu_bench(GPUTarget* target, Benchmark* bench, int maxGpuFrameL
     // Pretty much the same deal as the calibration: do some warmup to make
     // sure we're timing steady-state pipelined frames.
     for (int i = 0; i < maxGpuFrameLag - 1; i++) {
-        time(loops, bench, target);
+        time(loops, bench, target, nullptr);
     }
 
     return loops;
@@ -351,13 +497,14 @@ struct AutoSetupContextBenchAndTarget {
 
     int getLoops() { return setup_gpu_bench(&fTarget, fBenchmark, fMaxFrameLag); }
 
-    double timeSample(int loops) {
+    double timeSample(int loops, TimingThread* timingThread) {
         for (int i = 0; i < fMaxFrameLag; i++) {
-            time(loops, fBenchmark, &fTarget);
+            time(loops, fBenchmark, &fTarget, timingThread);
         }
 
-        return time(loops, fBenchmark, &fTarget) / loops;
+        return time(loops, fBenchmark, &fTarget, timingThread) / loops;
     }
+
     void teardownBench() { fBenchmark->perCanvasPostDraw(fCanvas); }
 
     SkAutoTDelete<GrContextFactory> fCtxFactory;
@@ -381,9 +528,32 @@ int setup_loops(Benchmark* bench) {
     return loops;
 }
 
-double time_sample(Benchmark* bench, int loops) {
+struct Sample {
+    double fCpu;
+    double fGpu;
+};
+
+Sample time_sample(Benchmark* bench, int loops) {
     AutoSetupContextBenchAndTarget ascbt(bench);
-    double sample = ascbt.timeSample(loops);
+
+    Sample sample;
+    if (FLAGS_useBackgroundThread) {
+        TimingThread timingThread(ascbt.fTarget.gl());
+        SkAutoTDelete<SkThread> nativeThread(new SkThread(TimingThread::Loop, &timingThread));
+        nativeThread->start();
+        sample.fCpu = ascbt.timeSample(loops, &timingThread);
+        nativeThread->join();
+
+        // return the min
+        double min = SK_ScalarMax;
+        for (int i = 0; i < timingThread.timings().count(); i++) {
+            min = SkTMin(min, timingThread.timings()[i]);
+        }
+        sample.fGpu = min;
+    } else {
+        sample.fCpu = ascbt.timeSample(loops, nullptr);
+    }
+
     ascbt.teardownBench();
 
     return sample;
@@ -393,6 +563,24 @@ double time_sample(Benchmark* bench, int loops) {
 
 static const int kOutResultSize = 1024;
 
+void printResult(const SkTArray<double>& samples, int loops, const char* name, const char* mod) {
+    SkString newName(name);
+    newName.appendf("_%s", mod);
+    Stats stats(samples);
+    const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
+    SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
+        , loops
+        , HUMANIZE(stats.min)
+        , HUMANIZE(stats.median)
+        , HUMANIZE(stats.mean)
+        , HUMANIZE(stats.max)
+        , stddev_percent
+        , stats.plot.c_str()
+        , "gpu"
+        , newName.c_str()
+    );
+}
+
 int kilobench_main() {
     kilobench::BenchmarkStream benchStream;
 
@@ -407,60 +595,63 @@ int kilobench_main() {
     while (Benchmark* b = benchStream.next()) {
         SkAutoTDelete<Benchmark> bench(b);
 
-        int loops;
-        SkTArray<double> samples;
+        int loops = 1;
+        SkTArray<double> cpuSamples;
+        SkTArray<double> gpuSamples;
         for (int i = 0; i < FLAGS_samples + 1; i++) {
             // We fork off a new process to setup the grcontext and run the test while we wait
-            int childPid = fork();
-            if (childPid > 0) {
-                char result[kOutResultSize];
-                if (read(descriptors[0], result, kOutResultSize) < 0) {
-                     SkFAIL("Failed to read from pipe\n");
-                }
-
-                // if samples == 0 then parse # of loops
-                // else parse float
-                if (i == 0) {
-                    sscanf(result, "%d", &loops);
+            if (FLAGS_useMultiProcess) {
+                int childPid = fork();
+                if (childPid > 0) {
+                    char result[kOutResultSize];
+                    if (read(descriptors[0], result, kOutResultSize) < 0) {
+                         SkFAIL("Failed to read from pipe\n");
+                    }
+
+                    // if i == 0 then parse # of loops
+                    // else parse the cpu and gpu sample times
+                    if (i == 0) {
+                        sscanf(result, "%d", &loops);
+                    } else {
+                        sscanf(result, "%lf %lf", &cpuSamples.push_back(),
+                                                  &gpuSamples.push_back());
+                    }
+
+                    // wait until exit
+                    int status;
+                    waitpid(childPid, &status, 0);
+                } else if (0 == childPid) {
+                    char result[kOutResultSize];
+                    if (i == 0) {
+                        sprintf(result, "%d", kilobench::setup_loops(bench));
+                    } else {
+                        kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                        sprintf(result, "%lf %lf", sample.fCpu, sample.fGpu);
+                    }
+
+                    // Make sure to write the null terminator
+                    if (write(descriptors[1], result, strlen(result) + 1) < 0) {
+                        SkFAIL("Failed to write to pipe\n");
+                    }
+                    return 0;
                 } else {
-                    sscanf(result, "%lf", &samples.push_back());
+                    SkFAIL("Fork failed\n");
                 }
-
-                // wait until exit
-                int status;
-                waitpid(childPid, &status, 0);
-            } else if (0 == childPid) {
-                char result[kOutResultSize];
+            } else {
                 if (i == 0) {
-                    sprintf(result, "%d", kilobench::setup_loops(bench));
+                    loops = kilobench::setup_loops(bench);
                 } else {
-                    sprintf(result, "%lf", kilobench::time_sample(bench, loops));
+                    kilobench::Sample sample = kilobench::time_sample(bench, loops);
+                    cpuSamples.push_back(sample.fCpu);
+                    gpuSamples.push_back(sample.fGpu);
                 }
-
-                // Make sure to write the null terminator
-                if (write(descriptors[1], result, strlen(result) + 1) < 0) {
-                    SkFAIL("Failed to write to pipe\n");
-                }
-                return 0;
-            } else {
-                SkFAIL("Fork failed\n");
             }
         }
 
-        Stats stats(samples);
-        const double stddev_percent = 100 * sqrt(stats.var) / stats.mean;
-        SkDebugf("%d\t%s\t%s\t%s\t%s\t%.0f%%\t%s\t%s\t%s\n"
-                , loops
-                , HUMANIZE(stats.min)
-                , HUMANIZE(stats.median)
-                , HUMANIZE(stats.mean)
-                , HUMANIZE(stats.max)
-                , stddev_percent
-                , stats.plot.c_str()
-                , "gpu"
-                , bench->getUniqueName()
-                );
-
+        printResult(cpuSamples, loops, bench->getUniqueName(), "cpu");
+        if (FLAGS_useBackgroundThread) {
+            printResult(gpuSamples, loops, bench->getUniqueName(), "gpu");
+        }
     }
     return 0;
 }