In the current code, tiling and bilerp sampling are strongly tied together. They...
authorherb <herb@google.com>
Thu, 21 Jul 2016 20:22:04 +0000 (13:22 -0700)
committerCommit bot <commit-bot@chromium.org>
Thu, 21 Jul 2016 20:22:04 +0000 (13:22 -0700)
This CL has several parts that are intertwined:
* move pin/wrap functionality into BilerpSampler.
* remove the nearest neighbor and bilerp tilers
* create a simplified general tiler
* remove the pipeline virtual calls bilerpEdge and bilerpSpan because everything works of sample points now.
* redo all the bilerp sampling to use the new local to methods to wrap/pin.
* introduce a new medium rate sample that handles spans with 1 < |dx| < 2.

This change improves the performance as displayed below:
Most of top 25 desktop improves or are the same. A few are worse, but close to the noise floor. In addition, this change has about 3% smaller code.

old time     new time   new/old
13274693  8414645  0.633886  top25desk_google_com_search_q_c.skp_1
4946466   3258018  0.658656  top25desk_wordpress.skp_1
6977187   5737584  0.822335  top25desk_youtube_com.skp_1
3770021   3296831  0.874486  top25desk_google_com__hl_en_q_b.skp_1
8890813   8600143  0.967307  top25desk_answers_yahoo_com.skp_1
3178974   3094300  0.973364  top25desk_facebook.skp_1
8871835   8711260  0.981901  top25desk_twitter.skp_1
838509    829290   0.989005  top25desk_blogger.skp_1
2821870   2801111  0.992644  top25desk_plus_google_com_11003.skp_1
511978    509530   0.995219  top25desk_techcrunch_com.skp_1
2408588   2397435  0.995369  top25desk_ebay_com.skp_1
4446919   4448004  1.00024   top25desk_espn.skp_1
2863241   2875696  1.00435   top25desk_google_com_calendar_.skp_1
7170086   7208447  1.00535   top25desk_booking_com.skp_1
7356109   7417776  1.00838   top25desk_pinterest.skp_1
5265591   5340392  1.01421   top25desk_weather_com.skp_1
5675244   5774144  1.01743   top25desk_sports_yahoo_com_.skp_1
1048531   1067663  1.01825   top25desk_games_yahoo_com.skp_1
2075501   2115131  1.01909   top25desk_amazon_com.skp_1
4262170   4370441  1.0254    top25desk_news_yahoo_com.skp_1
3789319   3897996  1.02868   top25desk_docs___1_open_documen.skp_1
919336    949979   1.03333   top25desk_wikipedia__1_tab_.skp_1
4274454   4489369  1.05028   top25desk_mail_google_com_mail_.skp_1
4149326   4376556  1.05476   top25desk_linkedin.skp_1

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2134893002
CQ_INCLUDE_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot;master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2134893002

src/core/SkBitmapProcShader.h
src/core/SkLinearBitmapPipeline.cpp
src/core/SkLinearBitmapPipeline.h
src/core/SkLinearBitmapPipeline_core.h
src/core/SkLinearBitmapPipeline_sample.h

index a4591c7..67b005a 100644 (file)
@@ -56,7 +56,7 @@ private:
     typedef SkShader INHERITED;
 };
 
-enum {kSkBlitterContextSize = 3200};
+enum {kSkBlitterContextSize = 3332};
 
 // Commonly used allocator. It currently is only used to allocate up to 3 objects. The total
 // bytes requested is calculated using one of our large shaders, its context size plus the size of
index 088e829..0122765 100644 (file)
@@ -165,15 +165,14 @@ static SkLinearBitmapPipeline::PointProcessorInterface* choose_matrix(
 // Tile Stage
 
 template<typename XStrategy, typename YStrategy, typename Next>
-class NearestTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
+class CombinedTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
 public:
-    template <typename... Args>
-    NearestTileStage(Next* next, SkISize dimensions)
+    CombinedTileStage(Next* next, SkISize dimensions)
         : fNext{next}
         , fXStrategy{dimensions.width()}
         , fYStrategy{dimensions.height()}{ }
 
-    NearestTileStage(Next* next, const NearestTileStage& stage)
+    CombinedTileStage(Next* next, const CombinedTileStage& stage)
         : fNext{next}
         , fXStrategy{stage.fXStrategy}
         , fYStrategy{stage.fYStrategy} { }
@@ -195,187 +194,48 @@ public:
         SkASSERT(!span.isEmpty());
         SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
-        SkScalar x = X(start);
-        SkScalar y = fYStrategy.tileY(Y(start));
-        Span yAdjustedSpan{{x, y}, length, count};
-        if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {
-            span_fallback(span, this);
-        }
-    }
-
-private:
-    Next* const fNext;
-    XStrategy fXStrategy;
-    YStrategy fYStrategy;
-};
-
-template<typename XStrategy, typename YStrategy, typename Next>
-class BilerpTileStage final : public SkLinearBitmapPipeline::PointProcessorInterface {
-public:
-    template <typename... Args>
-    BilerpTileStage(Next* next, SkISize dimensions)
-        : fNext{next}
-        , fXMax(dimensions.width())
-        , fYMax(dimensions.height())
-        , fXStrategy{dimensions.width()}
-        , fYStrategy{dimensions.height()} { }
-
-    BilerpTileStage(Next* next, const BilerpTileStage& stage)
-        : fNext{next}
-        , fXMax{stage.fXMax}
-        , fYMax{stage.fYMax}
-        , fXStrategy{stage.fXStrategy}
-        , fYStrategy{stage.fYStrategy} { }
 
-    void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
-        fXStrategy.tileXPoints(&xs);
-        fYStrategy.tileYPoints(&ys);
-        // TODO: check to see if xs and ys are in range then just call pointListFew on next.
-        if (n >= 1) this->bilerpPoint(xs[0], ys[0]);
-        if (n >= 2) this->bilerpPoint(xs[1], ys[1]);
-        if (n >= 3) this->bilerpPoint(xs[2], ys[2]);
-    }
-
-    void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
-        fXStrategy.tileXPoints(&xs);
-        fYStrategy.tileYPoints(&ys);
-        // TODO: check to see if xs and ys are in range then just call pointList4 on next.
-        this->bilerpPoint(xs[0], ys[0]);
-        this->bilerpPoint(xs[1], ys[1]);
-        this->bilerpPoint(xs[2], ys[2]);
-        this->bilerpPoint(xs[3], ys[3]);
-    }
-
-    struct Wrapper {
-        void pointSpan(Span span) {
-            processor->breakIntoEdges(span);
-        }
-
-        void repeatSpan(Span span, int32_t repeatCount) {
-            while (repeatCount --> 0) {
-                processor->pointSpan(span);
-            }
+        if (span.count() == 1) {
+            this->pointListFew(1, span.startX(), span.startY());
+            return;
         }
 
-        BilerpTileStage* processor;
-    };
-
-    // The span you pass must not be empty.
-    void pointSpan(Span span) override {
-        SkASSERT(!span.isEmpty());
+        SkScalar x = X(start);
+        SkScalar y = fYStrategy.tileY(Y(start));
+        Span yAdjustedSpan{{x, y}, length, count};
 
-        Wrapper wrapper = {this};
-        if (!fXStrategy.maybeProcessSpan(span, &wrapper)) {
+        if (!fXStrategy.maybeProcessSpan(yAdjustedSpan, fNext)) {
             span_fallback(span, this);
         }
     }
 
 private:
-    void bilerpPoint(SkScalar x, SkScalar y) {
-        Sk4f txs = Sk4f{x} + Sk4f{-0.5f, 0.5f, -0.5f, 0.5f};
-        Sk4f tys = Sk4f{y} + Sk4f{-0.5f, -0.5f, 0.5f, 0.5f};
-        fXStrategy.tileXPoints(&txs);
-        fYStrategy.tileYPoints(&tys);
-        fNext->bilerpEdge(txs, tys);
-    }
-
-    void handleEdges(Span span, SkScalar dx) {
-        SkPoint start; SkScalar length; int count;
-        std::tie(start, length, count) = span;
-        SkScalar x = X(start);
-        SkScalar y = Y(start);
-        SkScalar tiledY = fYStrategy.tileY(y);
-        while (count > 0) {
-            this->bilerpPoint(x, tiledY);
-            x += dx;
-            count -= 1;
-        }
-    }
-
-    void yProcessSpan(Span span) {
-        SkScalar tiledY = fYStrategy.tileY(span.startY());
-        if (0.5f <= tiledY && tiledY < fYMax - 0.5f ) {
-            Span tiledSpan{{span.startX(), tiledY}, span.length(), span.count()};
-            fNext->pointSpan(tiledSpan);
-        } else {
-            // Convert to the Y0 bilerp sample set by shifting by -0.5f. Then tile that new y
-            // value and shift it back resulting in the working Y0. Do the same thing with Y1 but
-            // in the opposite direction.
-            SkScalar y0 = fYStrategy.tileY(span.startY() - 0.5f) + 0.5f;
-            SkScalar y1 = fYStrategy.tileY(span.startY() + 0.5f) - 0.5f;
-            Span newSpan{{span.startX(), y0}, span.length(), span.count()};
-            fNext->bilerpSpan(newSpan, y1);
-        }
-    }
-    void breakIntoEdges(Span span) {
-        if (span.count() == 1) {
-            this->bilerpPoint(span.startX(), span.startY());
-        } else if (span.length() == 0) {
-            yProcessSpan(span);
-        } else {
-            SkScalar dx = span.length() / (span.count() - 1);
-            if (span.length() > 0) {
-                Span leftBorder = span.breakAt(0.5f, dx);
-                if (!leftBorder.isEmpty()) {
-                    this->handleEdges(leftBorder, dx);
-                }
-                Span center = span.breakAt(fXMax - 0.5f, dx);
-                if (!center.isEmpty()) {
-                    this->yProcessSpan(center);
-                }
-
-                if (!span.isEmpty()) {
-                    this->handleEdges(span, dx);
-                }
-            } else {
-                Span center = span.breakAt(fXMax + 0.5f, dx);
-                if (!span.isEmpty()) {
-                    this->handleEdges(span, dx);
-                }
-                Span leftEdge = center.breakAt(0.5f, dx);
-                if (!center.isEmpty()) {
-                    this->yProcessSpan(center);
-                }
-                if (!leftEdge.isEmpty()) {
-                    this->handleEdges(leftEdge, dx);
-                }
-
-            }
-        }
-    }
-
     Next* const fNext;
-    SkScalar fXMax;
-    SkScalar fYMax;
     XStrategy fXStrategy;
     YStrategy fYStrategy;
 };
 
-template <typename XStrategy, typename YStrategy, typename Next>
-void make_tile_stage(
-    SkFilterQuality filterQuality, SkISize dimensions,
-    Next* next, SkLinearBitmapPipeline::TileStage* tileStage) {
-    if (filterQuality == kNone_SkFilterQuality) {
-        tileStage->initStage<NearestTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
-    } else {
-        tileStage->initStage<BilerpTileStage<XStrategy, YStrategy, Next>>(next, dimensions);
-    }
-}
-template <typename XStrategy>
+template <typename XStrategy, typename Next>
 void choose_tiler_ymode(
     SkShader::TileMode yMode, SkFilterQuality filterQuality, SkISize dimensions,
-    SkLinearBitmapPipeline::SampleProcessorInterface* next,
+    Next* next,
     SkLinearBitmapPipeline::TileStage* tileStage) {
     switch (yMode) {
-        case SkShader::kClamp_TileMode:
-            make_tile_stage<XStrategy, YClampStrategy>(filterQuality, dimensions, next, tileStage);
+        case SkShader::kClamp_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YClampStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
-        case SkShader::kRepeat_TileMode:
-            make_tile_stage<XStrategy, YRepeatStrategy>(filterQuality, dimensions, next, tileStage);
+        }
+        case SkShader::kRepeat_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YRepeatStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
-        case SkShader::kMirror_TileMode:
-            make_tile_stage<XStrategy, YMirrorStrategy>(filterQuality, dimensions, next, tileStage);
+        }
+        case SkShader::kMirror_TileMode: {
+            using Tiler = CombinedTileStage<XStrategy, YMirrorStrategy, Next>;
+            tileStage->initStage<Tiler>(next, dimensions);
             break;
+        }
     }
 };
 
@@ -467,10 +327,6 @@ public:
         fDest = dest;
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
-
-    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
-
     void setDestination(void* dst, int count) override  {
         fDest = static_cast<uint32_t*>(dst);
         fEnd = fDest + count;
@@ -538,10 +394,6 @@ public:
         SkASSERT(fDest <= fEnd);
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override { SkFAIL("Not Implemented"); }
-
-    void bilerpSpan(Span span, SkScalar y) override { SkFAIL("Not Implemented"); }
-
     void setDestination(void* dst, int count) override  {
         SkASSERT(count > 0);
         fDest = static_cast<uint32_t*>(dst);
@@ -582,12 +434,9 @@ static SkLinearBitmapPipeline::PixelAccessorInterface* choose_specific_accessor(
     }
 }
 
-template<template <typename, typename> class Sampler>
-static SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler_base(
-    Blender* next,
+static SkLinearBitmapPipeline::PixelAccessorInterface* choose_pixel_accessor(
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
-    SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor)
 {
     const SkImageInfo& imageInfo = srcPixmap.info();
@@ -629,19 +478,19 @@ static SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler_ba
             break;
     }
 
-    using S = Sampler<PixelAccessorShim, Blender>;
-    sampleStage->initStage<S>(next, pixelAccessor);
-    return sampleStage->get();
+    return pixelAccessor;
 }
 
 SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler(
     Blender* next,
     SkFilterQuality filterQuality,
+    SkShader::TileMode xTile, SkShader::TileMode yTile,
     const SkPixmap& srcPixmap,
     const SkColor A8TintColor,
     SkLinearBitmapPipeline::SampleStage* sampleStage,
     SkLinearBitmapPipeline::Accessor* accessor) {
     const SkImageInfo& imageInfo = srcPixmap.info();
+    SkISize dimensions = imageInfo.dimensions();
 
     // Special case samplers with fully expanded templates
     if (imageInfo.gammaCloseToSRGB()) {
@@ -670,14 +519,14 @@ SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler(
                     using S =
                     BilerpSampler<
                         PixelAccessor<kN32_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, srcPixmap);
+                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
                     return sampleStage->get();
                 }
                 case kIndex_8_SkColorType: {
                     using S =
                     BilerpSampler<
                         PixelAccessor<kIndex_8_SkColorType, kSRGB_SkGammaType>, Blender>;
-                    sampleStage->initStage<S>(next, srcPixmap);
+                    sampleStage->initStage<S>(next, dimensions, xTile, yTile, srcPixmap);
                     return sampleStage->get();
                 }
                 default:
@@ -686,14 +535,16 @@ SkLinearBitmapPipeline::SampleProcessorInterface* choose_pixel_sampler(
         }
     }
 
+    auto pixelAccessor = choose_pixel_accessor(srcPixmap, A8TintColor, accessor);
     // General cases.
     if (filterQuality == kNone_SkFilterQuality) {
-        return choose_pixel_sampler_base<NearestNeighborSampler>(
-            next, srcPixmap, A8TintColor, sampleStage, accessor);
+        using S = NearestNeighborSampler<PixelAccessorShim, Blender>;
+        sampleStage->initStage<S>(next, pixelAccessor);
     } else {
-        return choose_pixel_sampler_base<BilerpSampler>(
-            next, srcPixmap, A8TintColor, sampleStage, accessor);
+        using S = BilerpSampler<PixelAccessorShim, Blender>;
+        sampleStage->initStage<S>(next, dimensions, xTile, yTile, pixelAccessor);
     }
+    return sampleStage->get();
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -705,17 +556,17 @@ public:
     SrcFPPixel(const SrcFPPixel& Blender) : fPostAlpha(Blender.fPostAlpha) {}
     void SK_VECTORCALL blendPixel(Sk4f pixel) override {
         SkASSERT(fDst + 1 <= fEnd );
-        SrcPixel(fDst, pixel, 0);
+        this->srcPixel(fDst, pixel, 0);
         fDst += 1;
     }
 
     void SK_VECTORCALL blend4Pixels(Sk4f p0, Sk4f p1, Sk4f p2, Sk4f p3) override {
         SkASSERT(fDst + 4 <= fEnd);
         SkPM4f* dst = fDst;
-        SrcPixel(dst, p0, 0);
-        SrcPixel(dst, p1, 1);
-        SrcPixel(dst, p2, 2);
-        SrcPixel(dst, p3, 3);
+        this->srcPixel(dst, p0, 0);
+        this->srcPixel(dst, p1, 1);
+        this->srcPixel(dst, p2, 2);
+        this->srcPixel(dst, p3, 3);
         fDst += 4;
     }
 
@@ -725,7 +576,9 @@ public:
     }
 
 private:
-    void SK_VECTORCALL SrcPixel(SkPM4f* dst, Sk4f pixel, int index) {
+    void SK_VECTORCALL srcPixel(SkPM4f* dst, Sk4f pixel, int index) {
+        check_pixel(pixel);
+
         Sk4f newPixel = pixel;
         if (alphaType == kUnpremul_SkAlphaType) {
             newPixel = Premultiply(pixel);
@@ -797,7 +650,8 @@ SkLinearBitmapPipeline::SkLinearBitmapPipeline(
     // identity matrix, the matrix stage is skipped, and the tilerStage is the first stage.
     auto blenderStage = choose_blender_for_shading(alphaType, postAlpha, &fBlenderStage);
     auto samplerStage = choose_pixel_sampler(
-        blenderStage, filterQuality, srcPixmap, paintColor, &fSampleStage, &fAccessor);
+        blenderStage, filterQuality, xTile, yTile,
+        srcPixmap, paintColor, &fSampleStage, &fAccessor);
     auto tilerStage   = choose_tiler(samplerStage, dimensions, xTile, yTile,
                                      filterQuality, dx, &fTileStage);
     fFirstStage       = choose_matrix(tilerStage, adjustedInverse, &fMatrixStage);
index b0f7e9d..91b573d 100644 (file)
@@ -133,9 +133,9 @@ public:
     // These values were generated by the assert above in Stage::init{Sink|Stage}.
     using MatrixStage  = Stage<PointProcessorInterface,    160, PointProcessorInterface>;
     using TileStage    = Stage<PointProcessorInterface,    160, SampleProcessorInterface>;
-    using SampleStage  = Stage<SampleProcessorInterface,   100, BlendProcessorInterface>;
+    using SampleStage  = Stage<SampleProcessorInterface,   160, BlendProcessorInterface>;
     using BlenderStage = Stage<BlendProcessorInterface,     40>;
-    using Accessor     = PolyMemory<PixelAccessorInterface, 48>;
+    using Accessor     = PolyMemory<PixelAccessorInterface, 64>;
 
 private:
     PointProcessorInterface* fFirstStage;
index 2c39a38..cf120ee 100644 (file)
@@ -178,6 +178,15 @@ void span_fallback(Span span, Stage* stage) {
         stage->pointListFew(count, xs, ys);
     }
 }
+
+inline Sk4f check_pixel(Sk4f& pixel) {
+    SkASSERTF(0.0f <= pixel[0] && pixel[0] <= 1.0f, "pixel[0]: %f", pixel[0]);
+    SkASSERTF(0.0f <= pixel[1] && pixel[1] <= 1.0f, "pixel[1]: %f", pixel[1]);
+    SkASSERTF(0.0f <= pixel[2] && pixel[2] <= 1.0f, "pixel[2]: %f", pixel[2]);
+    SkASSERTF(0.0f <= pixel[3] && pixel[3] <= 1.0f, "pixel[3]: %f", pixel[3]);
+    return pixel;
+}
+
 }  // namespace
 
 class SkLinearBitmapPipeline::PointProcessorInterface {
@@ -201,26 +210,6 @@ public:
     // Used for nearest neighbor when scale factor is 1.0. The span can just be repeated with no
     // edge pixel alignment problems. This is for handling a very common case.
     virtual void repeatSpan(Span span, int32_t repeatCount) = 0;
-
-    // The x's and y's are setup in the following order:
-    // +--------+--------+
-    // |        |        |
-    // |  px00  |  px10  |
-    // |    0   |    1   |
-    // +--------+--------+
-    // |        |        |
-    // |  px01  |  px11  |
-    // |    2   |    3   |
-    // +--------+--------+
-    // These pixels coordinates are arranged in the following order in xs and ys:
-    // px00  px10  px01  px11
-    virtual void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) = 0;
-
-    // A span represents sample points that have been mapped from destination space to source
-    // space. Each sample point is then expanded to the four bilerp points by add +/- 0.5. The
-    // resulting Y values my be off the tile. When y +/- 0.5 are more than 1 apart because of
-    // tiling, the second Y is used to denote the retiled Y value.
-    virtual void bilerpSpan(Span span, SkScalar y) = 0;
 };
 
 class SkLinearBitmapPipeline::DestinationInterface {
@@ -243,10 +232,10 @@ class SkLinearBitmapPipeline::PixelAccessorInterface {
 public:
     virtual ~PixelAccessorInterface() { }
     virtual void SK_VECTORCALL getFewPixels(
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const = 0;
 
     virtual void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
 
     virtual void get4Pixels(
         const void* src, int index, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const = 0;
index 759075b..5421758 100644 (file)
@@ -40,7 +40,7 @@ namespace {
 // * px11 -> xy
 // So x * y is calculated first and then used to calculate all the other factors.
 static Sk4s SK_VECTORCALL bilerp4(Sk4s xs, Sk4s ys, Sk4f px00, Sk4f px10,
-                               Sk4f px01, Sk4f px11) {
+                                                    Sk4f px01, Sk4f px11) {
     // Calculate fractional xs and ys.
     Sk4s fxs = xs - xs.floor();
     Sk4s fys = ys - ys.floor();
@@ -134,20 +134,21 @@ template <SkGammaType gammaType>
 class PixelConverter<kIndex_8_SkColorType, gammaType> {
 public:
     using Element = uint8_t;
-    PixelConverter(const SkPixmap& srcPixmap) {
+    PixelConverter(const SkPixmap& srcPixmap)
+    : fColorTableSize(srcPixmap.ctable()->count()){
         SkColorTable* skColorTable = srcPixmap.ctable();
         SkASSERT(skColorTable != nullptr);
 
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        for (int i = 0; i < skColorTable->count(); i++) {
+        for (int i = 0; i < fColorTableSize; i++) {
             fColorTable[i] = pmcolor_to_rgba<gammaType>((*skColorTable)[i]);
         }
     }
 
-    PixelConverter(const PixelConverter& strategy) {
+    PixelConverter(const PixelConverter& strategy)
+    : fColorTableSize{strategy.fColorTableSize}{
         fColorTable = (Sk4f*)SkAlign16((intptr_t)fColorTableStorage.get());
-        // TODO: figure out the count.
-        for (int i = 0; i < 256; i++) {
+        for (int i = 0; i < fColorTableSize; i++) {
             fColorTable[i] = strategy.fColorTable[i];
         }
     }
@@ -158,9 +159,9 @@ public:
 
 private:
     static const size_t kColorTableSize = sizeof(Sk4f[256]) + 12;
-
-    SkAutoMalloc         fColorTableStorage{kColorTableSize};
-    Sk4f*                fColorTable;
+    const int           fColorTableSize;
+    SkAutoMalloc        fColorTableStorage{kColorTableSize};
+    Sk4f*               fColorTable;
 };
 
 template <SkGammaType gammaType>
@@ -194,12 +195,12 @@ public:
         : fPixelAccessor(accessor) { }
 
     void SK_VECTORCALL getFewPixels(
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const {
         fPixelAccessor->getFewPixels(n, xs, ys, px0, px1, px2);
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const {
         fPixelAccessor->get4Pixels(xs, ys, px0, px1, px2, px3);
     }
 
@@ -237,10 +238,8 @@ public:
         , fConverter{srcPixmap, std::move<Args>(args)...} { }
 
     void SK_VECTORCALL getFewPixels (
-        int n, Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
+        int n, Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
         switch (n) {
             case 3:
                 *px2 = this->getPixelAt(bufferLoc[2]);
@@ -254,10 +253,8 @@ public:
     }
 
     void SK_VECTORCALL get4Pixels(
-        Sk4s xs, Sk4s ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
-        Sk4i XIs = SkNx_cast<int, SkScalar>(xs);
-        Sk4i YIs = SkNx_cast<int, SkScalar>(ys);
-        Sk4i bufferLoc = YIs * fWidth + XIs;
+        Sk4i xs, Sk4i ys, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) const override {
+        Sk4i bufferLoc = ys * fWidth + xs;
         *px0 = this->getPixelAt(bufferLoc[0]);
         *px1 = this->getPixelAt(bufferLoc[1]);
         *px2 = this->getPixelAt(bufferLoc[2]);
@@ -330,6 +327,7 @@ static void src_strategy_blend(Span span, Next* next, Strategy* strategy) {
     }
 }
 
+// -- NearestNeighborSampler -----------------------------------------------------------------------
 // NearestNeighborSampler - use nearest neighbor filtering to create runs of destination pixels.
 template<typename Accessor, typename Next>
 class NearestNeighborSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
@@ -345,7 +343,7 @@ public:
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         Sk4f px0, px1, px2;
-        fAccessor.getFewPixels(n, xs, ys, &px0, &px1, &px2);
+        fAccessor.getFewPixels(n, SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2);
         if (n >= 1) fNext->blendPixel(px0);
         if (n >= 2) fNext->blendPixel(px1);
         if (n >= 3) fNext->blendPixel(px2);
@@ -353,7 +351,7 @@ public:
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         Sk4f px0, px1, px2, px3;
-        fAccessor.get4Pixels(xs, ys, &px0, &px1, &px2, &px3);
+        fAccessor.get4Pixels(SkNx_cast<int>(xs), SkNx_cast<int>(ys), &px0, &px1, &px2, &px3);
         fNext->blend4Pixels(px0, px1, px2, px3);
     }
 
@@ -380,21 +378,11 @@ public:
         }
     }
 
-    void SK_VECTORCALL bilerpEdge(Sk4s xs, Sk4s ys) override {
-        SkFAIL("Using nearest neighbor sampler, but calling a bilerpEdge.");
-    }
-
-    void bilerpSpan(Span span, SkScalar y) override {
-        SkFAIL("Using nearest neighbor sampler, but calling a bilerpSpan.");
-    }
-
 private:
     // When moving through source space more slowly than dst space (zoomed in),
     // we'll be sampling from the same source pixel more than once.
     void spanSlowRate(Span span) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkFixed fx = SkScalarToFixed(x);
@@ -451,35 +439,82 @@ private:
     Accessor    fAccessor;
 };
 
+// From an edgeType, the integer value of a pixel vs, and the integer value of the extreme edge
+// vMax, take the point which might be off the tile by one pixel and either wrap it or pin it to
+// generate the right pixel. The value vs is on the interval [-1, vMax + 1]. It produces a value
+// on the interval [0, vMax].
+// Note: vMax is not width or height, but width-1 or height-1 because it is the largest valid pixel.
+static inline int adjust_edge(SkShader::TileMode edgeType, int vs, int vMax) {
+    SkASSERT(-1 <= vs && vs <= vMax + 1)
+    switch (edgeType) {
+        case SkShader::kClamp_TileMode:
+        case SkShader::kMirror_TileMode:
+            vs = std::max(vs, 0);
+            vs = std::min(vs, vMax);
+            break;
+        case SkShader::kRepeat_TileMode:
+            vs = (vs <= vMax) ? vs : 0;
+            vs =    (vs >= 0) ? vs : vMax;
+            break;
+    }
+    SkASSERT(0 <= vs && vs <= vMax);
+    return vs;
+}
+
+// From a sample point on the tile, return the top or left filter value.
+// The result r should be in the range (0, 1]. Since this represents the weight given to the top
+// left element, then if x == 0.5 the filter value should be 1.0.
+// The input sample point must be on the tile, therefore it must be >= 0.
+static SkScalar sample_to_filter(SkScalar x) {
+    SkASSERT(x >= 0.0f);
+    // The usual form of the top or left edge is x - .5, but since we are working on the unit
+    // square, then x + .5 works just as well. This also guarantees that v > 0.0 allowing the use
+    // of trunc.
+    SkScalar v = x + 0.5f;
+    // Produce the top or left offset a value on the range [0, 1).
+    SkScalar f = v - SkScalarTruncToScalar(v);
+    // Produce the filter value which is on the range (0, 1].
+    SkScalar r =  1.0f - f;
+    SkASSERT(0.0f < r && r <= 1.0f);
+    return r;
+}
+
 // -- BilerpSampler --------------------------------------------------------------------------------
 // BilerpSampler - use a bilerp filter to create runs of destination pixels.
+// Note: in the code below, there are two types of points
+//       * sample points - these are the points passed in by pointList* and Spans.
+//       * filter points - are created from a sample point to form the coordinates of the points
+//                         to use in the filter and to generate the filter values.
 template<typename Accessor, typename Next>
 class BilerpSampler : public SkLinearBitmapPipeline::SampleProcessorInterface {
 public:
     template<typename... Args>
-    BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next, Args&& ... args)
-        : fNext{next}, fAccessor{std::forward<Args>(args)...} { }
+    BilerpSampler(
+        SkLinearBitmapPipeline::BlendProcessorInterface* next,
+        SkISize dimensions,
+        SkShader::TileMode xTile, SkShader::TileMode yTile,
+        Args&& ... args
+    )
+        : fNext{next}
+        , fXEdgeType{xTile}
+        , fXMax{dimensions.width() - 1}
+        , fYEdgeType{yTile}
+        , fYMax{dimensions.height() - 1}
+        , fAccessor{std::forward<Args>(args)...} { }
 
     BilerpSampler(SkLinearBitmapPipeline::BlendProcessorInterface* next,
                    const BilerpSampler& sampler)
-        : fNext{next}, fAccessor{sampler.fAccessor} { }
-
-    Sk4f bilerpNonEdgePixel(SkScalar x, SkScalar y) {
-        Sk4f px00, px10, px01, px11;
-
-        // bilerp4() expects xs, ys are the top-lefts of the 2x2 kernel.
-        Sk4f xs = Sk4f{x} - 0.5f;
-        Sk4f ys = Sk4f{y} - 0.5f;
-        Sk4f sampleXs = xs + Sk4f{0.0f, 1.0f, 0.0f, 1.0f};
-        Sk4f sampleYs = ys + Sk4f{0.0f, 0.0f, 1.0f, 1.0f};
-        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
-        return bilerp4(xs, ys, px00, px10, px01, px11);
-    }
+        : fNext{next}
+        , fXEdgeType{sampler.fXEdgeType}
+        , fXMax{sampler.fXMax}
+        , fYEdgeType{sampler.fYEdgeType}
+        , fYMax{sampler.fYMax}
+        , fAccessor{sampler.fAccessor} { }
 
     void SK_VECTORCALL pointListFew(int n, Sk4s xs, Sk4s ys) override {
         SkASSERT(0 < n && n < 4);
         auto bilerpPixel = [&](int index) {
-            return this->bilerpNonEdgePixel(xs[index], ys[index]);
+            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
         };
 
         if (n >= 1) fNext->blendPixel(bilerpPixel(0));
@@ -489,308 +524,484 @@ public:
 
     void SK_VECTORCALL pointList4(Sk4s xs, Sk4s ys) override {
         auto bilerpPixel = [&](int index) {
-            return this->bilerpNonEdgePixel(xs[index], ys[index]);
+            return this->bilerpSamplePoint(SkPoint{xs[index], ys[index]});
         };
         fNext->blend4Pixels(bilerpPixel(0), bilerpPixel(1), bilerpPixel(2), bilerpPixel(3));
     }
 
     void pointSpan(Span span) override {
-        this->bilerpSpan(span, span.startY());
-    }
-
-    void repeatSpan(Span span, int32_t repeatCount) override {
-        while (repeatCount > 0) {
-            this->pointSpan(span);
-            repeatCount--;
-        }
-    }
-
-    void SK_VECTORCALL bilerpEdge(Sk4s sampleXs, Sk4s sampleYs) override {
-        Sk4f px00, px10, px01, px11;
-        Sk4f xs = Sk4f{sampleXs[0]};
-        Sk4f ys = Sk4f{sampleYs[0]};
-        fAccessor.get4Pixels(sampleXs, sampleYs, &px00, &px10, &px01, &px11);
-        Sk4f pixel = bilerp4(xs, ys, px00, px10, px01, px11);
-        fNext->blendPixel(pixel);
-    }
-
-    void bilerpSpan(Span span, SkScalar y) override {
         SkASSERT(!span.isEmpty());
         SkPoint start;
         SkScalar length;
         int count;
         std::tie(start, length, count) = span;
+
+        // Nothing to do.
+        if (count == 0) {
+            return;
+        }
+
+        // Trivial case. No sample points are generated other than start.
+        if (count == 1) {
+            fNext->blendPixel(this->bilerpSamplePoint(start));
+            return;
+        }
+
+        // Note: the following code could be done in terms of dx = length / (count -1), but that
+        // would introduce a divide that is not needed for the most common dx == 1 cases.
         SkScalar absLength = SkScalarAbs(length);
         if (absLength == 0.0f) {
-            this->spanZeroRate(span, y);
+            // |dx| == 0
+            // length is zero, so clamp an edge pixel.
+            this->spanZeroRate(span);
         } else if (absLength < (count - 1)) {
-            this->spanSlowRate(span, y);
+            // 0 < |dx| < 1.
+            this->spanSlowRate(span);
         } else if (absLength == (count - 1)) {
-            if (std::fmod(span.startX() - 0.5f, 1.0f) == 0.0f) {
-                if (std::fmod(span.startY() - 0.5f, 1.0f) == 0.0f) {
-                    src_strategy_blend(span, fNext, &fAccessor);
-                } else {
-                    this->spanUnitRateAlignedX(span, y);
-                }
+            // |dx| == 1.
+            if (sample_to_filter(span.startX()) == 1.0f
+                && sample_to_filter(span.startY()) == 1.0f) {
+                // All the pixels are aligned with the dest; go fast.
+                src_strategy_blend(span, fNext, &fAccessor);
             } else {
-                this->spanUnitRate(span, y);
+                // There is some sub-pixel offsets, so bilerp.
+                this->spanUnitRate(span);
             }
+        } else if (absLength < 2.0f * (count - 1)) {
+            // 1 < |dx| < 2.
+            this->spanMediumRate(span);
         } else {
-            this->spanFastRate(span, y);
+            // |dx| >= 2.
+            this->spanFastRate(span);
+        }
+    }
+
+    void repeatSpan(Span span, int32_t repeatCount) override {
+        while (repeatCount > 0) {
+            this->pointSpan(span);
+            repeatCount--;
         }
     }
 
 private:
-    void spanZeroRate(Span span, SkScalar y1) {
-        SkScalar y0 = span.startY() - 0.5f;
-        y1 += 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        int ix = SkScalarFloorToInt(span.startX());
-        Sk4f pixelY0 = fAccessor.getPixelFromRow(fAccessor.row(iy0), ix);
-        Sk4f pixelY1 = fAccessor.getPixelFromRow(fAccessor.row(iy1), ix);
-        Sk4f filterPixel = pixelY0 * filterY0 + pixelY1 * filterY1;
-        int count = span.count();
+
+    // Convert a sample point to the points used by the filter.
+    void filterPoints(SkPoint sample, Sk4i* filterXs, Sk4i* filterYs) {
+        // May be less than zero. Be careful to use Floor.
+        int x0 = adjust_edge(fXEdgeType, SkScalarFloorToInt(X(sample) - 0.5), fXMax);
+        // Always greater than zero. Use the faster Trunc.
+        int x1 = adjust_edge(fXEdgeType, SkScalarTruncToInt(X(sample) + 0.5), fXMax);
+        int y0 = adjust_edge(fYEdgeType, SkScalarFloorToInt(Y(sample) - 0.5), fYMax);
+        int y1 = adjust_edge(fYEdgeType, SkScalarTruncToInt(Y(sample) + 0.5), fYMax);
+
+        *filterXs = Sk4i{x0, x1, x0, x1};
+        *filterYs = Sk4i{y0, y0, y1, y1};
+    }
+
+    // Given a sample point, generate a color by bilerping the four filter points.
+    Sk4f bilerpSamplePoint(SkPoint sample) {
+        Sk4i iXs, iYs;
+        filterPoints(sample, &iXs, &iYs);
+        Sk4f px00, px10, px01, px11;
+        fAccessor.get4Pixels(iXs, iYs, &px00, &px10, &px01, &px11);
+        return bilerp4(Sk4f{X(sample) - 0.5f}, Sk4f{Y(sample) - 0.5f}, px00, px10, px01, px11);
+    }
+
+    // Get two pixels at x from row0 and row1.
+    void get2PixelColumn(const void* row0, const void* row1, int x, Sk4f* px0, Sk4f* px1) {
+        *px0 = fAccessor.getPixelFromRow(row0, x);
+        *px1 = fAccessor.getPixelFromRow(row1, x);
+    }
+
+    // |dx| == 0. This code assumes that length is zero.
+    void spanZeroRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkASSERT(length == 0.0f);
+
+        // Filter for the blending of the top and bottom pixels.
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
+
+        // Get the two pixels that make up the clamping pixel.
+        Sk4f pxTop, pxBottom;
+        this->get2PixelColumn(row0, row1, SkScalarFloorToInt(X(start)), &pxTop, &pxBottom);
+        Sk4f pixel = pxTop * filterY + (1.0f - filterY) * pxBottom;
+
         while (count >= 4) {
-            fNext->blend4Pixels(filterPixel, filterPixel, filterPixel, filterPixel);
+            fNext->blend4Pixels(pixel, pixel, pixel, pixel);
             count -= 4;
         }
         while (count > 0) {
-            fNext->blendPixel(filterPixel);
+            fNext->blendPixel(pixel);
             count -= 1;
         }
     }
 
-    // When moving through source space more slowly than dst space (zoomed in),
-    // we'll be sampling from the same source pixel more than once.
-    void spanSlowRate(Span span, SkScalar ry1) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+    // 0 < |dx| < 1. This code reuses the calculations from previous pixels to reduce
+    // computation. In particular, several destination pixels maybe generated from the same four
+    // source pixels.
+    // In the following code a "part" is a combination of two pixels from the same column of the
+    // filter.
+    void spanSlowRate(Span span) {
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
-        SkFixed fx = SkScalarToFixed(X(start)-0.5f);
 
-        SkFixed fdx = SkScalarToFixed(length / (count - 1));
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);
+        SkASSERT(-1.0f < dx && dx < 1.0f && dx != 0.0f);
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
+
+        // Generate part of the filter value at xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
 
-        Sk4f xAdjust;
-        if (fdx >= 0) {
-            xAdjust = Sk4f{-1.0f};
-        } else {
-            xAdjust = Sk4f{1.0f};
-        }
-        int ix = SkFixedFloorToInt(fx);
-        int ioldx = ix;
-        Sk4f x{SkFixedToScalar(fx) - ix};
-        Sk4f dx{SkFixedToScalar(fdx)};
-        SkScalar ry0 = Y(start) - 0.5f;
-        ry1 += 0.5f;
-        SkScalar yFloor = std::floor(ry0);
-        Sk4f y1 = Sk4f{ry0 - yFloor};
-        Sk4f y0 = Sk4f{1.0f} - y1;
-        const void* const row0 = fAccessor.row(SkScalarFloorToInt(ry0));
-        const void* const row1 = fAccessor.row(SkScalarFloorToInt(ry1));
-        Sk4f fpixel00 = y0 * fAccessor.getPixelFromRow(row0, ix);
-        Sk4f fpixel01 = y1 * fAccessor.getPixelFromRow(row1, ix);
-        Sk4f fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
-        Sk4f fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
-        auto getNextPixel = [&]() {
-            if (ix != ioldx) {
-                fpixel00 = fpixel10;
-                fpixel01 = fpixel11;
-                fpixel10 = y0 * fAccessor.getPixelFromRow(row0, ix + 1);
-                fpixel11 = y1 * fAccessor.getPixelFromRow(row1, ix + 1);
-                ioldx = ix;
-                x = x + xAdjust;
-            }
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the partAtColumn function above.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
 
-            Sk4f x0, x1;
-            x0 = Sk4f{1.0f} - x;
-            x1 = x;
-            Sk4f fpixel = x0 * (fpixel00 + fpixel01) + x1 * (fpixel10 + fpixel11);
-            fx += fdx;
-            ix = SkFixedFloorToInt(fx);
-            x = x + dx;
-            return fpixel;
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&]() {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
         };
 
-        while (count >= 4) {
-            Sk4f fpixel0 = getNextPixel();
-            Sk4f fpixel1 = getNextPixel();
-            Sk4f fpixel2 = getNextPixel();
-            Sk4f fpixel3 = getNextPixel();
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp());
+        count -= 1;
+
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
+
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                if (filterX <= 0.0f) {
+                    filterX += 1.0f;
+                    leftPart = rightPart;
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
 
-            fNext->blend4Pixels(fpixel0, fpixel1, fpixel2, fpixel3);
-            count -= 4;
-        }
+                return bilerp();
+            };
 
-        while (count > 0) {
-            fNext->blendPixel(getNextPixel());
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
 
-            count -= 1;
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
+        } else {
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            // Advance the filter from right to left. Remember that moving the top-left corner of
+            // the filter to the left actually makes the filter value larger.
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // At this point filterX may be > 1, and needs to be wrapped back on to the filter
+                // interval, and the next column in the filter is calculated.
+                if (filterX > 1.0f) {
+                    filterX -= 1.0f;
+                    rightPart = leftPart;
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp();
+            };
+
+            while (count >= 4) {
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
+                count -= 4;
+            }
+
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
+                count -= 1;
+            }
         }
     }
 
-    // We're moving through source space at a rate of 1 source pixel per 1 dst pixel.
-    // We'll never re-use pixels, but we can at least load contiguous pixels.
-    void spanUnitRate(Span span, SkScalar y1) {
-        y1 += 0.5f;
-        SkScalar y0 = span.startY() - 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        const void* rowY0 = fAccessor.row(iy0);
-        const void* rowY1 = fAccessor.row(iy1);
-        SkScalar x0 = span.startX() - 0.5f;
-        int ix0 = SkScalarFloorToInt(x0);
-        SkScalar filterX1 = x0 - ix0;
-        SkScalar filterX0 = 1.0f - filterX1;
-
-        auto getPixelY0 = [&]() {
-            Sk4f px = fAccessor.getPixelFromRow(rowY0, ix0);
-            return px * filterY0;
-        };
-
-        auto getPixelY1 = [&]() {
-            Sk4f px = fAccessor.getPixelFromRow(rowY1, ix0);
-            return px * filterY1;
+    // |dx| == 1. Moving through source space at a rate of 1 source pixel per 1 dst pixel.
+    // Every filter part is used for two destination pixels, and the code can bulk load four
+    // pixels at a time.
+    void spanUnitRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+        SkASSERT(SkScalarAbs(length) == (count - 1));
+
+        // Calculate the four filter points of start, and use the two different Y values to
+        // generate the row pointers.
+        Sk4i iXs, iYs;
+        filterPoints(start, &iXs, &iYs);
+        const void* row0 = fAccessor.row(iYs[0]);
+        const void* row1 = fAccessor.row(iYs[2]);
+
+        // Calculate the filter values for the top-left filter element.
+        const SkScalar filterX = sample_to_filter(X(start));
+        const SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate part of the filter value at xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
         };
 
-        auto get4PixelsY0 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-            fAccessor.get4Pixels(rowY0, ix, px0, px1, px2, px3);
-            *px0 = *px0 * filterY0;
-            *px1 = *px1 * filterY0;
-            *px2 = *px2 * filterY0;
-            *px3 = *px3 * filterY0;
+        auto get4Parts = [&](int ix, Sk4f* part0, Sk4f* part1, Sk4f* part2, Sk4f* part3) {
+            // Check if the pixels needed are near the edges. If not go fast using bulk pixels,
+            // otherwise be careful.
+            if (0 <= ix && ix <= fXMax - 3) {
+                Sk4f px00, px10, px20, px30,
+                     px01, px11, px21, px31;
+                fAccessor.get4Pixels(row0, ix, &px00, &px10, &px20, &px30);
+                fAccessor.get4Pixels(row1, ix, &px01, &px11, &px21, &px31);
+                *part0 = filterY * px00 + (1.0f - filterY) * px01;
+                *part1 = filterY * px10 + (1.0f - filterY) * px11;
+                *part2 = filterY * px20 + (1.0f - filterY) * px21;
+                *part3 = filterY * px30 + (1.0f - filterY) * px31;
+            } else {
+                *part0 = partAtColumn(ix + 0);
+                *part1 = partAtColumn(ix + 1);
+                *part2 = partAtColumn(ix + 2);
+                *part3 = partAtColumn(ix + 3);
+            }
         };
 
-        auto get4PixelsY1 = [&](int ix, Sk4f* px0, Sk4f* px1, Sk4f* px2, Sk4f* px3) {
-            fAccessor.get4Pixels(rowY1, ix, px0, px1, px2, px3);
-            *px0 = *px0 * filterY1;
-            *px1 = *px1 * filterY1;
-            *px2 = *px2 * filterY1;
-            *px3 = *px3 * filterY1;
+        auto bilerp = [&](Sk4f& part0, Sk4f& part1) {
+            return part0 * filterX + part1 * (1.0f - filterX);
         };
 
-        auto lerp = [&](Sk4f& pixelX0, Sk4f& pixelX1) {
-            return pixelX0 * filterX0 + pixelX1 * filterX1;
-        };
+        if (length > 0) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
 
-        // Mid making 4 unit rate.
-        Sk4f pxB = getPixelY0() + getPixelY1();
-        if (span.length() > 0) {
-            int count = span.count();
+            // overlapPart is the filter part from the end of the previous four pixels used at
+            // the start of the next four pixels.
+            Sk4f overlapPart = partAtColumn(iXs[0]);
+            int rightColumnCursor = iXs[1];
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                get4PixelsY0(ix0, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                get4PixelsY1(ix0, &px01, &px11, &px21, &px31);
-                Sk4f pxS0 = px00 + px01;
-                Sk4f px0 = lerp(pxB, pxS0);
-                Sk4f pxS1 = px10 + px11;
-                Sk4f px1 = lerp(pxS0, pxS1);
-                Sk4f pxS2 = px20 + px21;
-                Sk4f px2 = lerp(pxS1, pxS2);
-                Sk4f pxS3 = px30 + px31;
-                Sk4f px3 = lerp(pxS2, pxS3);
-                pxB = pxS3;
+                Sk4f part0, part1, part2, part3;
+                get4Parts(rightColumnCursor, &part0, &part1, &part2, &part3);
+                Sk4f px0 = bilerp(overlapPart, part0);
+                Sk4f px1 = bilerp(part0, part1);
+                Sk4f px2 = bilerp(part1, part2);
+                Sk4f px3 = bilerp(part2, part3);
+                overlapPart = part3;
                 fNext->blend4Pixels(px0, px1, px2, px3);
-                ix0 += 4;
+                rightColumnCursor += 4;
                 count -= 4;
             }
+
             while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
+                Sk4f rightPart = partAtColumn(rightColumnCursor);
 
-                fNext->blendPixel(lerp(pixelY0, pixelY1));
-                ix0 += 1;
+                fNext->blendPixel(bilerp(overlapPart, rightPart));
+                overlapPart = rightPart;
+                rightColumnCursor += 1;
                 count -= 1;
             }
         } else {
-            int count = span.count();
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            Sk4f overlapPart = partAtColumn(iXs[1]);
+            int leftColumnCursor = iXs[0];
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                get4PixelsY0(ix0 - 3, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                get4PixelsY1(ix0 - 3, &px01, &px11, &px21, &px31);
-                Sk4f pxS3 = px30 + px31;
-                Sk4f px0 = lerp(pxS3, pxB);
-                Sk4f pxS2 = px20 + px21;
-                Sk4f px1 = lerp(pxS2, pxS3);
-                Sk4f pxS1 = px10 + px11;
-                Sk4f px2 = lerp(pxS1, pxS2);
-                Sk4f pxS0 = px00 + px01;
-                Sk4f px3 = lerp(pxS0, pxS1);
-                pxB = pxS0;
+                Sk4f part0, part1, part2, part3;
+                get4Parts(leftColumnCursor - 3, &part3, &part2, &part1, &part0);
+                Sk4f px0 = bilerp(part0, overlapPart);
+                Sk4f px1 = bilerp(part1, part0);
+                Sk4f px2 = bilerp(part2, part1);
+                Sk4f px3 = bilerp(part3, part2);
+                overlapPart = part3;
                 fNext->blend4Pixels(px0, px1, px2, px3);
-                ix0 -= 4;
+                leftColumnCursor -= 4;
                 count -= 4;
             }
+
             while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix0);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix0);
+                Sk4f leftPart = partAtColumn(leftColumnCursor);
 
-                fNext->blendPixel(lerp(pixelY0, pixelY1));
-                ix0 -= 1;
+                fNext->blendPixel(bilerp(leftPart, overlapPart));
+                overlapPart = leftPart;
+                leftColumnCursor -= 1;
                 count -= 1;
             }
         }
     }
 
-    void spanUnitRateAlignedX(Span span, SkScalar y1) {
-        SkScalar y0 = span.startY() - 0.5f;
-        y1 += 0.5f;
-        int iy0 = SkScalarFloorToInt(y0);
-        SkScalar filterY1 = y0 - iy0;
-        SkScalar filterY0 = 1.0f - filterY1;
-        int iy1 = SkScalarFloorToInt(y1);
-        int ix = SkScalarFloorToInt(span.startX());
-        const void* rowY0 = fAccessor.row(iy0);
-        const void* rowY1 = fAccessor.row(iy1);
-        auto lerp = [&](Sk4f* pixelY0, Sk4f* pixelY1) {
-            return *pixelY0 * filterY0 + *pixelY1 * filterY1;
+    // 1 < |dx| < 2. Going through the source pixels at a faster rate than the dest pixels, but
+    // still slow enough to take advantage of previous calculations.
+    void spanMediumRate(Span span) {
+        SkPoint start; SkScalar length; int count;
+        std::tie(start, length, count) = span;
+
+        // Calculate the distance between each sample point.
+        const SkScalar dx = length / (count - 1);
+        SkASSERT((-2.0f < dx && dx < -1.0f) || (1.0f < dx && dx < 2.0f));
+
+        // Generate the filter values for the top-left corner.
+        // Note: these values are in filter space; this has implications about how to adjust
+        // these values at each step. For example, as the sample point increases, the filter
+        // value decreases, this is because the filter and position are related by
+        // (1 - (X(sample) - .5)) % 1. The (1 - stuff) causes the filter to move in the opposite
+        // direction of the sample point which is increasing by dx.
+        SkScalar filterX = sample_to_filter(X(start));
+        SkScalar filterY = sample_to_filter(Y(start));
+
+        // Generate the four filter points from the sample point start. Generate the row* values.
+        Sk4i iXs, iYs;
+        this->filterPoints(start, &iXs, &iYs);
+        const void* const row0 = fAccessor.row(iYs[0]);
+        const void* const row1 = fAccessor.row(iYs[2]);
+
+        // Generate part of the filter value at xColumn.
+        auto partAtColumn = [&](int xColumn) {
+            int adjustedColumn = adjust_edge(fXEdgeType, xColumn, fXMax);
+            Sk4f pxTop, pxBottom;
+            this->get2PixelColumn(row0, row1, adjustedColumn, &pxTop, &pxBottom);
+            return pxTop * filterY + (1.0f - filterY) * pxBottom;
+        };
+
+        // The leftPart is made up of two pixels from the left column of the filter, right part
+        // is similar. The top and bottom pixels in the *Part are created as a linear blend of
+        // the top and bottom pixels using filterY. See the nextPart function below.
+        Sk4f leftPart  = partAtColumn(iXs[0]);
+        Sk4f rightPart = partAtColumn(iXs[1]);
+
+        // Create a destination color by blending together a left and right part using filterX.
+        auto bilerp = [&]() {
+            Sk4f pixel = leftPart * filterX + rightPart * (1.0f - filterX);
+            return check_pixel(pixel);
         };
 
-        if (span.length() > 0) {
-            int count = span.count();
+        // Send the first pixel to the destination. This simplifies the loop structure so that no
+        // extra pixels are fetched for the last iteration of the loop.
+        fNext->blendPixel(bilerp());
+        count -= 1;
+
+        if (dx > 0.0f) {
+            // * positive direction - generate destination pixels by sliding the filter from left
+            //                        to right.
+            int rightPartCursor = iXs[1];
+
+            // Advance the filter from left to right. Remember that moving the top-left corner of
+            // the filter to the right actually makes the filter value smaller.
+            auto advanceFilter = [&]() {
+                filterX -= dx;
+                // At this point filterX is less than zero, but might actually be less than -1.
+                if (filterX > -1.0f) {
+                    filterX += 1.0f;
+                    leftPart = rightPart;
+                    rightPartCursor += 1;
+                    rightPart = partAtColumn(rightPartCursor);
+                } else {
+                    filterX += 2.0f;
+                    rightPartCursor += 2;
+                    leftPart = partAtColumn(rightPartCursor - 1);
+                    rightPart = partAtColumn(rightPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+
+                return bilerp();
+            };
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                fAccessor.get4Pixels(rowY0, ix, &px00, &px10, &px20, &px30);
-                Sk4f px01, px11, px21, px31;
-                fAccessor.get4Pixels(rowY1, ix, &px01, &px11, &px21, &px31);
-                fNext->blend4Pixels(
-                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
-                ix += 4;
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
-                ix += 1;
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
                 count -= 1;
             }
         } else {
-            int count = span.count();
+            // * negative direction - generate destination pixels by sliding the filter from
+            //                        right to left.
+            int leftPartCursor = iXs[0];
+
+            auto advanceFilter = [&]() {
+                // Remember, dx < 0 therefore this adds |dx| to filterX.
+                filterX -= dx;
+                // At this point, filterX is greater than one, but may actually be greater than two.
+                if (filterX < 2.0f) {
+                    filterX -= 1.0f;
+                    rightPart = leftPart;
+                    leftPartCursor -= 1;
+                    leftPart = partAtColumn(leftPartCursor);
+                } else {
+                    filterX -= 2.0f;
+                    leftPartCursor -= 2;
+                    rightPart = partAtColumn(leftPartCursor - 1);
+                    leftPart = partAtColumn(leftPartCursor);
+                }
+                SkASSERT(0.0f < filterX && filterX <= 1.0f);
+                return bilerp();
+            };
+
             while (count >= 4) {
-                Sk4f px00, px10, px20, px30;
-                fAccessor.get4Pixels(rowY0, ix - 3, &px30, &px20, &px10, &px00);
-                Sk4f px01, px11, px21, px31;
-                fAccessor.get4Pixels(rowY1, ix - 3, &px31, &px21, &px11, &px01);
-                fNext->blend4Pixels(
-                    lerp(&px00, &px01), lerp(&px10, &px11), lerp(&px20, &px21), lerp(&px30, &px31));
-                ix -= 4;
+                Sk4f px0 = advanceFilter(),
+                     px1 = advanceFilter(),
+                     px2 = advanceFilter(),
+                     px3 = advanceFilter();
+                fNext->blend4Pixels(px0, px1, px2, px3);
                 count -= 4;
             }
-            while (count > 0) {
-                Sk4f pixelY0 = fAccessor.getPixelFromRow(rowY0, ix);
-                Sk4f pixelY1 = fAccessor.getPixelFromRow(rowY1, ix);
 
-                fNext->blendPixel(lerp(&pixelY0, &pixelY1));
-                ix -= 1;
+            while (count > 0) {
+                fNext->blendPixel(advanceFilter());
                 count -= 1;
             }
         }
@@ -798,34 +1009,26 @@ private:
 
     // We're moving through source space faster than dst (zoomed out),
     // so we'll never reuse a source pixel or be able to do contiguous loads.
-    void spanFastRate(Span span, SkScalar y1) {
-        SkPoint start;
-        SkScalar length;
-        int count;
+    void spanFastRate(Span span) {
+        SkPoint start; SkScalar length; int count;
         std::tie(start, length, count) = span;
         SkScalar x = X(start);
         SkScalar y = Y(start);
 
-        // In this sampler, it is assumed that if span.StartY() and y1 are the same then both
-        // y-lines are on the same tile.
-        if (y == y1) {
-            // Both y-lines are on the same tile.
-            span_fallback(span, this);
-        } else {
-            // The y-lines are on different tiles.
-            SkScalar dx = length / (count - 1);
-            Sk4f ys = {y - 0.5f, y - 0.5f, y1 + 0.5f, y1 + 0.5f};
-            while (count > 0) {
-                Sk4f xs = Sk4f{-0.5f, 0.5f, -0.5f, 0.5f} + Sk4f{x};
-                this->bilerpEdge(xs, ys);
-                x += dx;
-                count -= 1;
-            }
+        SkScalar dx = length / (count - 1);
+        while (count > 0) {
+            fNext->blendPixel(this->bilerpSamplePoint(SkPoint{x, y}));
+            x += dx;
+            count -= 1;
         }
     }
 
-    Next* const fNext;
-    Accessor    fAccessor;
+    Next* const              fNext;
+    const SkShader::TileMode fXEdgeType;
+    const int                fXMax;
+    const SkShader::TileMode fYEdgeType;
+    const int                fYMax;
+    Accessor                 fAccessor;
 };
 
 }  // namespace