From 13f550fb38236363ba52c745bad2dc7f0621c208 Mon Sep 17 00:00:00 2001
From: JunsuChoi <jsuya.choi@samsung.com>
Date: Mon, 25 Oct 2021 14:33:20 +0900
Subject: [PATCH 01/16] example: Modify the number of svg image examples

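Find the hidden example: lay the pictures out in a grid of NUM_PER_ROW (7)
columns by NUM_PER_COL (6) rows instead of 6 pictures per line.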
---
 src/examples/Svg.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/examples/Svg.cpp b/src/examples/Svg.cpp
index 7c05cdf..da30c14 100644
--- a/src/examples/Svg.cpp
+++ b/src/examples/Svg.cpp
@@ -27,8 +27,9 @@
 /* Drawing Commands                                                     */
 /************************************************************************/
 
-#define NUM_PER_LINE 6
-#define SIZE (WIDTH/NUM_PER_LINE)
+#define NUM_PER_ROW 7
+#define NUM_PER_COL 6
+#define SIZE (WIDTH/NUM_PER_ROW)
 
 static int count = 0;
 
@@ -48,7 +49,7 @@ void svgDirCallback(const char* name, const char* path, void* data)
     if (picture->load(buf) != tvg::Result::Success) return;
 
     picture->size(SIZE, SIZE);
-    picture->translate((count % NUM_PER_LINE) * SIZE, SIZE * (count / NUM_PER_LINE));
+    picture->translate((count % NUM_PER_ROW) * SIZE, (count / NUM_PER_ROW) * (HEIGHT / NUM_PER_COL));
 
     pictures.push_back(move(picture));
 
-- 
2.7.4


From 57d5f3e3bd871c3a4e69933945fffe7d797bbdff Mon Sep 17 00:00:00 2001
From: JunsuChoi <jsuya.choi@samsung.com>
Date: Mon, 25 Oct 2021 13:43:20 +0900
Subject: [PATCH 02/16] svg_loader SvgLoader: Fix gradient default value

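If isXXPercentage is true, the value is calculated accordingly, so the
defaults are stored as plain fractions (0.5, 1.0) instead of being
pre-divided by the global width/height.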
---
 src/loaders/svg/tvgSvgLoader.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/loaders/svg/tvgSvgLoader.cpp b/src/loaders/svg/tvgSvgLoader.cpp
index bc4fbd5..696d565 100644
--- a/src/loaders/svg/tvgSvgLoader.cpp
+++ b/src/loaders/svg/tvgSvgLoader.cpp
@@ -2069,11 +2069,11 @@ static SvgStyleGradient* _createRadialGradient(SvgLoaderData* loader, const char
     /**
     * Default values of gradient transformed into global percentage
     */
-    grad->radial->cx = 0.5f / loader->svgParse->global.w;
-    grad->radial->cy = 0.5f / loader->svgParse->global.h;
-    grad->radial->fx = 0.5f / loader->svgParse->global.w;
-    grad->radial->fy = 0.5f / loader->svgParse->global.h;
-    grad->radial->r = 0.5f / (sqrtf(pow(loader->svgParse->global.h, 2) + pow(loader->svgParse->global.w, 2)) / sqrtf(2.0f));
+    grad->radial->cx = 0.5f;
+    grad->radial->cy = 0.5f;
+    grad->radial->fx = 0.5f;
+    grad->radial->fy = 0.5f;
+    grad->radial->r = 0.5f;
     grad->radial->isCxPercentage = true;
     grad->radial->isCyPercentage = true;
     grad->radial->isFxPercentage = true;
@@ -2258,7 +2258,7 @@ static SvgStyleGradient* _createLinearGradient(SvgLoaderData* loader, const char
     /**
     * Default value of x2 is 100% - transformed to the global percentage
     */
-    grad->linear->x2 = 1.0f / loader->svgParse->global.w;
+    grad->linear->x2 = 1.0f;
     grad->linear->isX2Percentage = true;
 
     simpleXmlParseAttributes(buf, bufLength, _attrParseLinearGradientNode, loader);
-- 
2.7.4


From aec59b9a38a29ae3c20811c7010218afd2c1dd08 Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Sat, 18 Sep 2021 00:33:32 +0200
Subject: [PATCH 03/16] sw_engine: proper linear gradient transformation

In the case of any shape transformation, the gradient fill
was not correct.
---
 src/lib/sw_engine/tvgSwCommon.h   |  1 +
 src/lib/sw_engine/tvgSwFill.cpp   | 25 +++++++++++++++----------
 src/lib/sw_engine/tvgSwMath.cpp   | 25 +++++++++++++++++++++++++
 src/lib/sw_engine/tvgSwRaster.cpp | 27 +--------------------------
 4 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/src/lib/sw_engine/tvgSwCommon.h b/src/lib/sw_engine/tvgSwCommon.h
index 57e3d16..76a7ac9 100644
--- a/src/lib/sw_engine/tvgSwCommon.h
+++ b/src/lib/sw_engine/tvgSwCommon.h
@@ -299,6 +299,7 @@ bool mathSmallCubic(const SwPoint* base, SwFixed& angleIn, SwFixed& angleMid, Sw
 SwFixed mathMean(SwFixed angle1, SwFixed angle2);
 SwPoint mathTransform(const Point* to, const Matrix* transform);
 bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion);
+bool mathInverse(const Matrix* m, Matrix* invM);
 
 void shapeReset(SwShape* shape);
 bool shapePrepare(SwShape* shape, const Shape* sdata, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid);
diff --git a/src/lib/sw_engine/tvgSwFill.cpp b/src/lib/sw_engine/tvgSwFill.cpp
index 25348f0..01a8dc5 100644
--- a/src/lib/sw_engine/tvgSwFill.cpp
+++ b/src/lib/sw_engine/tvgSwFill.cpp
@@ -106,15 +106,6 @@ bool _prepareLinear(SwFill* fill, const LinearGradient* linear, const Matrix* tr
     float x1, x2, y1, y2;
     if (linear->linear(&x1, &y1, &x2, &y2) != Result::Success) return false;
 
-    if (transform) {
-        auto t1 = x1;
-        x1 = t1 * transform->e11 + y1 * transform->e12 + transform->e13;
-        y1 = t1 * transform->e21 + y1 * transform->e22 + transform->e23;
-        auto t2 = x2;
-        x2 = t2 * transform->e11 + y2 * transform->e12 + transform->e13;
-        y2 = t2 * transform->e21 + y2 * transform->e22 + transform->e23;
-    }
-
     fill->linear.dx = x2 - x1;
     fill->linear.dy = y2 - y1;
     fill->linear.len = fill->linear.dx * fill->linear.dx + fill->linear.dy * fill->linear.dy;
@@ -123,7 +114,21 @@ bool _prepareLinear(SwFill* fill, const LinearGradient* linear, const Matrix* tr
 
     fill->linear.dx /= fill->linear.len;
     fill->linear.dy /= fill->linear.len;
-    fill->linear.offset = -fill->linear.dx * x1 -fill->linear.dy * y1;
+    fill->linear.offset = -fill->linear.dx * x1 - fill->linear.dy * y1;
+
+    if (transform) {
+        Matrix invTransform;
+        if (!mathInverse(transform, &invTransform)) return false;
+
+        fill->linear.offset += fill->linear.dx * invTransform.e13 + fill->linear.dy * invTransform.e23;
+
+        auto dx = fill->linear.dx;
+        fill->linear.dx = dx * invTransform.e11 + fill->linear.dy * invTransform.e21;
+        fill->linear.dy = dx * invTransform.e12 + fill->linear.dy * invTransform.e22;
+
+        fill->linear.len = fill->linear.dx * fill->linear.dx + fill->linear.dy * fill->linear.dy;
+        if (fill->linear.len < FLT_EPSILON) return true;
+    }
 
     return true;
 }
diff --git a/src/lib/sw_engine/tvgSwMath.cpp b/src/lib/sw_engine/tvgSwMath.cpp
index b6247aa..b5e3c6b 100644
--- a/src/lib/sw_engine/tvgSwMath.cpp
+++ b/src/lib/sw_engine/tvgSwMath.cpp
@@ -20,6 +20,7 @@
  * SOFTWARE.
  */
 #include <math.h>
+#include <float.h>
 #include "tvgSwCommon.h"
 
 
@@ -485,3 +486,27 @@ bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, S
 
     return true;
 }
+
+
+bool mathInverse(const Matrix* m, Matrix* invM)
+{
+    auto det = m->e11 * (m->e22 * m->e33 - m->e32 * m->e23) -
+               m->e12 * (m->e21 * m->e33 - m->e23 * m->e31) +
+               m->e13 * (m->e21 * m->e32 - m->e22 * m->e31);
+
+    if (fabsf(det) < FLT_EPSILON) return false;
+
+    auto invDet = 1 / det;
+
+    invM->e11 = (m->e22 * m->e33 - m->e32 * m->e23) * invDet;
+    invM->e12 = (m->e13 * m->e32 - m->e12 * m->e33) * invDet;
+    invM->e13 = (m->e12 * m->e23 - m->e13 * m->e22) * invDet;
+    invM->e21 = (m->e23 * m->e31 - m->e21 * m->e33) * invDet;
+    invM->e22 = (m->e11 * m->e33 - m->e13 * m->e31) * invDet;
+    invM->e23 = (m->e21 * m->e13 - m->e11 * m->e23) * invDet;
+    invM->e31 = (m->e21 * m->e32 - m->e31 * m->e22) * invDet;
+    invM->e32 = (m->e31 * m->e12 - m->e11 * m->e32) * invDet;
+    invM->e33 = (m->e11 * m->e22 - m->e21 * m->e12) * invDet;
+
+    return true;
+}
diff --git a/src/lib/sw_engine/tvgSwRaster.cpp b/src/lib/sw_engine/tvgSwRaster.cpp
index e5b3681..9dcbc79 100644
--- a/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/src/lib/sw_engine/tvgSwRaster.cpp
@@ -49,31 +49,6 @@ static uint32_t _argbJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
 }
 
 
-static bool _inverse(const Matrix* transform, Matrix* invM)
-{
-    //computes the inverse of a matrix m
-    auto det = transform->e11 * (transform->e22 * transform->e33 - transform->e32 * transform->e23) -
-               transform->e12 * (transform->e21 * transform->e33 - transform->e23 * transform->e31) +
-               transform->e13 * (transform->e21 * transform->e32 - transform->e22 * transform->e31);
-
-    if (fabsf(det) < FLT_EPSILON) return false;
-
-    auto invDet = 1 / det;
-
-    invM->e11 = (transform->e22 * transform->e33 - transform->e32 * transform->e23) * invDet;
-    invM->e12 = (transform->e13 * transform->e32 - transform->e12 * transform->e33) * invDet;
-    invM->e13 = (transform->e12 * transform->e23 - transform->e13 * transform->e22) * invDet;
-    invM->e21 = (transform->e23 * transform->e31 - transform->e21 * transform->e33) * invDet;
-    invM->e22 = (transform->e11 * transform->e33 - transform->e13 * transform->e31) * invDet;
-    invM->e23 = (transform->e21 * transform->e13 - transform->e11 * transform->e23) * invDet;
-    invM->e31 = (transform->e21 * transform->e32 - transform->e31 * transform->e22) * invDet;
-    invM->e32 = (transform->e31 * transform->e12 - transform->e11 * transform->e32) * invDet;
-    invM->e33 = (transform->e11 * transform->e22 - transform->e21 * transform->e12) * invDet;
-
-    return true;
-}
-
-
 static bool _identify(const Matrix* transform)
 {
     if (transform) {
@@ -1560,7 +1535,7 @@ bool rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, co
     float scaling = 1.0f;
 
     if (transform) {
-        if (!_inverse(transform, &invTransform)) return false;
+        if (!mathInverse(transform, &invTransform)) return false;
         scaling = sqrtf((transform->e11 * transform->e11) + (transform->e21 * transform->e21));
         auto scalingY = sqrtf((transform->e22 * transform->e22) + (transform->e12 * transform->e12));
         //TODO:If the x and y axis scaling is different, a separate algorithm for each axis should be applied.
-- 
2.7.4


From d6825b4a2c81eb6f9fcd4c533da8c51c8a5f5dac Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Sat, 23 Oct 2021 01:50:18 +0200
Subject: [PATCH 04/16] sw_engine: including gradient transformation into
 gradient preparation

The gradient transformation has to be taken into account as the last
step of the gradient preparation.
---
 src/lib/sw_engine/tvgSwCommon.h   |  2 ++
 src/lib/sw_engine/tvgSwFill.cpp   | 14 ++++++++++++--
 src/lib/sw_engine/tvgSwMath.cpp   | 36 ++++++++++++++++++++++++++++++++++++
 src/lib/sw_engine/tvgSwRaster.cpp | 18 ++----------------
 4 files changed, 52 insertions(+), 18 deletions(-)

diff --git a/src/lib/sw_engine/tvgSwCommon.h b/src/lib/sw_engine/tvgSwCommon.h
index 76a7ac9..42bb91e 100644
--- a/src/lib/sw_engine/tvgSwCommon.h
+++ b/src/lib/sw_engine/tvgSwCommon.h
@@ -300,6 +300,8 @@ SwFixed mathMean(SwFixed angle1, SwFixed angle2);
 SwPoint mathTransform(const Point* to, const Matrix* transform);
 bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion);
 bool mathInverse(const Matrix* m, Matrix* invM);
+bool mathMultiply(const Matrix* lhs, Matrix* rhs);
+bool mathIdentity(const Matrix* m);
 
 void shapeReset(SwShape* shape);
 bool shapePrepare(SwShape* shape, const Shape* sdata, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid);
diff --git a/src/lib/sw_engine/tvgSwFill.cpp b/src/lib/sw_engine/tvgSwFill.cpp
index 01a8dc5..15f647b 100644
--- a/src/lib/sw_engine/tvgSwFill.cpp
+++ b/src/lib/sw_engine/tvgSwFill.cpp
@@ -116,9 +116,19 @@ bool _prepareLinear(SwFill* fill, const LinearGradient* linear, const Matrix* tr
     fill->linear.dy /= fill->linear.len;
     fill->linear.offset = -fill->linear.dx * x1 - fill->linear.dy * y1;
 
-    if (transform) {
+    auto gradTransform = linear->transform();
+    bool isTransformation = !mathIdentity(&gradTransform);
+
+    if (isTransformation) {
+        if (transform) mathMultiply(transform, &gradTransform);
+    } else if (transform) {
+        gradTransform = *transform;
+        isTransformation = true;
+    }
+
+    if (isTransformation) {
         Matrix invTransform;
-        if (!mathInverse(transform, &invTransform)) return false;
+        if (!mathInverse(&gradTransform, &invTransform)) return false;
 
         fill->linear.offset += fill->linear.dx * invTransform.e13 + fill->linear.dy * invTransform.e23;
 
diff --git a/src/lib/sw_engine/tvgSwMath.cpp b/src/lib/sw_engine/tvgSwMath.cpp
index b5e3c6b..1f59039 100644
--- a/src/lib/sw_engine/tvgSwMath.cpp
+++ b/src/lib/sw_engine/tvgSwMath.cpp
@@ -510,3 +510,39 @@ bool mathInverse(const Matrix* m, Matrix* invM)
 
     return true;
 }
+
+
+bool mathMultiply(const Matrix* lhs, Matrix* rhs)
+{
+    Matrix m;
+
+    m.e11 = lhs->e11 * rhs->e11 + lhs->e12 * rhs->e21 + lhs->e13 * rhs->e31;
+    m.e12 = lhs->e11 * rhs->e12 + lhs->e12 * rhs->e22 + lhs->e13 * rhs->e32;
+    m.e13 = lhs->e11 * rhs->e13 + lhs->e12 * rhs->e23 + lhs->e13 * rhs->e33;
+
+    m.e21 = lhs->e21 * rhs->e11 + lhs->e22 * rhs->e21 + lhs->e23 * rhs->e31;
+    m.e22 = lhs->e21 * rhs->e12 + lhs->e22 * rhs->e22 + lhs->e23 * rhs->e32;
+    m.e23 = lhs->e21 * rhs->e13 + lhs->e22 * rhs->e23 + lhs->e23 * rhs->e33;
+
+    m.e31 = lhs->e31 * rhs->e11 + lhs->e32 * rhs->e21 + lhs->e33 * rhs->e31;
+    m.e32 = lhs->e31 * rhs->e12 + lhs->e32 * rhs->e22 + lhs->e33 * rhs->e32;
+    m.e33 = lhs->e31 * rhs->e13 + lhs->e32 * rhs->e23 + lhs->e33 * rhs->e33;
+
+    *rhs = m;
+
+    return true;
+}
+
+
+bool mathIdentity(const Matrix* m)
+{
+    if (m) {
+        if (m->e11 != 1.0f || m->e12 != 0.0f || m->e13 != 0.0f ||
+            m->e21 != 0.0f || m->e22 != 1.0f || m->e23 != 0.0f ||
+            m->e31 != 0.0f || m->e32 != 0.0f || m->e33 != 1.0f) {
+            return false;
+        }
+    }
+
+    return true;
+}
diff --git a/src/lib/sw_engine/tvgSwRaster.cpp b/src/lib/sw_engine/tvgSwRaster.cpp
index 9dcbc79..35d0ba4 100644
--- a/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/src/lib/sw_engine/tvgSwRaster.cpp
@@ -49,20 +49,6 @@ static uint32_t _argbJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
 }
 
 
-static bool _identify(const Matrix* transform)
-{
-    if (transform) {
-        if (transform->e11 != 1.0f || transform->e12 != 0.0f || transform->e13 != 0.0f ||
-            transform->e21 != 0.0f || transform->e22 != 1.0f || transform->e23 != 0.0f ||
-            transform->e31 != 0.0f || transform->e32 != 0.0f || transform->e33 != 1.0f) {
-            return false;
-        }
-    }
-
-    return true;
-}
-
-
 static bool _translucent(const SwSurface* surface, uint8_t a)
 {
     if (a < 255) return true;
@@ -1548,7 +1534,7 @@ bool rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, co
 
     if (image->rle) {
         //Fast track
-        if (_identify(transform)) {
+        if (mathIdentity(transform)) {
             //OPTIMIZE ME: Support non transformed image. Only shifted image can use these routines.
             if (translucent) return _rasterTranslucentImageRle(surface, image->rle, image->data, image->w, image->h, opacity);
             return _rasterImageRle(surface, image->rle, image->data, image->w, image->h);
@@ -1565,7 +1551,7 @@ bool rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, co
     }
     else {
         //Fast track
-        if (_identify(transform)) {
+        if (mathIdentity(transform)) {
             //OPTIMIZE ME: Support non transformed image. Only shifted image can use these routines.
             if (translucent) return _rasterTranslucentImage(surface, image->data, image->w, image->h, opacity, bbox);
             return _rasterImage(surface, image->data, image->w, image->h, bbox);
-- 
2.7.4


From c9d5cf7cc237f72c7fb4d67466c2afb14fd80130 Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Fri, 22 Oct 2021 13:46:06 +0200
Subject: [PATCH 05/16] capi: gradient transformation apis added

---
 src/bindings/capi/thorvg_capi.h | 31 +++++++++++++++++++++++++++++++
 src/bindings/capi/tvgCapi.cpp   | 16 ++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/src/bindings/capi/thorvg_capi.h b/src/bindings/capi/thorvg_capi.h
index b675865..23c3329 100644
--- a/src/bindings/capi/thorvg_capi.h
+++ b/src/bindings/capi/thorvg_capi.h
@@ -1694,6 +1694,37 @@ TVG_EXPORT Tvg_Result tvg_gradient_get_spread(Tvg_Gradient* grad, Tvg_Stroke_Fil
 
 
 /*!
+* \brief Sets the matrix of the affine transformation for the gradient object. (BETA_API)
+*
+* The augmented matrix of the transformation is expected to be given.
+*
+* \param[in] grad The Tvg_Gradient object to be transformed.
+* \param[in] m The 3x3 augmented matrix.
+*
+* \return Tvg_Result enumeration.
+* \retval TVG_RESULT_SUCCESS Succeed.
+* \retval TVG_RESULT_INVALID_ARGUMENT A @c nullptr is passed as the argument.
+* \retval TVG_RESULT_FAILED_ALLOCATION An internal error with a memory allocation.
+*/
+TVG_EXPORT Tvg_Result tvg_gradient_set_transform(Tvg_Gradient* grad, const Tvg_Matrix* m);
+
+
+/*!
+* \brief Gets the matrix of the affine transformation of the gradient object. (BETA_API)
+*
+* In case no transformation was applied, the identity matrix is set.
+*
+* \param[in] grad The Tvg_Gradient object of which to get the transformation matrix.
+* \param[out] m The 3x3 augmented matrix.
+*
+* \return Tvg_Result enumeration.
+* \retval TVG_RESULT_SUCCESS Succeed.
+* \retval TVG_RESULT_INVALID_ARGUMENT A @c nullptr is passed as the argument.
+*/
+TVG_EXPORT Tvg_Result tvg_gradient_get_transform(const Tvg_Gradient* grad, Tvg_Matrix* m);
+
+
+/*!
 * \brief Duplicates the given Tvg_Gradient object.
 *
 * Creates a new object and sets its all properties as in the original object.
diff --git a/src/bindings/capi/tvgCapi.cpp b/src/bindings/capi/tvgCapi.cpp
index 245d47c..ebcbd01 100644
--- a/src/bindings/capi/tvgCapi.cpp
+++ b/src/bindings/capi/tvgCapi.cpp
@@ -170,6 +170,7 @@ TVG_EXPORT Tvg_Result tvg_paint_transform(Tvg_Paint* paint, const Tvg_Matrix* m)
     return (Tvg_Result) reinterpret_cast<Paint*>(paint)->transform(*(reinterpret_cast<const Matrix*>(m)));
 }
 
+
 TVG_EXPORT Tvg_Result tvg_paint_get_transform(Tvg_Paint* paint, Tvg_Matrix* m)
 {
     if (!paint || !m) return TVG_RESULT_INVALID_ARGUMENT;
@@ -597,6 +598,21 @@ TVG_EXPORT Tvg_Result tvg_gradient_get_spread(Tvg_Gradient* grad, Tvg_Stroke_Fil
     return TVG_RESULT_SUCCESS;
 }
 
+
+TVG_EXPORT Tvg_Result tvg_gradient_set_transform(Tvg_Gradient* grad, const Tvg_Matrix* m)
+{
+    if (!grad || !m) return TVG_RESULT_INVALID_ARGUMENT;
+    return (Tvg_Result) reinterpret_cast<Fill*>(grad)->transform(*(reinterpret_cast<const Matrix*>(m)));
+}
+
+
+TVG_EXPORT Tvg_Result tvg_gradient_get_transform(const Tvg_Gradient* grad, Tvg_Matrix* m)
+{
+    if (!grad || !m) return TVG_RESULT_INVALID_ARGUMENT;
+    *reinterpret_cast<Matrix*>(m) = reinterpret_cast<Fill*>(const_cast<Tvg_Gradient*>(grad))->transform();
+    return TVG_RESULT_SUCCESS;
+}
+
 /************************************************************************/
 /* Scene API                                                            */
 /************************************************************************/
-- 
2.7.4


From 0da8bbe51e8b0bc0940db8f6dbd082633d53e97c Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Fri, 22 Oct 2021 20:28:28 +0200
Subject: [PATCH 06/16] tests: added Capis gradient transformation tests

---
 test/capi/capiLinearGradient.cpp | 41 ++++++++++++++++++++++++++++++++++++++++
 test/capi/capiRadialGradient.cpp | 41 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/test/capi/capiLinearGradient.cpp b/test/capi/capiLinearGradient.cpp
index adfc959..2a8c5d2 100644
--- a/test/capi/capiLinearGradient.cpp
+++ b/test/capi/capiLinearGradient.cpp
@@ -182,6 +182,47 @@ TEST_CASE("Linear Gradient spread", "[capiLinearGradient]")
     REQUIRE(tvg_gradient_del(NULL) == TVG_RESULT_INVALID_ARGUMENT);
 }
 
+TEST_CASE("Linear Gradient transformation", "[capiLinearGradient]")
+{
+    Tvg_Gradient *gradient = tvg_linear_gradient_new();
+    REQUIRE(gradient);
+
+    Tvg_Matrix matrix_get;
+
+    REQUIRE(tvg_gradient_get_transform(NULL, &matrix_get) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_get_transform(gradient, NULL) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_get_transform(gradient, &matrix_get) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(matrix_get.e11 == Approx(1.0f).margin(0.000001));
+    REQUIRE(matrix_get.e12 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e13 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e21 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e22 == Approx(1.0f).margin(0.000001));
+    REQUIRE(matrix_get.e23 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e31 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e32 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e33 == Approx(1.0f).margin(0.000001));
+
+    Tvg_Matrix matrix_set = {1.1f, -2.2f, 3.3f, -4.4f, 5.5f, -6.6f, 7.7f, -8.8f, 9.9f};
+    REQUIRE(tvg_gradient_set_transform(NULL, &matrix_set) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_set_transform(gradient, NULL) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_set_transform(gradient, &matrix_set) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(tvg_gradient_get_transform(gradient, &matrix_get) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(matrix_get.e11 == Approx(matrix_set.e11).margin(0.000001));
+    REQUIRE(matrix_get.e12 == Approx(matrix_set.e12).margin(0.000001));
+    REQUIRE(matrix_get.e13 == Approx(matrix_set.e13).margin(0.000001));
+    REQUIRE(matrix_get.e21 == Approx(matrix_set.e21).margin(0.000001));
+    REQUIRE(matrix_get.e22 == Approx(matrix_set.e22).margin(0.000001));
+    REQUIRE(matrix_get.e23 == Approx(matrix_set.e23).margin(0.000001));
+    REQUIRE(matrix_get.e31 == Approx(matrix_set.e31).margin(0.000001));
+    REQUIRE(matrix_get.e32 == Approx(matrix_set.e32).margin(0.000001));
+    REQUIRE(matrix_get.e33 == Approx(matrix_set.e33).margin(0.000001));
+
+    REQUIRE(tvg_gradient_del(gradient) == TVG_RESULT_SUCCESS);
+}
+
 TEST_CASE("Stroke Linear Gradient", "[capiLinearGradient]")
 {
     Tvg_Paint *shape = tvg_shape_new();
diff --git a/test/capi/capiRadialGradient.cpp b/test/capi/capiRadialGradient.cpp
index 09bcefd..14c59b7 100644
--- a/test/capi/capiRadialGradient.cpp
+++ b/test/capi/capiRadialGradient.cpp
@@ -138,6 +138,47 @@ TEST_CASE("Set/Get gradient spread", "[capiRadialGradient]")
     REQUIRE(tvg_gradient_del(NULL) == TVG_RESULT_INVALID_ARGUMENT);
 }
 
+TEST_CASE("Radial Gradient transformation", "[capiRadialGradient]")
+{
+    Tvg_Gradient *gradient = tvg_radial_gradient_new();
+    REQUIRE(gradient);
+
+    Tvg_Matrix matrix_get;
+
+    REQUIRE(tvg_gradient_get_transform(NULL, &matrix_get) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_get_transform(gradient, NULL) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_get_transform(gradient, &matrix_get) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(matrix_get.e11 == Approx(1.0f).margin(0.000001));
+    REQUIRE(matrix_get.e12 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e13 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e21 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e22 == Approx(1.0f).margin(0.000001));
+    REQUIRE(matrix_get.e23 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e31 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e32 == Approx(0.0f).margin(0.000001));
+    REQUIRE(matrix_get.e33 == Approx(1.0f).margin(0.000001));
+
+    Tvg_Matrix matrix_set = {1.1f, -2.2f, 3.3f, -4.4f, 5.5f, -6.6f, 7.7f, -8.8f, 9.9f};
+    REQUIRE(tvg_gradient_set_transform(NULL, &matrix_set) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_set_transform(gradient, NULL) == TVG_RESULT_INVALID_ARGUMENT);
+    REQUIRE(tvg_gradient_set_transform(gradient, &matrix_set) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(tvg_gradient_get_transform(gradient, &matrix_get) == TVG_RESULT_SUCCESS);
+
+    REQUIRE(matrix_get.e11 == Approx(matrix_set.e11).margin(0.000001));
+    REQUIRE(matrix_get.e12 == Approx(matrix_set.e12).margin(0.000001));
+    REQUIRE(matrix_get.e13 == Approx(matrix_set.e13).margin(0.000001));
+    REQUIRE(matrix_get.e21 == Approx(matrix_set.e21).margin(0.000001));
+    REQUIRE(matrix_get.e22 == Approx(matrix_set.e22).margin(0.000001));
+    REQUIRE(matrix_get.e23 == Approx(matrix_set.e23).margin(0.000001));
+    REQUIRE(matrix_get.e31 == Approx(matrix_set.e31).margin(0.000001));
+    REQUIRE(matrix_get.e32 == Approx(matrix_set.e32).margin(0.000001));
+    REQUIRE(matrix_get.e33 == Approx(matrix_set.e33).margin(0.000001));
+
+    REQUIRE(tvg_gradient_del(gradient) == TVG_RESULT_SUCCESS);
+}
+
 TEST_CASE("Stroke Radial Gradient", "[capiRadialGradient]")
 {
     Tvg_Paint *shape = tvg_shape_new();
-- 
2.7.4


From de6b1fcb77d6a60f9836870280ff360694fc821b Mon Sep 17 00:00:00 2001
From: Hermet Park <chuneon.park@samsung.com>
Date: Mon, 25 Oct 2021 19:24:45 +0900
Subject: [PATCH 07/16] capi: ++missing const in capis.

Non-mutating APIs should take const parameters so that
the user knows the API won't change the internal data.

Thankfully, we haven't released the capis yet, so we can still change them.
---
 src/bindings/capi/thorvg_capi.h |  6 +++---
 src/bindings/capi/tvgCapi.cpp   | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/bindings/capi/thorvg_capi.h b/src/bindings/capi/thorvg_capi.h
index 23c3329..b6c7ab4 100644
--- a/src/bindings/capi/thorvg_capi.h
+++ b/src/bindings/capi/thorvg_capi.h
@@ -823,7 +823,7 @@ TVG_EXPORT Tvg_Result tvg_paint_set_opacity(Tvg_Paint* paint, uint8_t opacity);
 * \retval TVG_RESULT_SUCCESS Succeed.
 * \retval TVG_RESULT_INVALID_ARGUMENT In case a @c nullptr is passed as the argument.
 */
-TVG_EXPORT Tvg_Result tvg_paint_get_opacity(Tvg_Paint* paint, uint8_t* opacity);
+TVG_EXPORT Tvg_Result tvg_paint_get_opacity(const Tvg_Paint* paint, uint8_t* opacity);
 
 
 /*!
@@ -1664,7 +1664,7 @@ TVG_EXPORT Tvg_Result tvg_gradient_set_color_stops(Tvg_Gradient* grad, const Tvg
 * \retval TVG_RESULT_SUCCESS Succeed.
 * \retval TVG_RESULT_INVALID_ARGUMENT A @c nullptr passed as the argument.
 */
-TVG_EXPORT Tvg_Result tvg_gradient_get_color_stops(Tvg_Gradient* grad, const Tvg_Color_Stop** color_stop, uint32_t* cnt);
+TVG_EXPORT Tvg_Result tvg_gradient_get_color_stops(const Tvg_Gradient* grad, const Tvg_Color_Stop** color_stop, uint32_t* cnt);
 
 
 /*!
@@ -1690,7 +1690,7 @@ TVG_EXPORT Tvg_Result tvg_gradient_set_spread(Tvg_Gradient* grad, const Tvg_Stro
 * \retval TVG_RESULT_SUCCESS Succeed.
 * \retval TVG_RESULT_INVALID_ARGUMENT A @c nullptr passed as the argument.
 */
-TVG_EXPORT Tvg_Result tvg_gradient_get_spread(Tvg_Gradient* grad, Tvg_Stroke_Fill* spread);
+TVG_EXPORT Tvg_Result tvg_gradient_get_spread(const Tvg_Gradient* grad, Tvg_Stroke_Fill* spread);
 
 
 /*!
diff --git a/src/bindings/capi/tvgCapi.cpp b/src/bindings/capi/tvgCapi.cpp
index ebcbd01..d6dde93 100644
--- a/src/bindings/capi/tvgCapi.cpp
+++ b/src/bindings/capi/tvgCapi.cpp
@@ -193,10 +193,10 @@ TVG_EXPORT Tvg_Result tvg_paint_set_opacity(Tvg_Paint* paint, uint8_t opacity)
 }
 
 
-TVG_EXPORT Tvg_Result tvg_paint_get_opacity(Tvg_Paint* paint, uint8_t* opacity)
+TVG_EXPORT Tvg_Result tvg_paint_get_opacity(const Tvg_Paint* paint, uint8_t* opacity)
 {
     if (!paint || !opacity)  return TVG_RESULT_INVALID_ARGUMENT;
-    *opacity = reinterpret_cast<Paint*>(paint)->opacity();
+    *opacity = reinterpret_cast<const Paint*>(paint)->opacity();
     return TVG_RESULT_SUCCESS;
 }
 
@@ -576,10 +576,10 @@ TVG_EXPORT Tvg_Result tvg_gradient_set_color_stops(Tvg_Gradient* grad, const Tvg
 }
 
 
-TVG_EXPORT Tvg_Result tvg_gradient_get_color_stops(Tvg_Gradient* grad, const Tvg_Color_Stop** color_stop, uint32_t* cnt)
+TVG_EXPORT Tvg_Result tvg_gradient_get_color_stops(const Tvg_Gradient* grad, const Tvg_Color_Stop** color_stop, uint32_t* cnt)
 {
     if (!grad || !color_stop || !cnt) return TVG_RESULT_INVALID_ARGUMENT;
-    *cnt = reinterpret_cast<Fill*>(grad)->colorStops(reinterpret_cast<const Fill::ColorStop**>(color_stop));
+    *cnt = reinterpret_cast<const Fill*>(grad)->colorStops(reinterpret_cast<const Fill::ColorStop**>(color_stop));
     return TVG_RESULT_SUCCESS;
 }
 
@@ -591,10 +591,10 @@ TVG_EXPORT Tvg_Result tvg_gradient_set_spread(Tvg_Gradient* grad, const Tvg_Stro
 }
 
 
-TVG_EXPORT Tvg_Result tvg_gradient_get_spread(Tvg_Gradient* grad, Tvg_Stroke_Fill* spread)
+TVG_EXPORT Tvg_Result tvg_gradient_get_spread(const Tvg_Gradient* grad, Tvg_Stroke_Fill* spread)
 {
     if (!grad || !spread) return TVG_RESULT_INVALID_ARGUMENT;
-    *spread = (Tvg_Stroke_Fill) reinterpret_cast<Fill*>(grad)->spread();
+    *spread = (Tvg_Stroke_Fill) reinterpret_cast<const Fill*>(grad)->spread();
     return TVG_RESULT_SUCCESS;
 }
 
-- 
2.7.4


From 69cae0234126851d6ae6607caf62fe980417a55c Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Tue, 26 Oct 2021 02:21:07 +0200
Subject: [PATCH 08/16] example: modify the tvg example to plot all of the
 files

Change-Id: I63908b03d0276aa78bc8a266c62a89c98f9634c1
---
 src/examples/Tvg.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/examples/Tvg.cpp b/src/examples/Tvg.cpp
index 1f57911..1cde659 100644
--- a/src/examples/Tvg.cpp
+++ b/src/examples/Tvg.cpp
@@ -27,8 +27,9 @@
 /* Drawing Commands                                                     */
 /************************************************************************/
 
-#define NUM_PER_LINE 6
-#define SIZE (WIDTH/NUM_PER_LINE)
+#define NUM_PER_ROW 7
+#define NUM_PER_COL 6
+#define SIZE (WIDTH/NUM_PER_ROW)
 
 static int count = 0;
 
@@ -48,7 +49,7 @@ void tvgDirCallback(const char* name, const char* path, void* data)
     if (picture->load(buf) != tvg::Result::Success) return;
 
     picture->size(SIZE, SIZE);
-    picture->translate((count % NUM_PER_LINE) * SIZE, SIZE * (count / NUM_PER_LINE));
+    picture->translate((count % NUM_PER_ROW) * SIZE, (count / NUM_PER_ROW) * (HEIGHT / NUM_PER_COL));
 
     pictures.push_back(move(picture));
 
-- 
2.7.4


From 768a3dd930a9a89c0bf72574b62c1be812015a18 Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Tue, 26 Oct 2021 02:02:46 +0200
Subject: [PATCH 09/16] capi: casting on 'const Paint*' instead of constness
 removing

---
 src/bindings/capi/tvgCapi.cpp | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/src/bindings/capi/tvgCapi.cpp b/src/bindings/capi/tvgCapi.cpp
index d6dde93..860472a 100644
--- a/src/bindings/capi/tvgCapi.cpp
+++ b/src/bindings/capi/tvgCapi.cpp
@@ -27,8 +27,6 @@
 using namespace std;
 using namespace tvg;
 
-#define CCP(A) const_cast<Tvg_Paint*>(A)  //Const-Cast-Paint
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -322,7 +320,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_width(Tvg_Paint* paint, float width)
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_width(const Tvg_Paint* paint, float* width)
 {
     if (!paint || !width) return TVG_RESULT_INVALID_ARGUMENT;
-    *width = reinterpret_cast<Shape*>(CCP(paint))->strokeWidth();
+    *width = reinterpret_cast<const Shape*>(paint)->strokeWidth();
     return TVG_RESULT_SUCCESS;
 }
 
@@ -337,7 +335,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_color(Tvg_Paint* paint, uint8_t r, ui
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_color(const Tvg_Paint* paint, uint8_t* r, uint8_t* g, uint8_t* b, uint8_t* a)
 {
     if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-    return (Tvg_Result) reinterpret_cast<Shape*>(CCP(paint))->strokeColor(r, g, b, a);
+    return (Tvg_Result) reinterpret_cast<const Shape*>(paint)->strokeColor(r, g, b, a);
 }
 
 
@@ -358,7 +356,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_radial_gradient(Tvg_Paint* paint, Tvg
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_gradient(const Tvg_Paint* paint, Tvg_Gradient** gradient)
 {
    if (!paint || !gradient) return TVG_RESULT_INVALID_ARGUMENT;
-   *gradient = (Tvg_Gradient*)(reinterpret_cast<Shape*>(CCP(paint))->strokeFill());
+   *gradient = (Tvg_Gradient*)(reinterpret_cast<const Shape*>(paint)->strokeFill());
    return TVG_RESULT_SUCCESS;
 }
 
@@ -373,7 +371,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_dash(Tvg_Paint* paint, const float* d
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_dash(const Tvg_Paint* paint, const float** dashPattern, uint32_t* cnt)
 {
     if (!paint || !cnt || !dashPattern) return TVG_RESULT_INVALID_ARGUMENT;
-    *cnt = reinterpret_cast<Shape*>(CCP(paint))->strokeDash(dashPattern);
+    *cnt = reinterpret_cast<const Shape*>(paint)->strokeDash(dashPattern);
     return TVG_RESULT_SUCCESS;
 }
 
@@ -388,7 +386,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_cap(Tvg_Paint* paint, Tvg_Stroke_Cap
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_cap(const Tvg_Paint* paint, Tvg_Stroke_Cap* cap)
 {
     if (!paint || !cap) return TVG_RESULT_INVALID_ARGUMENT;
-    *cap = (Tvg_Stroke_Cap) reinterpret_cast<Shape*>(CCP(paint))->strokeCap();
+    *cap = (Tvg_Stroke_Cap) reinterpret_cast<const Shape*>(paint)->strokeCap();
     return TVG_RESULT_SUCCESS;
 }
 
@@ -403,7 +401,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_stroke_join(Tvg_Paint* paint, Tvg_Stroke_Joi
 TVG_EXPORT Tvg_Result tvg_shape_get_stroke_join(const Tvg_Paint* paint, Tvg_Stroke_Join* join)
 {
     if (!paint || !join) return TVG_RESULT_INVALID_ARGUMENT;
-    *join = (Tvg_Stroke_Join) reinterpret_cast<Shape*>(CCP(paint))->strokeJoin();
+    *join = (Tvg_Stroke_Join) reinterpret_cast<const Shape*>(paint)->strokeJoin();
     return TVG_RESULT_SUCCESS;
 }
 
@@ -418,7 +416,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_fill_color(Tvg_Paint* paint, uint8_t r, uint
 TVG_EXPORT Tvg_Result tvg_shape_get_fill_color(const Tvg_Paint* paint, uint8_t* r, uint8_t* g, uint8_t* b, uint8_t* a)
 {
     if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-    return (Tvg_Result) reinterpret_cast<Shape*>(CCP(paint))->fillColor(r, g, b, a);
+    return (Tvg_Result) reinterpret_cast<const Shape*>(paint)->fillColor(r, g, b, a);
 }
 
 
@@ -432,7 +430,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_fill_rule(Tvg_Paint* paint, Tvg_Fill_Rule ru
 TVG_EXPORT Tvg_Result tvg_shape_get_fill_rule(const Tvg_Paint* paint, Tvg_Fill_Rule* rule)
 {
     if (!paint || !rule) return TVG_RESULT_INVALID_ARGUMENT;
-    *rule = (Tvg_Fill_Rule) reinterpret_cast<Shape*>(CCP(paint))->fillRule();
+    *rule = (Tvg_Fill_Rule) reinterpret_cast<const Shape*>(paint)->fillRule();
     return TVG_RESULT_SUCCESS;
 }
 
@@ -454,7 +452,7 @@ TVG_EXPORT Tvg_Result tvg_shape_set_radial_gradient(Tvg_Paint* paint, Tvg_Gradie
 TVG_EXPORT Tvg_Result tvg_shape_get_gradient(const Tvg_Paint* paint, Tvg_Gradient** gradient)
 {
    if (!paint || !gradient) return TVG_RESULT_INVALID_ARGUMENT;
-   *gradient = (Tvg_Gradient*)(reinterpret_cast<Shape*>(CCP(paint))->fill());
+   *gradient = (Tvg_Gradient*)(reinterpret_cast<const Shape*>(paint)->fill());
    return TVG_RESULT_SUCCESS;
 }
 
@@ -492,21 +490,21 @@ TVG_EXPORT Tvg_Result tvg_picture_load_data(Tvg_Paint* paint, const char *data,
 TVG_EXPORT Tvg_Result tvg_picture_set_size(Tvg_Paint* paint, float w, float h)
 {
     if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-    return (Tvg_Result) reinterpret_cast<Picture*>(CCP(paint))->size(w, h);
+    return (Tvg_Result) reinterpret_cast<Picture*>(paint)->size(w, h);
 }
 
 
 TVG_EXPORT Tvg_Result tvg_picture_get_size(const Tvg_Paint* paint, float* w, float* h)
 {
     if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-    return (Tvg_Result) reinterpret_cast<Picture*>(CCP(paint))->size(w, h);
+    return (Tvg_Result) reinterpret_cast<const Picture*>(paint)->size(w, h);
 }
 
 
 TVG_EXPORT Tvg_Result tvg_picture_get_viewbox(const Tvg_Paint* paint, float* x, float* y, float* w, float* h)
 {
     if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-    return (Tvg_Result) reinterpret_cast<Picture*>(CCP(paint))->viewbox(x, y, w, h);
+    return (Tvg_Result) reinterpret_cast<const Picture*>(paint)->viewbox(x, y, w, h);
 }
 
 
-- 
2.7.4


From cb8583a76b8e08a0454e46833bd5872c6896bb43 Mon Sep 17 00:00:00 2001
From: Michal Maciola <m.maciola@samsung.com>
Date: Fri, 22 Oct 2021 17:01:14 +0200
Subject: [PATCH 10/16] jpg loader: support static jpeg loader using richgel999

Adds an alternative static jpg loader.

The jpg loader copies the JPEG decoding implementation from this open-source
repo: https://github.com/richgel999/jpeg-compressor
That project is released into the public domain, so there are no restrictions
on copying it.

Note: jpgd.cpp is a modified version (decompress_jpeg_image_from_stream was
changed) so that it returns BGRA.
---
 src/loaders/jpg/jpgd.cpp         | 3185 ++++++++++++++++++++++++++++++++++++++
 src/loaders/jpg/jpgd.h           |  319 ++++
 src/loaders/jpg/meson.build      |    2 +
 src/loaders/jpg/tvgJpgLoader.cpp |   43 +-
 src/loaders/jpg/tvgJpgLoader.h   |    4 +-
 5 files changed, 3530 insertions(+), 23 deletions(-)
 create mode 100644 src/loaders/jpg/jpgd.cpp
 create mode 100644 src/loaders/jpg/jpgd.h

diff --git a/src/loaders/jpg/jpgd.cpp b/src/loaders/jpg/jpgd.cpp
new file mode 100644
index 0000000..bd40cd3
--- /dev/null
+++ b/src/loaders/jpg/jpgd.cpp
@@ -0,0 +1,3185 @@
+// jpgd.cpp - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+// Alex Evans: Linear memory allocator (taken from jpge.h).
+// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings (all looked harmless)
+//
+// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
+//
+// Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
+// Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
+// http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
+
+#include "jpgd.h"
+#include <string.h>
+
+#include <assert.h>
+#define JPGD_ASSERT(x) assert(x)
+
+#ifdef _MSC_VER
+#pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
+#endif
+
+// Set to 1 to enable freq. domain chroma upsampling on images using H2V2 subsampling (0=faster nearest neighbor sampling).
+// This is slower, but results in higher quality on images with highly saturated colors.
+#define JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING 1
+
+#define JPGD_TRUE (1)
+#define JPGD_FALSE (0)
+
+#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b))
+#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b))
+
+namespace jpgd {
+
+static inline void *jpgd_malloc(size_t nSize) { return malloc(nSize); }
+static inline void jpgd_free(void *p) { free(p); }
+
+// DCT coefficients are stored in this sequence.
+static int g_ZAG[64] = {  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+
+enum JPEG_MARKER
+{
+  M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8,
+  M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC,
+  M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7,
+  M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF,
+  M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0
+};
+
+enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define SCALEDONE ((int32)1)
+
+#define FIX_0_298631336  ((int32)2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((int32)3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((int32)4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((int32)6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((int32)7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((int32)9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((int32)12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((int32)15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((int32)16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((int32)16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((int32)20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((int32)25172)       /* FIX(3.072711026) */
+
+#define DESCALE(x,n)  (((x) + (SCALEDONE << ((n)-1))) >> (n))
+#define DESCALE_ZEROSHIFT(x,n)  (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n))
+
+#define MULTIPLY(var, cnst)  ((var) * (cnst))
+
+#define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i))
+
+// Compiler creates a fast path 1D IDCT for NONZERO_COLS non-zero columns
+template <int NONZERO_COLS>
+struct Row
+{
+  static void idct(int* pTemp, const jpgd_block_t* pSrc)
+  {
+    // ACCESS_COL() will be optimized at compile time to either an array access, or 0.
+    #define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
+
+    const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
+
+    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+    const int tmp0 = (ACCESS_COL(0) + ACCESS_COL(4)) << CONST_BITS;
+    const int tmp1 = (ACCESS_COL(0) - ACCESS_COL(4)) << CONST_BITS;
+
+    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+
+    const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
+
+    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+
+    pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS);
+    pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
+    pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
+    pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
+    pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
+    pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
+    pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
+    pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
+  }
+};
+
+template <>
+struct Row<0>
+{
+  static void idct(int* pTemp, const jpgd_block_t* pSrc)
+  {
+#ifdef _MSC_VER
+    pTemp; pSrc;
+#endif
+  }
+};
+
+template <>
+struct Row<1>
+{
+  static void idct(int* pTemp, const jpgd_block_t* pSrc)
+  {
+    const int dcval = (pSrc[0] << PASS1_BITS);
+
+    pTemp[0] = dcval;
+    pTemp[1] = dcval;
+    pTemp[2] = dcval;
+    pTemp[3] = dcval;
+    pTemp[4] = dcval;
+    pTemp[5] = dcval;
+    pTemp[6] = dcval;
+    pTemp[7] = dcval;
+  }
+};
+
+// Compiler creates a fast path 1D IDCT for NONZERO_ROWS non-zero rows
+template <int NONZERO_ROWS>
+struct Col
+{
+  static void idct(uint8* pDst_ptr, const int* pTemp)
+  {
+    // ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
+    #define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
+
+    const int z2 = ACCESS_ROW(2);
+    const int z3 = ACCESS_ROW(6);
+
+    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+    const int tmp0 = (ACCESS_ROW(0) + ACCESS_ROW(4)) << CONST_BITS;
+    const int tmp1 = (ACCESS_ROW(0) - ACCESS_ROW(4)) << CONST_BITS;
+
+    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+
+    const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
+
+    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+
+    int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*0] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*7] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*1] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*6] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*2] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*5] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*3] = (uint8)CLAMP(i);
+
+    i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
+    pDst_ptr[8*4] = (uint8)CLAMP(i);
+  }
+};
+
+template <>
+struct Col<1>
+{
+  static void idct(uint8* pDst_ptr, const int* pTemp)
+  {
+    int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3);
+    const uint8 dcval_clamped = (uint8)CLAMP(dcval);
+    pDst_ptr[0*8] = dcval_clamped;
+    pDst_ptr[1*8] = dcval_clamped;
+    pDst_ptr[2*8] = dcval_clamped;
+    pDst_ptr[3*8] = dcval_clamped;
+    pDst_ptr[4*8] = dcval_clamped;
+    pDst_ptr[5*8] = dcval_clamped;
+    pDst_ptr[6*8] = dcval_clamped;
+    pDst_ptr[7*8] = dcval_clamped;
+  }
+};
+
+static const uint8 s_idct_row_table[] =
+{
+  1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
+  4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
+  6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
+  6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
+  8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
+  8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
+  8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
+  8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
+};
+
+static const uint8 s_idct_col_table[] = { 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
+
+void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag)
+{
+  JPGD_ASSERT(block_max_zag >= 1);
+  JPGD_ASSERT(block_max_zag <= 64);
+
+  if (block_max_zag <= 1)
+  {
+    int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
+    k = CLAMP(k);
+    k = k | (k<<8);
+    k = k | (k<<16);
+
+    for (int i = 8; i > 0; i--)
+    {
+      *(int*)&pDst_ptr[0] = k;
+      *(int*)&pDst_ptr[4] = k;
+      pDst_ptr += 8;
+    }
+    return;
+  }
+
+  int temp[64];
+
+  const jpgd_block_t* pSrc = pSrc_ptr;
+  int* pTemp = temp;
+
+  const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
+  int i;
+  for (i = 8; i > 0; i--, pRow_tab++)
+  {
+    switch (*pRow_tab)
+    {
+      case 0: Row<0>::idct(pTemp, pSrc); break;
+      case 1: Row<1>::idct(pTemp, pSrc); break;
+      case 2: Row<2>::idct(pTemp, pSrc); break;
+      case 3: Row<3>::idct(pTemp, pSrc); break;
+      case 4: Row<4>::idct(pTemp, pSrc); break;
+      case 5: Row<5>::idct(pTemp, pSrc); break;
+      case 6: Row<6>::idct(pTemp, pSrc); break;
+      case 7: Row<7>::idct(pTemp, pSrc); break;
+      case 8: Row<8>::idct(pTemp, pSrc); break;
+    }
+
+    pSrc += 8;
+    pTemp += 8;
+  }
+
+  pTemp = temp;
+
+  const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
+  for (i = 8; i > 0; i--)
+  {
+    switch (nonzero_rows)
+    {
+      case 1: Col<1>::idct(pDst_ptr, pTemp); break;
+      case 2: Col<2>::idct(pDst_ptr, pTemp); break;
+      case 3: Col<3>::idct(pDst_ptr, pTemp); break;
+      case 4: Col<4>::idct(pDst_ptr, pTemp); break;
+      case 5: Col<5>::idct(pDst_ptr, pTemp); break;
+      case 6: Col<6>::idct(pDst_ptr, pTemp); break;
+      case 7: Col<7>::idct(pDst_ptr, pTemp); break;
+      case 8: Col<8>::idct(pDst_ptr, pTemp); break;
+    }
+
+    pTemp++;
+    pDst_ptr++;
+  }
+}
+
+void idct_4x4(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr)
+{
+  int temp[64];
+  int* pTemp = temp;
+  const jpgd_block_t* pSrc = pSrc_ptr;
+
+  for (int i = 4; i > 0; i--)
+  {
+    Row<4>::idct(pTemp, pSrc);
+    pSrc += 8;
+    pTemp += 8;
+  }
+
+  pTemp = temp;
+  for (int i = 8; i > 0; i--)
+  {
+    Col<4>::idct(pDst_ptr, pTemp);
+    pTemp++;
+    pDst_ptr++;
+  }
+}
+
+// Retrieve one character from the input stream.
+inline uint jpeg_decoder::get_char()
+{
+  // Any bytes remaining in buffer?
+  if (!m_in_buf_left)
+  {
+    // Try to get more bytes.
+    prep_in_buffer();
+    // Still nothing to get?
+    if (!m_in_buf_left)
+    {
+      // Pad the end of the stream with 0xFF 0xD9 (EOI marker)
+      int t = m_tem_flag;
+      m_tem_flag ^= 1;
+      if (t)
+        return 0xD9;
+      else
+        return 0xFF;
+    }
+  }
+
+  uint c = *m_pIn_buf_ofs++;
+  m_in_buf_left--;
+
+  return c;
+}
+
+// Same as the previous method, except it also reports through pPadding_flag whether the returned character is a pad character.
+inline uint jpeg_decoder::get_char(bool *pPadding_flag)
+{
+  if (!m_in_buf_left)
+  {
+    prep_in_buffer();
+    if (!m_in_buf_left)
+    {
+      *pPadding_flag = true;
+      int t = m_tem_flag;
+      m_tem_flag ^= 1;
+      if (t)
+        return 0xD9;
+      else
+        return 0xFF;
+    }
+  }
+
+  *pPadding_flag = false;
+
+  uint c = *m_pIn_buf_ofs++;
+  m_in_buf_left--;
+
+  return c;
+}
+
+// Inserts a previously retrieved character back into the input buffer.
+inline void jpeg_decoder::stuff_char(uint8 q)
+{
+  *(--m_pIn_buf_ofs) = q;
+  m_in_buf_left++;
+}
+
+// Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
+inline uint8 jpeg_decoder::get_octet()
+{
+  bool padding_flag;
+  int c = get_char(&padding_flag);
+
+  if (c == 0xFF)
+  {
+    if (padding_flag)
+      return 0xFF;
+
+    c = get_char(&padding_flag);
+    if (padding_flag)
+    {
+      stuff_char(0xFF);
+      return 0xFF;
+    }
+
+    if (c == 0x00)
+      return 0xFF;
+    else
+    {
+      stuff_char(static_cast<uint8>(c));
+      stuff_char(0xFF);
+      return 0xFF;
+    }
+  }
+
+  return static_cast<uint8>(c);
+}
+
+// Retrieves a variable number of bits from the input stream. Does not recognize markers.
+inline uint jpeg_decoder::get_bits(int num_bits)
+{
+  if (!num_bits)
+    return 0;
+
+  uint i = m_bit_buf >> (32 - num_bits);
+
+  if ((m_bits_left -= num_bits) <= 0)
+  {
+    m_bit_buf <<= (num_bits += m_bits_left);
+
+    uint c1 = get_char();
+    uint c2 = get_char();
+    m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2;
+
+    m_bit_buf <<= -m_bits_left;
+
+    m_bits_left += 16;
+
+    JPGD_ASSERT(m_bits_left >= 0);
+  }
+  else
+    m_bit_buf <<= num_bits;
+
+  return i;
+}
+
+// Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
+inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
+{
+  if (!num_bits)
+    return 0;
+
+  uint i = m_bit_buf >> (32 - num_bits);
+
+  if ((m_bits_left -= num_bits) <= 0)
+  {
+    m_bit_buf <<= (num_bits += m_bits_left);
+
+    if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF))
+    {
+      uint c1 = get_octet();
+      uint c2 = get_octet();
+      m_bit_buf |= (c1 << 8) | c2;
+    }
+    else
+    {
+      m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];
+      m_in_buf_left -= 2;
+      m_pIn_buf_ofs += 2;
+    }
+
+    m_bit_buf <<= -m_bits_left;
+
+    m_bits_left += 16;
+
+    JPGD_ASSERT(m_bits_left >= 0);
+  }
+  else
+    m_bit_buf <<= num_bits;
+
+  return i;
+}
+
+// Decodes a Huffman encoded symbol.
+inline int jpeg_decoder::huff_decode(huff_tables *pH)
+{
+  int symbol;
+
+  // Check first 8-bits: do we have a complete symbol?
+  if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
+  {
+    // Decode more bits, use a tree traversal to find symbol.
+    int ofs = 23;
+    do
+    {
+      symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))];
+      ofs--;
+    } while (symbol < 0);
+
+    get_bits_no_markers(8 + (23 - ofs));
+  }
+  else
+    get_bits_no_markers(pH->code_size[symbol]);
+
+  return symbol;
+}
+
+// Decodes a Huffman encoded symbol and stores its associated extra bits in extra_bits.
+inline int jpeg_decoder::huff_decode(huff_tables *pH, int& extra_bits)
+{
+  int symbol;
+
+  // Check first 8-bits: do we have a complete symbol?
+  if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
+  {
+    // Use a tree traversal to find symbol.
+    int ofs = 23;
+    do
+    {
+      symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))];
+      ofs--;
+    } while (symbol < 0);
+
+    get_bits_no_markers(8 + (23 - ofs));
+
+    extra_bits = get_bits_no_markers(symbol & 0xF);
+  }
+  else
+  {
+    JPGD_ASSERT(((symbol >> 8) & 31) == pH->code_size[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0));
+
+    if (symbol & 0x8000)
+    {
+      get_bits_no_markers((symbol >> 8) & 31);
+      extra_bits = symbol >> 16;
+    }
+    else
+    {
+      int code_size = (symbol >> 8) & 31;
+      int num_extra_bits = symbol & 0xF;
+      int bits = code_size + num_extra_bits;
+      if (bits <= (m_bits_left + 16))
+        extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);
+      else
+      {
+        get_bits_no_markers(code_size);
+        extra_bits = get_bits_no_markers(num_extra_bits);
+      }
+    }
+
+    symbol &= 0xFF;
+  }
+
+  return symbol;
+}
+
+// Tables and macro used to fully decode the DPCM differences.
+static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int s_extend_offset[16] = { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
+static const int s_extend_mask[] = { 0, (1<<0), (1<<1), (1<<2), (1<<3), (1<<4), (1<<5), (1<<6), (1<<7), (1<<8), (1<<9), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15), (1<<16) };
+// The logical AND's in this macro are to shut up static code analysis (aren't really necessary - couldn't find another way to do this)
+#define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[s & 15]) ? ((x) + s_extend_offset[s & 15]) : (x))
+
+// Clamps a value between 0-255.
+inline uint8 jpeg_decoder::clamp(int i)
+{
+  if (static_cast<uint>(i) > 255)
+    i = (((~i) >> 31) & 0xFF);
+
+  return static_cast<uint8>(i);
+}
+
+namespace DCT_Upsample
+{
+  struct Matrix44
+  {
+    typedef int Element_Type;
+    enum { NUM_ROWS = 4, NUM_COLS = 4 };
+
+    Element_Type v[NUM_ROWS][NUM_COLS];
+
+    inline int rows() const { return NUM_ROWS; }
+    inline int cols() const { return NUM_COLS; }
+
+    inline const Element_Type & at(int r, int c) const { return v[r][c]; }
+    inline       Element_Type & at(int r, int c)       { return v[r][c]; }
+
+    inline Matrix44() { }
+
+    inline Matrix44& operator += (const Matrix44& a)
+    {
+      for (int r = 0; r < NUM_ROWS; r++)
+      {
+        at(r, 0) += a.at(r, 0);
+        at(r, 1) += a.at(r, 1);
+        at(r, 2) += a.at(r, 2);
+        at(r, 3) += a.at(r, 3);
+      }
+      return *this;
+    }
+
+    inline Matrix44& operator -= (const Matrix44& a)
+    {
+      for (int r = 0; r < NUM_ROWS; r++)
+      {
+        at(r, 0) -= a.at(r, 0);
+        at(r, 1) -= a.at(r, 1);
+        at(r, 2) -= a.at(r, 2);
+        at(r, 3) -= a.at(r, 3);
+      }
+      return *this;
+    }
+
+    friend inline Matrix44 operator + (const Matrix44& a, const Matrix44& b)
+    {
+      Matrix44 ret;
+      for (int r = 0; r < NUM_ROWS; r++)
+      {
+        ret.at(r, 0) = a.at(r, 0) + b.at(r, 0);
+        ret.at(r, 1) = a.at(r, 1) + b.at(r, 1);
+        ret.at(r, 2) = a.at(r, 2) + b.at(r, 2);
+        ret.at(r, 3) = a.at(r, 3) + b.at(r, 3);
+      }
+      return ret;
+    }
+
+    friend inline Matrix44 operator - (const Matrix44& a, const Matrix44& b)
+    {
+      Matrix44 ret;
+      for (int r = 0; r < NUM_ROWS; r++)
+      {
+        ret.at(r, 0) = a.at(r, 0) - b.at(r, 0);
+        ret.at(r, 1) = a.at(r, 1) - b.at(r, 1);
+        ret.at(r, 2) = a.at(r, 2) - b.at(r, 2);
+        ret.at(r, 3) = a.at(r, 3) - b.at(r, 3);
+      }
+      return ret;
+    }
+
+    static inline void add_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
+    {
+      for (int r = 0; r < 4; r++)
+      {
+        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) + b.at(r, 0));
+        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) + b.at(r, 1));
+        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) + b.at(r, 2));
+        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) + b.at(r, 3));
+      }
+    }
+
+    static inline void sub_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
+    {
+      for (int r = 0; r < 4; r++)
+      {
+        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) - b.at(r, 0));
+        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) - b.at(r, 1));
+        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) - b.at(r, 2));
+        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) - b.at(r, 3));
+      }
+    }
+  };
+
+  const int FRACT_BITS = 10;
+  const int SCALE = 1 << FRACT_BITS;
+
+  typedef int Temp_Type;
+  #define D(i) (((i) + (SCALE >> 1)) >> FRACT_BITS)
+  #define F(i) ((int)((i) * SCALE + .5f))
+
+  // Any decent C++ compiler will optimize this at compile time to a 0, or an array access.
+  #define AT(c, r) ((((c)>=NUM_COLS)||((r)>=NUM_ROWS)) ? 0 : pSrc[(c)+(r)*8])
+
+  // NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix
+  template<int NUM_ROWS, int NUM_COLS>
+  struct P_Q
+  {
+    static void calc(Matrix44& P, Matrix44& Q, const jpgd_block_t* pSrc)
+    {
+      // 4x8 = 4x8 times 8x8, matrix 0 is constant
+      const Temp_Type X000 = AT(0, 0);
+      const Temp_Type X001 = AT(0, 1);
+      const Temp_Type X002 = AT(0, 2);
+      const Temp_Type X003 = AT(0, 3);
+      const Temp_Type X004 = AT(0, 4);
+      const Temp_Type X005 = AT(0, 5);
+      const Temp_Type X006 = AT(0, 6);
+      const Temp_Type X007 = AT(0, 7);
+      const Temp_Type X010 = D(F(0.415735f) * AT(1, 0) + F(0.791065f) * AT(3, 0) + F(-0.352443f) * AT(5, 0) + F(0.277785f) * AT(7, 0));
+      const Temp_Type X011 = D(F(0.415735f) * AT(1, 1) + F(0.791065f) * AT(3, 1) + F(-0.352443f) * AT(5, 1) + F(0.277785f) * AT(7, 1));
+      const Temp_Type X012 = D(F(0.415735f) * AT(1, 2) + F(0.791065f) * AT(3, 2) + F(-0.352443f) * AT(5, 2) + F(0.277785f) * AT(7, 2));
+      const Temp_Type X013 = D(F(0.415735f) * AT(1, 3) + F(0.791065f) * AT(3, 3) + F(-0.352443f) * AT(5, 3) + F(0.277785f) * AT(7, 3));
+      const Temp_Type X014 = D(F(0.415735f) * AT(1, 4) + F(0.791065f) * AT(3, 4) + F(-0.352443f) * AT(5, 4) + F(0.277785f) * AT(7, 4));
+      const Temp_Type X015 = D(F(0.415735f) * AT(1, 5) + F(0.791065f) * AT(3, 5) + F(-0.352443f) * AT(5, 5) + F(0.277785f) * AT(7, 5));
+      const Temp_Type X016 = D(F(0.415735f) * AT(1, 6) + F(0.791065f) * AT(3, 6) + F(-0.352443f) * AT(5, 6) + F(0.277785f) * AT(7, 6));
+      const Temp_Type X017 = D(F(0.415735f) * AT(1, 7) + F(0.791065f) * AT(3, 7) + F(-0.352443f) * AT(5, 7) + F(0.277785f) * AT(7, 7));
+      const Temp_Type X020 = AT(4, 0);
+      const Temp_Type X021 = AT(4, 1);
+      const Temp_Type X022 = AT(4, 2);
+      const Temp_Type X023 = AT(4, 3);
+      const Temp_Type X024 = AT(4, 4);
+      const Temp_Type X025 = AT(4, 5);
+      const Temp_Type X026 = AT(4, 6);
+      const Temp_Type X027 = AT(4, 7);
+      const Temp_Type X030 = D(F(0.022887f) * AT(1, 0) + F(-0.097545f) * AT(3, 0) + F(0.490393f) * AT(5, 0) + F(0.865723f) * AT(7, 0));
+      const Temp_Type X031 = D(F(0.022887f) * AT(1, 1) + F(-0.097545f) * AT(3, 1) + F(0.490393f) * AT(5, 1) + F(0.865723f) * AT(7, 1));
+      const Temp_Type X032 = D(F(0.022887f) * AT(1, 2) + F(-0.097545f) * AT(3, 2) + F(0.490393f) * AT(5, 2) + F(0.865723f) * AT(7, 2));
+      const Temp_Type X033 = D(F(0.022887f) * AT(1, 3) + F(-0.097545f) * AT(3, 3) + F(0.490393f) * AT(5, 3) + F(0.865723f) * AT(7, 3));
+      const Temp_Type X034 = D(F(0.022887f) * AT(1, 4) + F(-0.097545f) * AT(3, 4) + F(0.490393f) * AT(5, 4) + F(0.865723f) * AT(7, 4));
+      const Temp_Type X035 = D(F(0.022887f) * AT(1, 5) + F(-0.097545f) * AT(3, 5) + F(0.490393f) * AT(5, 5) + F(0.865723f) * AT(7, 5));
+      const Temp_Type X036 = D(F(0.022887f) * AT(1, 6) + F(-0.097545f) * AT(3, 6) + F(0.490393f) * AT(5, 6) + F(0.865723f) * AT(7, 6));
+      const Temp_Type X037 = D(F(0.022887f) * AT(1, 7) + F(-0.097545f) * AT(3, 7) + F(0.490393f) * AT(5, 7) + F(0.865723f) * AT(7, 7));
+
+      // 4x4 = 4x8 times 8x4, matrix 1 is constant
+      P.at(0, 0) = X000;
+      P.at(0, 1) = D(X001 * F(0.415735f) + X003 * F(0.791065f) + X005 * F(-0.352443f) + X007 * F(0.277785f));
+      P.at(0, 2) = X004;
+      P.at(0, 3) = D(X001 * F(0.022887f) + X003 * F(-0.097545f) + X005 * F(0.490393f) + X007 * F(0.865723f));
+      P.at(1, 0) = X010;
+      P.at(1, 1) = D(X011 * F(0.415735f) + X013 * F(0.791065f) + X015 * F(-0.352443f) + X017 * F(0.277785f));
+      P.at(1, 2) = X014;
+      P.at(1, 3) = D(X011 * F(0.022887f) + X013 * F(-0.097545f) + X015 * F(0.490393f) + X017 * F(0.865723f));
+      P.at(2, 0) = X020;
+      P.at(2, 1) = D(X021 * F(0.415735f) + X023 * F(0.791065f) + X025 * F(-0.352443f) + X027 * F(0.277785f));
+      P.at(2, 2) = X024;
+      P.at(2, 3) = D(X021 * F(0.022887f) + X023 * F(-0.097545f) + X025 * F(0.490393f) + X027 * F(0.865723f));
+      P.at(3, 0) = X030;
+      P.at(3, 1) = D(X031 * F(0.415735f) + X033 * F(0.791065f) + X035 * F(-0.352443f) + X037 * F(0.277785f));
+      P.at(3, 2) = X034;
+      P.at(3, 3) = D(X031 * F(0.022887f) + X033 * F(-0.097545f) + X035 * F(0.490393f) + X037 * F(0.865723f));
+      // 40 muls 24 adds
+
+      // 4x4 = 4x8 times 8x4, matrix 1 is constant
+      Q.at(0, 0) = D(X001 * F(0.906127f) + X003 * F(-0.318190f) + X005 * F(0.212608f) + X007 * F(-0.180240f));
+      Q.at(0, 1) = X002;
+      Q.at(0, 2) = D(X001 * F(-0.074658f) + X003 * F(0.513280f) + X005 * F(0.768178f) + X007 * F(-0.375330f));
+      Q.at(0, 3) = X006;
+      Q.at(1, 0) = D(X011 * F(0.906127f) + X013 * F(-0.318190f) + X015 * F(0.212608f) + X017 * F(-0.180240f));
+      Q.at(1, 1) = X012;
+      Q.at(1, 2) = D(X011 * F(-0.074658f) + X013 * F(0.513280f) + X015 * F(0.768178f) + X017 * F(-0.375330f));
+      Q.at(1, 3) = X016;
+      Q.at(2, 0) = D(X021 * F(0.906127f) + X023 * F(-0.318190f) + X025 * F(0.212608f) + X027 * F(-0.180240f));
+      Q.at(2, 1) = X022;
+      Q.at(2, 2) = D(X021 * F(-0.074658f) + X023 * F(0.513280f) + X025 * F(0.768178f) + X027 * F(-0.375330f));
+      Q.at(2, 3) = X026;
+      Q.at(3, 0) = D(X031 * F(0.906127f) + X033 * F(-0.318190f) + X035 * F(0.212608f) + X037 * F(-0.180240f));
+      Q.at(3, 1) = X032;
+      Q.at(3, 2) = D(X031 * F(-0.074658f) + X033 * F(0.513280f) + X035 * F(0.768178f) + X037 * F(-0.375330f));
+      Q.at(3, 3) = X036;
+      // 40 muls 24 adds
+    }
+  };
+
+  template<int NUM_ROWS, int NUM_COLS> // NOTE(review): neither parameter is named in this body; presumably the AT() macro (defined outside this view) consumes them - confirm
+  struct R_S // Computes the 4x4 matrices R and S from one coefficient block; same two-stage layout as P_Q::calc, but built from different source rows.
+  {
+    static void calc(Matrix44& R, Matrix44& S, const jpgd_block_t* pSrc) // writes all 16 entries of R and of S
+    {
+      // 4x8 = 4x8 times 8x8, matrix 0 is constant (temp rows 0 and 2: weighted sums of source rows 1, 3, 5, 7; temp rows 1 and 3: source rows 2 and 6 unchanged)
+      const Temp_Type X100 = D(F(0.906127f) * AT(1, 0) + F(-0.318190f) * AT(3, 0) + F(0.212608f) * AT(5, 0) + F(-0.180240f) * AT(7, 0));
+      const Temp_Type X101 = D(F(0.906127f) * AT(1, 1) + F(-0.318190f) * AT(3, 1) + F(0.212608f) * AT(5, 1) + F(-0.180240f) * AT(7, 1));
+      const Temp_Type X102 = D(F(0.906127f) * AT(1, 2) + F(-0.318190f) * AT(3, 2) + F(0.212608f) * AT(5, 2) + F(-0.180240f) * AT(7, 2));
+      const Temp_Type X103 = D(F(0.906127f) * AT(1, 3) + F(-0.318190f) * AT(3, 3) + F(0.212608f) * AT(5, 3) + F(-0.180240f) * AT(7, 3));
+      const Temp_Type X104 = D(F(0.906127f) * AT(1, 4) + F(-0.318190f) * AT(3, 4) + F(0.212608f) * AT(5, 4) + F(-0.180240f) * AT(7, 4));
+      const Temp_Type X105 = D(F(0.906127f) * AT(1, 5) + F(-0.318190f) * AT(3, 5) + F(0.212608f) * AT(5, 5) + F(-0.180240f) * AT(7, 5));
+      const Temp_Type X106 = D(F(0.906127f) * AT(1, 6) + F(-0.318190f) * AT(3, 6) + F(0.212608f) * AT(5, 6) + F(-0.180240f) * AT(7, 6));
+      const Temp_Type X107 = D(F(0.906127f) * AT(1, 7) + F(-0.318190f) * AT(3, 7) + F(0.212608f) * AT(5, 7) + F(-0.180240f) * AT(7, 7));
+      const Temp_Type X110 = AT(2, 0); // temp row 1 is source row 2, taken as-is
+      const Temp_Type X111 = AT(2, 1);
+      const Temp_Type X112 = AT(2, 2);
+      const Temp_Type X113 = AT(2, 3);
+      const Temp_Type X114 = AT(2, 4);
+      const Temp_Type X115 = AT(2, 5);
+      const Temp_Type X116 = AT(2, 6);
+      const Temp_Type X117 = AT(2, 7);
+      const Temp_Type X120 = D(F(-0.074658f) * AT(1, 0) + F(0.513280f) * AT(3, 0) + F(0.768178f) * AT(5, 0) + F(-0.375330f) * AT(7, 0));
+      const Temp_Type X121 = D(F(-0.074658f) * AT(1, 1) + F(0.513280f) * AT(3, 1) + F(0.768178f) * AT(5, 1) + F(-0.375330f) * AT(7, 1));
+      const Temp_Type X122 = D(F(-0.074658f) * AT(1, 2) + F(0.513280f) * AT(3, 2) + F(0.768178f) * AT(5, 2) + F(-0.375330f) * AT(7, 2));
+      const Temp_Type X123 = D(F(-0.074658f) * AT(1, 3) + F(0.513280f) * AT(3, 3) + F(0.768178f) * AT(5, 3) + F(-0.375330f) * AT(7, 3));
+      const Temp_Type X124 = D(F(-0.074658f) * AT(1, 4) + F(0.513280f) * AT(3, 4) + F(0.768178f) * AT(5, 4) + F(-0.375330f) * AT(7, 4));
+      const Temp_Type X125 = D(F(-0.074658f) * AT(1, 5) + F(0.513280f) * AT(3, 5) + F(0.768178f) * AT(5, 5) + F(-0.375330f) * AT(7, 5));
+      const Temp_Type X126 = D(F(-0.074658f) * AT(1, 6) + F(0.513280f) * AT(3, 6) + F(0.768178f) * AT(5, 6) + F(-0.375330f) * AT(7, 6));
+      const Temp_Type X127 = D(F(-0.074658f) * AT(1, 7) + F(0.513280f) * AT(3, 7) + F(0.768178f) * AT(5, 7) + F(-0.375330f) * AT(7, 7));
+      const Temp_Type X130 = AT(6, 0); // temp row 3 is source row 6, taken as-is
+      const Temp_Type X131 = AT(6, 1);
+      const Temp_Type X132 = AT(6, 2);
+      const Temp_Type X133 = AT(6, 3);
+      const Temp_Type X134 = AT(6, 4);
+      const Temp_Type X135 = AT(6, 5);
+      const Temp_Type X136 = AT(6, 6);
+      const Temp_Type X137 = AT(6, 7);
+      // 80 muls 48 adds
+
+      // 4x4 = 4x8 times 8x4, matrix 1 is constant (R: temp columns 0 and 4 pass through; columns 1 and 3 are weighted sums of temp columns 1, 3, 5, 7)
+      R.at(0, 0) = X100;
+      R.at(0, 1) = D(X101 * F(0.415735f) + X103 * F(0.791065f) + X105 * F(-0.352443f) + X107 * F(0.277785f));
+      R.at(0, 2) = X104;
+      R.at(0, 3) = D(X101 * F(0.022887f) + X103 * F(-0.097545f) + X105 * F(0.490393f) + X107 * F(0.865723f));
+      R.at(1, 0) = X110;
+      R.at(1, 1) = D(X111 * F(0.415735f) + X113 * F(0.791065f) + X115 * F(-0.352443f) + X117 * F(0.277785f));
+      R.at(1, 2) = X114;
+      R.at(1, 3) = D(X111 * F(0.022887f) + X113 * F(-0.097545f) + X115 * F(0.490393f) + X117 * F(0.865723f));
+      R.at(2, 0) = X120;
+      R.at(2, 1) = D(X121 * F(0.415735f) + X123 * F(0.791065f) + X125 * F(-0.352443f) + X127 * F(0.277785f));
+      R.at(2, 2) = X124;
+      R.at(2, 3) = D(X121 * F(0.022887f) + X123 * F(-0.097545f) + X125 * F(0.490393f) + X127 * F(0.865723f));
+      R.at(3, 0) = X130;
+      R.at(3, 1) = D(X131 * F(0.415735f) + X133 * F(0.791065f) + X135 * F(-0.352443f) + X137 * F(0.277785f));
+      R.at(3, 2) = X134;
+      R.at(3, 3) = D(X131 * F(0.022887f) + X133 * F(-0.097545f) + X135 * F(0.490393f) + X137 * F(0.865723f));
+      // 40 muls 24 adds
+      // 4x4 = 4x8 times 8x4, matrix 1 is constant (S: temp columns 2 and 6 pass through; columns 0 and 2 are weighted sums of temp columns 1, 3, 5, 7)
+      S.at(0, 0) = D(X101 * F(0.906127f) + X103 * F(-0.318190f) + X105 * F(0.212608f) + X107 * F(-0.180240f));
+      S.at(0, 1) = X102;
+      S.at(0, 2) = D(X101 * F(-0.074658f) + X103 * F(0.513280f) + X105 * F(0.768178f) + X107 * F(-0.375330f));
+      S.at(0, 3) = X106;
+      S.at(1, 0) = D(X111 * F(0.906127f) + X113 * F(-0.318190f) + X115 * F(0.212608f) + X117 * F(-0.180240f));
+      S.at(1, 1) = X112;
+      S.at(1, 2) = D(X111 * F(-0.074658f) + X113 * F(0.513280f) + X115 * F(0.768178f) + X117 * F(-0.375330f));
+      S.at(1, 3) = X116;
+      S.at(2, 0) = D(X121 * F(0.906127f) + X123 * F(-0.318190f) + X125 * F(0.212608f) + X127 * F(-0.180240f));
+      S.at(2, 1) = X122;
+      S.at(2, 2) = D(X121 * F(-0.074658f) + X123 * F(0.513280f) + X125 * F(0.768178f) + X127 * F(-0.375330f));
+      S.at(2, 3) = X126;
+      S.at(3, 0) = D(X131 * F(0.906127f) + X133 * F(-0.318190f) + X135 * F(0.212608f) + X137 * F(-0.180240f));
+      S.at(3, 1) = X132;
+      S.at(3, 2) = D(X131 * F(-0.074658f) + X133 * F(0.513280f) + X135 * F(0.768178f) + X137 * F(-0.375330f));
+      S.at(3, 3) = X136;
+      // 40 muls 24 adds
+    }
+  };
+} // end namespace DCT_Upsample
+
+// Unconditionally frees every block on the m_pMem_blocks list and clears m_pStream.
+void jpeg_decoder::free_all_blocks()
+{
+  m_pStream = NULL;
+  // Pop blocks off the head of the list until it is empty.
+  while (m_pMem_blocks)
+  {
+    mem_block *pHead = m_pMem_blocks;
+    m_pMem_blocks = pHead->m_pNext;
+    jpgd_free(pHead);
+  }
+}
+
+// This method handles all errors. It will never return: it records the status, releases the
+// decoder's blocks and longjmp()s to m_jmp_state. It could easily be changed to use C++ exceptions.
+JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
+{
+  m_error_code = status; // remember why decoding stopped
+  free_all_blocks(); // frees the block list and clears m_pStream before jumping out
+  longjmp(m_jmp_state, status); // the status doubles as the longjmp value
+}
+
+void *jpeg_decoder::alloc(size_t nSize, bool zero) // Carves nSize bytes out of the decoder's block list; zero-fills them when zero is set.
+{ // The blocks stay on m_pMem_blocks (released by free_all_blocks()); an allocation failure goes through stop_decoding().
+  nSize = (JPGD_MAX(nSize, 1) + 3) & ~3; // at least one byte, rounded up to a multiple of 4
+  char *rv = NULL;
+  for (mem_block *b = m_pMem_blocks; b; b = b->m_pNext) // first fit among the existing blocks
+  {
+    if ((b->m_used_count + nSize) <= b->m_size)
+    {
+      rv = b->m_data + b->m_used_count;
+      b->m_used_count += nSize;
+      break;
+    }
+  }
+  if (!rv)
+  {
+    // Keep the capacity in size_t: held in an int, a very large request was truncated and the new block could end up smaller than nSize.
+    const size_t capacity = JPGD_MAX((size_t)(32768 - 256), (nSize + 2047) & ~(size_t)2047);
+    mem_block *b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
+    if (!b) { stop_decoding(JPGD_NOTENOUGHMEM); }
+    b->m_pNext = m_pMem_blocks; m_pMem_blocks = b; // push the new block on the head of the list
+    b->m_used_count = nSize; b->m_size = capacity;
+    rv = b->m_data;
+  }
+  if (zero) memset(rv, 0, nSize);
+  return rv;
+}
+
+void jpeg_decoder::word_clear(void *p, uint16 c, uint n) // stores c at p n times, low byte first and then high byte
+{
+  const uint8 lo_byte = c & 0xFF, hi_byte = (c >> 8) & 0xFF;
+  uint8 *pOut = (uint8*)p;
+  for (uint written = 0; written < n; written++)
+  {
+    *pOut++ = lo_byte;
+    *pOut++ = hi_byte;
+  }
+}
+
+// Refills the input buffer from the stream.
+// read() is called repeatedly until (A) the buffer is full or (B) it reports an
+// end of file condition; a return value of -1 stops decoding with JPGD_STREAM_READ.
+void jpeg_decoder::prep_in_buffer()
+{
+  m_pIn_buf_ofs = m_in_buf;
+  m_in_buf_left = 0;
+
+  if (!m_eof_flag)
+  {
+    for ( ; ; )
+    {
+      const int got = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
+      if (got == -1)
+        stop_decoding(JPGD_STREAM_READ);
+      m_in_buf_left += got;
+      if ((m_in_buf_left >= JPGD_IN_BUF_SIZE) || m_eof_flag)
+        break;
+    }
+
+    m_total_bytes_read += m_in_buf_left;
+    // Follow the data with 64 M_EOI byte pairs (0xFF, 0xD9) so that an invalid stream cannot send the decompressor off the rails.
+    // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
+    word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
+  }
+}
+
+// Read a Huffman code table (a single DHT segment may define several tables).
+void jpeg_decoder::read_dht_marker()
+{
+  int i, index, count;
+  uint8 huff_num[17];
+  uint8 huff_val[256];
+
+  uint num_left = get_bits(16); // segment length; the count includes these two bytes
+
+  if (num_left < 2)
+    stop_decoding(JPGD_BAD_DHT_MARKER);
+
+  num_left -= 2;
+
+  while (num_left)
+  {
+    index = get_bits(8); // bit 4 selects DC (0) or AC (1); the low nibble is the table id
+    // Clear the symbol scratch so the 256-byte copy below never stores indeterminate stack bytes.
+    memset(huff_val, 0, sizeof(huff_val));
+    huff_num[0] = 0;
+    count = 0;
+
+    for (i = 1; i <= 16; i++)
+    {
+      huff_num[i] = static_cast<uint8>(get_bits(8));
+      count += huff_num[i];
+    }
+
+    if (count > 255)
+      stop_decoding(JPGD_BAD_DHT_COUNTS);
+
+    for (i = 0; i < count; i++)
+      huff_val[i] = static_cast<uint8>(get_bits(8));
+
+    i = 1 + 16 + count; // bytes this table occupied in the segment
+    if (num_left < (uint)i)
+      stop_decoding(JPGD_BAD_DHT_MARKER);
+
+    num_left -= i;
+    // Each class owns half of the slots. A larger id would land in (or past) the AC half after
+    // the remapping below, so a DC table could silently overwrite an AC table.
+    if ((index & 0x0F) >= (JPGD_MAX_HUFF_TABLES >> 1))
+      stop_decoding(JPGD_BAD_DHT_INDEX);
+
+    index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
+
+    if (index >= JPGD_MAX_HUFF_TABLES)
+      stop_decoding(JPGD_BAD_DHT_INDEX);
+
+    if (!m_huff_num[index])
+      m_huff_num[index] = (uint8 *)alloc(17);
+    if (!m_huff_val[index])
+      m_huff_val[index] = (uint8 *)alloc(256);
+
+    // NOTE(review): index is already remapped here, so with JPGD_MAX_HUFF_TABLES <= 16 bit 4 is always clear and this stores false for every table - confirm that is intended.
+    m_huff_ac[index] = (index & 0x10) != 0;
+    memcpy(m_huff_num[index], huff_num, 17);
+    memcpy(m_huff_val[index], huff_val, 256);
+  }
+}
+
+// Read a quantization table (a single DQT segment may define several tables).
+// Each table starts with one byte: a non-zero high nibble means the entries
+// are two bytes wide, and the low nibble is the destination slot in m_quant[].
+// The segment length is checked table by table as the data is consumed.
+void jpeg_decoder::read_dqt_marker()
+{
+  uint bytes_remaining = get_bits(16);
+
+  if (bytes_remaining < 2)
+    stop_decoding(JPGD_BAD_DQT_MARKER);
+
+  // The length field counts its own two bytes.
+  bytes_remaining -= 2;
+
+  while (bytes_remaining != 0)
+  {
+    const int selector = get_bits(8);
+    const int wide_entries = selector >> 4;
+    const int slot = selector & 0x0F;
+
+    if (slot >= JPGD_MAX_QUANT_TABLES)
+      stop_decoding(JPGD_BAD_DQT_TABLE);
+
+    // Storage for a slot is allocated on first use and reused afterwards.
+    if (!m_quant[slot])
+      m_quant[slot] = (jpgd_quant_t *)alloc(64 * sizeof(jpgd_quant_t));
+
+    // read quantization entries, in zag order
+    jpgd_quant_t *pEntry = m_quant[slot];
+    for (int remaining = 64; remaining > 0; remaining--)
+    {
+      uint value = get_bits(8);
+
+      if (wide_entries)
+        value = (value << 8) + get_bits(8);
+
+      *pEntry++ = static_cast<jpgd_quant_t>(value);
+    }
+
+    // One selector byte plus 64 entries of one or two bytes each.
+    const uint consumed = wide_entries ? (1 + 64 + 64) : (1 + 64);
+
+    if (bytes_remaining < consumed)
+      stop_decoding(JPGD_BAD_DQT_LENGTH);
+
+    bytes_remaining -= consumed;
+  }
+}
+
+// Read the start of frame (SOF) marker: sample precision, image size, and each component's id, sampling factors and quantization table selector.
+void jpeg_decoder::read_sof_marker()
+{
+  const uint num_left = get_bits(16); // segment length, checked below once the component count is known
+
+  if (get_bits(8) != 8)   /* precision: sorry, only 8-bit precision is supported right now */
+    stop_decoding(JPGD_BAD_PRECISION);
+
+  m_image_y_size = get_bits(16);
+
+  if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
+    stop_decoding(JPGD_BAD_HEIGHT);
+
+  m_image_x_size = get_bits(16);
+
+  if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
+    stop_decoding(JPGD_BAD_WIDTH);
+
+  m_comps_in_frame = get_bits(8);
+
+  if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
+    stop_decoding(JPGD_TOO_MANY_COMPONENTS);
+
+  if (num_left != (uint)(m_comps_in_frame * 3 + 8))
+    stop_decoding(JPGD_BAD_SOF_LENGTH);
+
+  for (int i = 0; i < m_comps_in_frame; i++)
+  {
+    m_comp_ident[i]  = get_bits(8);
+    m_comp_h_samp[i] = get_bits(4);
+    m_comp_v_samp[i] = get_bits(4);
+    m_comp_quant[i]  = get_bits(8);
+    // The selector later indexes m_quant[], which read_dqt_marker() only fills below JPGD_MAX_QUANT_TABLES; reject larger values here.
+    if (m_comp_quant[i] >= JPGD_MAX_QUANT_TABLES)
+      stop_decoding(JPGD_DECODE_ERROR);
+  }
+}
+
+// Used to skip unrecognized markers.
+// Reads the 16-bit segment length (which counts its own two bytes) and then
+// discards the rest of the segment one byte at a time.
+void jpeg_decoder::skip_variable_marker()
+{
+  const uint segment_len = get_bits(16);
+
+  // A length below 2 cannot even cover the length field itself.
+  if (segment_len < 2)
+    stop_decoding(JPGD_BAD_VARIABLE_MARKER);
+
+  uint payload_left = segment_len - 2;
+
+  for ( ; payload_left != 0; payload_left--)
+  {
+    get_bits(8);
+  }
+}
+
+// Read a define restart interval (DRI) marker: a length field that must equal 4, then the 16-bit interval.
+void jpeg_decoder::read_dri_marker()
+{
+  const bool bad_length = (get_bits(16) != 4);
+  if (bad_length)
+    stop_decoding(JPGD_BAD_DRI_LENGTH);
+  m_restart_interval = get_bits(16);
+}
+
+// Read a start of scan (SOS) marker.
+void jpeg_decoder::read_sos_marker()
+{
+  uint num_left;
+  int i, ci, n, c, cc;
+
+  num_left = get_bits(16);
+  n = get_bits(8); // number of components in this scan
+  m_comps_in_scan = n;
+  num_left -= 3;
+
+  if ( (num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) )
+    stop_decoding(JPGD_BAD_SOS_LENGTH);
+
+  for (i = 0; i < n; i++)
+  {
+    cc = get_bits(8); // component identifier, matched against m_comp_ident[]
+    c = get_bits(8);  // high nibble: DC table selector, low nibble: AC table selector
+    num_left -= 2;
+
+    for (ci = 0; ci < m_comps_in_frame; ci++)
+      if (cc == m_comp_ident[ci])
+        break;
+
+    if (ci >= m_comps_in_frame)
+      stop_decoding(JPGD_BAD_SOS_COMP_ID);
+
+    m_comp_list[i]    = ci;
+    m_comp_dc_tab[ci] = (c >> 4) & 15;
+    m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
+    // Both selectors later index the Huffman table arrays (see decode_next_row), which read_dht_marker() only fills below JPGD_MAX_HUFF_TABLES.
+    if ((m_comp_dc_tab[ci] >= JPGD_MAX_HUFF_TABLES) || (m_comp_ac_tab[ci] >= JPGD_MAX_HUFF_TABLES))
+      stop_decoding(JPGD_DECODE_ERROR);
+  }
+
+  m_spectral_start  = get_bits(8);
+  m_spectral_end    = get_bits(8);
+  m_successive_high = get_bits(4);
+  m_successive_low  = get_bits(4);
+
+  if (!m_progressive_flag)
+  {
+    m_spectral_start = 0;
+    m_spectral_end = 63;
+  }
+
+  num_left -= 3;
+
+  while (num_left)                  /* read past whatever is num_left */
+  {
+    get_bits(8);
+    num_left--;
+  }
+}
+
+// Finds the next marker.
+// Scans forward to a 0xFF byte, skips any further 0xFF bytes that follow it,
+// and returns the first byte after them. A zero byte in that position is not
+// accepted as a marker code, so the scan starts over.
+int jpeg_decoder::next_marker()
+{
+  uint code;
+  for ( ; ; )
+  {
+    // Discard bytes up to and including the next 0xFF.
+    while (get_bits(8) != 0xFF)
+    {
+    }
+
+    // Swallow repeated 0xFF bytes; the first other byte is the candidate code.
+    do
+    {
+      code = get_bits(8);
+    } while (code == 0xFF);
+
+    if (code != 0)
+      break;
+  }
+
+  return code;
+}
+
+// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
+// encountered.
+// Table and restart-interval segments met on the way are parsed; markers that
+// are unsupported or unexpected here stop decoding; anything else is skipped.
+int jpeg_decoder::process_markers()
+{
+  for ( ; ; )
+  {
+    const int marker = next_marker();
+    switch (marker)
+    {
+      // Segments that only update decoder tables/settings: parse and keep scanning.
+      case M_DHT:
+        read_dht_marker();
+        break;
+
+      case M_DQT:
+        read_dqt_marker();
+        break;
+
+      case M_DRI:
+        read_dri_marker();
+        break;
+
+      // No arithmetic coding support.
+      case M_DAC:
+      {
+        stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+        break;
+      }
+
+      // Markers without parameters that are not expected at this point.
+      case M_JPG:
+      case M_RST0:
+      case M_RST1:
+      case M_RST2:
+      case M_RST3:
+      case M_RST4:
+      case M_RST5:
+      case M_RST6:
+      case M_RST7:
+      case M_TEM:
+      {
+        stop_decoding(JPGD_UNEXPECTED_MARKER);
+        break;
+      }
+
+      // Markers the caller has to act on: hand them back unprocessed.
+      case M_SOF0:
+      case M_SOF1:
+      case M_SOF2:
+      case M_SOF3:
+      case M_SOF5:
+      case M_SOF6:
+      case M_SOF7:
+      case M_SOF9:
+      case M_SOF10:
+      case M_SOF11:
+      case M_SOF13:
+      case M_SOF14:
+      case M_SOF15:
+      case M_SOI:
+      case M_EOI:
+      case M_SOS:
+      {
+        return marker;
+      }
+
+      // must be DNL, DHP, EXP, APPn (APP0 included; no need to read the JFIF marker), JPGn, COM, or RESn
+      default:
+      {
+        skip_variable_marker();
+        break;
+      }
+    }
+  }
+}
+
+// Finds the start of image (SOI) marker.
+// A stream that starts with SOI is accepted immediately. Otherwise the search
+// is deliberately bounded (the counter below starts at 4096) to avoid false
+// positives, and stops decoding with JPGD_NOT_JPEG when nothing is found.
+void jpeg_decoder::locate_soi_marker()
+{
+  uint prev_byte = get_bits(8);
+  uint cur_byte = get_bits(8);
+
+  /* ok if it's a normal JPEG file without a special header */
+  if ((prev_byte == 0xFF) && (cur_byte == M_SOI))
+    return;
+
+  // Slide a two-byte window over the stream until the budget runs out.
+  for (uint budget = 4096 - 1; ; budget--)
+  {
+    if (budget == 0)
+    {
+      stop_decoding(JPGD_NOT_JPEG);
+    }
+
+    prev_byte = cur_byte;
+    cur_byte = get_bits(8);
+
+    if (prev_byte != 0xFF)
+      continue;
+
+    if (cur_byte == M_SOI)
+      break;
+
+    // get_bits will keep returning M_EOI if we read past the end
+    if (cur_byte == M_EOI)
+      stop_decoding(JPGD_NOT_JPEG);
+  }
+
+  // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
+  const uint following = (m_bit_buf >> 24) & 0xFF;
+
+  if (following != 0xFF)
+  {
+    stop_decoding(JPGD_NOT_JPEG);
+  }
+}
+
+// Find a start of frame (SOF) marker.
+// Locates SOI, processes markers up to the first one process_markers() hands
+// back, and parses it when it is SOF0, SOF1 or SOF2 (SOF2 also sets the
+// progressive flag). Any other marker stops decoding.
+void jpeg_decoder::locate_sof_marker()
+{
+  locate_soi_marker();
+
+  const int marker = process_markers();
+
+  const bool is_progressive = (marker == M_SOF2);
+  const bool is_sequential = (marker == M_SOF0) /* baseline DCT */ || (marker == M_SOF1) /* extended sequential DCT */;
+  if (is_progressive || is_sequential)
+  {
+    if (is_progressive)
+      m_progressive_flag = JPGD_TRUE;
+
+    read_sof_marker();
+  }
+  else if (marker == M_SOF9)
+  {
+    /* Arithmetic coding */
+    stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+  }
+  else
+  {
+    stop_decoding(JPGD_UNSUPPORTED_MARKER);
+  }
+}
+
+// Find a start of scan (SOS) marker.
+// Returns JPGD_FALSE if EOI is reached first, JPGD_TRUE once an SOS segment
+// has been read; any other marker stops decoding.
+int jpeg_decoder::locate_sos_marker()
+{
+  const int marker = process_markers();
+
+  if (marker == M_EOI)
+    return JPGD_FALSE;
+
+  if (marker != M_SOS)
+    stop_decoding(JPGD_UNEXPECTED_MARKER);
+
+  read_sos_marker();
+  return JPGD_TRUE;
+}
+
+// Reset everything to default/uninitialized state.
+// Also attaches pStream, performs the first buffer fill and primes the bit buffer.
+void jpeg_decoder::init(jpeg_decoder_stream *pStream)
+{
+  // Decoder status and the stream being read.
+  m_pStream = pStream;
+  m_pMem_blocks = NULL;
+  m_error_code = JPGD_SUCCESS;
+  m_ready_flag = false;
+  m_progressive_flag = JPGD_FALSE;
+  m_total_bytes_read = 0;
+
+  // Frame description.
+  m_image_x_size = m_image_y_size = 0;
+  m_scan_type = 0;
+  m_comps_in_frame = 0;
+  memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
+  memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
+  memset(m_comp_quant, 0, sizeof(m_comp_quant));
+  memset(m_comp_ident, 0, sizeof(m_comp_ident));
+  memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
+  memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
+
+  // Huffman and quantization tables.
+  memset(m_huff_ac, 0, sizeof(m_huff_ac));
+  memset(m_huff_num, 0, sizeof(m_huff_num));
+  memset(m_huff_val, 0, sizeof(m_huff_val));
+  memset(m_quant, 0, sizeof(m_quant));
+  memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
+
+  // Scan description.
+  m_comps_in_scan = 0;
+  memset(m_comp_list, 0, sizeof(m_comp_list));
+  memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
+  memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
+  m_spectral_start = 0;
+  m_spectral_end = 0;
+  m_successive_low = 0;
+  m_successive_high = 0;
+
+  // MCU geometry.
+  m_max_mcu_x_size = 0;
+  m_max_mcu_y_size = 0;
+  m_blocks_per_mcu = 0;
+  m_max_blocks_per_row = 0;
+  m_mcus_per_row = 0;
+  m_mcus_per_col = 0;
+  m_max_mcus_per_row = 0;
+  m_max_blocks_per_mcu = 0;
+  m_max_mcus_per_col = 0;
+  m_expanded_blocks_per_component = 0;
+  m_expanded_blocks_per_mcu = 0;
+  m_expanded_blocks_per_row = 0;
+  m_freq_domain_chroma_upsample = false;
+  memset(m_mcu_org, 0, sizeof(m_mcu_org));
+
+  // Output line bookkeeping and sample buffers.
+  m_total_lines_left = 0;
+  m_mcu_lines_left = 0;
+  m_real_dest_bytes_per_scan_line = 0;
+  m_dest_bytes_per_scan_line = 0;
+  m_dest_bytes_per_pixel = 0;
+  m_pScan_line_0 = NULL;
+  m_pScan_line_1 = NULL;
+  m_pSample_buf = NULL;
+
+  // Coefficient storage, DC prediction and restart state.
+  memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
+  memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
+  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+  memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
+  m_pMCU_coefficients = NULL;
+  m_eob_run = 0;
+  m_restart_interval = 0;
+  m_restarts_left    = 0;
+  m_next_restart_num = 0;
+
+  // Input buffer, including the padding areas on either side of it.
+  m_pIn_buf_ofs = m_in_buf;
+  m_in_buf_left = 0;
+  m_eof_flag = false;
+  m_tem_flag = 0;
+  memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
+  memset(m_in_buf, 0, sizeof(m_in_buf));
+  memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
+
+  // Ready the input buffer.
+  prep_in_buffer();
+
+  // Prime the bit buffer.
+  m_bits_left = 16;
+  m_bit_buf = 0;
+
+  get_bits(16);
+  get_bits(16);
+
+  // Every block slot starts out marked as holding a full set of 64 coefficients.
+  for (int blk = 0; blk < JPGD_MAX_BLOCKS_PER_MCU; blk++)
+  {
+    m_mcu_block_max_zag[blk] = 64;
+  }
+}
+
+#define SCALEBITS 16 // number of fractional bits in the fixed-point constants built with FIX()
+#define ONE_HALF  ((int) 1 << (SCALEBITS-1)) // the value 0.5 at SCALEBITS fractional bits
+#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f)) // scales x by 2^SCALEBITS and adds 0.5 before truncating to int
+
+// Create a few tables that allow us to quickly convert YCbCr to RGB (one entry per 8-bit chroma value).
+void jpeg_decoder::create_look_ups()
+{
+  const int r_from_cr = FIX(1.40200f), b_from_cb = FIX(1.77200f), g_from_cr = -FIX(0.71414f), g_from_cb = -FIX(0.34414f);
+  for (int chroma = -128; chroma <= 127; chroma++) // the table index is chroma + 128
+  {
+    m_crr[chroma + 128] = (r_from_cr * chroma + ONE_HALF) >> SCALEBITS;
+    m_cbb[chroma + 128] = (b_from_cb * chroma + ONE_HALF) >> SCALEBITS;
+    m_crg[chroma + 128] = g_from_cr * chroma; // left unshifted, like m_cbg below
+    m_cbg[chroma + 128] = g_from_cb * chroma + ONE_HALF;
+  }
+}
+
+// This method throws back into the stream any bytes that were read
+// into the bit buffer during initial marker scanning.
+void jpeg_decoder::fix_in_buffer()
+{
+  // In case any 0xFF's were pulled into the buffer during marker scanning.
+  JPGD_ASSERT((m_bits_left & 7) == 0);
+
+  if (m_bits_left == 16) // lowest byte of the bit buffer is returned only when 16 bits are left
+    stuff_char( (uint8)(m_bit_buf & 0xFF));
+
+  if (m_bits_left >= 8) // second-lowest byte is returned when at least 8 bits are left
+    stuff_char( (uint8)((m_bit_buf >> 8) & 0xFF));
+
+  stuff_char((uint8)((m_bit_buf >> 16) & 0xFF)); // the two upper bytes are always returned; NOTE(review): stuff_char() is defined elsewhere, presumably it pushes one byte back in front of the read position - confirm
+  stuff_char((uint8)((m_bit_buf >> 24) & 0xFF));
+
+  m_bits_left = 16; // re-prime the bit buffer, this time through get_bits_no_markers()
+  get_bits_no_markers(16);
+  get_bits_no_markers(16);
+}
+
+void jpeg_decoder::transform_mcu(int mcu_row) // runs idct() on every coefficient block of one MCU
+{
+  // Source: m_pMCU_coefficients; destination: this MCU's slot in m_pSample_buf. Both advance 64 entries per block.
+  jpgd_block_t* const pCoeffs = m_pMCU_coefficients;
+  uint8* const pSamples = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
+
+  for (int blk = 0; blk < m_blocks_per_mcu; blk++)
+  {
+    const int ofs = blk * 64;
+    idct(pCoeffs + ofs, pSamples + ofs, m_mcu_block_max_zag[blk]);
+  }
+}
+
+static const uint8 s_max_rc[64] = // indexed by (max zag count - 1); each value is rows*16 + cols, matching the P_Q<rows, cols> / R_S<rows, cols> case labels in transform_mcu_expand()
+{
+  17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
+  102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
+  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
+};
+
+void jpeg_decoder::transform_mcu_expand(int mcu_row) // like transform_mcu(), but each of the two chroma blocks is expanded into four 8x8 output blocks
+{
+  jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
+  uint8* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
+
+  // Y IDCT: the first m_expanded_blocks_per_component blocks go through the regular idct()
+    int mcu_block;
+  for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++)
+  {
+    idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
+    pSrc_ptr += 64;
+    pDst_ptr += 64;
+  }
+
+  // Chroma IDCT, with upsampling
+    jpgd_block_t temp_block[64];
+
+  for (int i = 0; i < 2; i++) // two source blocks follow the Y blocks; mcu_block keeps counting from the loop above
+  {
+    DCT_Upsample::Matrix44 P, Q, R, S;
+
+    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] >= 1);
+    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] <= 64);
+
+    int max_zag = m_mcu_block_max_zag[mcu_block++] - 1; // count minus one, used as the index into s_max_rc
+    if (max_zag <= 0) max_zag = 0; // should never happen, only here to shut up static analysis
+    switch (s_max_rc[max_zag]) // the table value encodes rows*16 + cols and selects the matching P_Q / R_S instantiation
+    {
+    case 1*16+1:
+      DCT_Upsample::P_Q<1, 1>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<1, 1>::calc(R, S, pSrc_ptr);
+      break;
+    case 1*16+2:
+      DCT_Upsample::P_Q<1, 2>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<1, 2>::calc(R, S, pSrc_ptr);
+      break;
+    case 2*16+2:
+      DCT_Upsample::P_Q<2, 2>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<2, 2>::calc(R, S, pSrc_ptr);
+      break;
+    case 3*16+2:
+      DCT_Upsample::P_Q<3, 2>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<3, 2>::calc(R, S, pSrc_ptr);
+      break;
+    case 3*16+3:
+      DCT_Upsample::P_Q<3, 3>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<3, 3>::calc(R, S, pSrc_ptr);
+      break;
+    case 3*16+4:
+      DCT_Upsample::P_Q<3, 4>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<3, 4>::calc(R, S, pSrc_ptr);
+      break;
+    case 4*16+4:
+      DCT_Upsample::P_Q<4, 4>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<4, 4>::calc(R, S, pSrc_ptr);
+      break;
+    case 5*16+4:
+      DCT_Upsample::P_Q<5, 4>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<5, 4>::calc(R, S, pSrc_ptr);
+      break;
+    case 5*16+5:
+      DCT_Upsample::P_Q<5, 5>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<5, 5>::calc(R, S, pSrc_ptr);
+      break;
+    case 5*16+6:
+      DCT_Upsample::P_Q<5, 6>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<5, 6>::calc(R, S, pSrc_ptr);
+      break;
+    case 6*16+6:
+      DCT_Upsample::P_Q<6, 6>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<6, 6>::calc(R, S, pSrc_ptr);
+      break;
+    case 7*16+6:
+      DCT_Upsample::P_Q<7, 6>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<7, 6>::calc(R, S, pSrc_ptr);
+      break;
+    case 7*16+7:
+      DCT_Upsample::P_Q<7, 7>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<7, 7>::calc(R, S, pSrc_ptr);
+      break;
+    case 7*16+8:
+      DCT_Upsample::P_Q<7, 8>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<7, 8>::calc(R, S, pSrc_ptr);
+      break;
+    case 8*16+8:
+      DCT_Upsample::P_Q<8, 8>::calc(P, Q, pSrc_ptr);
+      DCT_Upsample::R_S<8, 8>::calc(R, S, pSrc_ptr);
+      break;
+    default:
+      JPGD_ASSERT(false); // every s_max_rc entry matches one of the cases above
+    }
+
+    DCT_Upsample::Matrix44 a(P + Q); P -= Q; // a = P + Q; P is then overwritten with P - Q
+    DCT_Upsample::Matrix44& b = P; // b aliases P, i.e. P - Q
+    DCT_Upsample::Matrix44 c(R + S); R -= S; // c = R + S; R is then overwritten with R - S
+    DCT_Upsample::Matrix44& d = R; // d aliases R, i.e. R - S
+
+    DCT_Upsample::Matrix44::add_and_store(temp_block, a, c); // output block 1 of 4: built from a and c via add_and_store
+    idct_4x4(temp_block, pDst_ptr);
+    pDst_ptr += 64;
+
+    DCT_Upsample::Matrix44::sub_and_store(temp_block, a, c); // output block 2 of 4: built from a and c via sub_and_store
+    idct_4x4(temp_block, pDst_ptr);
+    pDst_ptr += 64;
+
+    DCT_Upsample::Matrix44::add_and_store(temp_block, b, d); // output block 3 of 4: built from b and d via add_and_store
+    idct_4x4(temp_block, pDst_ptr);
+    pDst_ptr += 64;
+
+    DCT_Upsample::Matrix44::sub_and_store(temp_block, b, d); // output block 4 of 4: built from b and d via sub_and_store
+    idct_4x4(temp_block, pDst_ptr);
+    pDst_ptr += 64;
+
+    pSrc_ptr += 64; // advance to the next source block
+  }
+}
+
+// Loads and dequantizes the next row of (already decoded) coefficients.
+// Progressive images only.
+void jpeg_decoder::load_next_row()
+{
+  // Running block column of each component within the current row of MCUs.
+  int comp_block_x[JPGD_MAX_COMPONENTS];
+  memset(comp_block_x, 0, sizeof(comp_block_x));
+
+  for (int mcu = 0; mcu < m_mcus_per_row; mcu++)
+  {
+    // Position of the current block inside its component's h_samp x v_samp group.
+    int sub_x = 0;
+    int sub_y = 0;
+
+    for (int blk = 0; blk < m_blocks_per_mcu; blk++)
+    {
+      const int comp = m_mcu_org[blk];
+      const jpgd_quant_t *pQuant = m_quant[m_comp_quant[comp]];
+      jpgd_block_t *pOut = m_pMCU_coefficients + 64 * blk;
+
+      // Fetch the stored block: entry 0 from the DC buffer, entries 1..63 from the AC buffer.
+      const int block_x = comp_block_x[comp] + sub_x;
+      const int block_y = m_block_y_mcu[comp] + sub_y;
+      jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[comp], block_x, block_y);
+      jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[comp], block_x, block_y);
+      pOut[0] = pDC[0];
+      memcpy(&pOut[1], &pAC[1], 63 * sizeof(jpgd_block_t));
+
+      // Find the last non-zero coefficient in zag order (position 0 is never tested, so the count is at least 1).
+      int last = 63;
+      while ((last > 0) && !pOut[g_ZAG[last]])
+        last--;
+      m_mcu_block_max_zag[blk] = last + 1;
+
+      // Dequantize the non-zero entries up to that point.
+      for (int zag = last; zag >= 0; zag--)
+      {
+        jpgd_block_t &coeff = pOut[g_ZAG[zag]];
+        if (coeff)
+          coeff = static_cast<jpgd_block_t>(coeff * pQuant[zag]);
+      }
+
+      if (m_comps_in_scan == 1)
+      {
+        comp_block_x[comp]++;
+      }
+      else if (++sub_x == m_comp_h_samp[comp])
+      {
+        sub_x = 0;
+        if (++sub_y == m_comp_v_samp[comp])
+        {
+          sub_y = 0;
+          comp_block_x[comp] += m_comp_h_samp[comp];
+        }
+      }
+    }
+
+    if (m_freq_domain_chroma_upsample)
+      transform_mcu_expand(mcu);
+    else
+      transform_mcu(mcu);
+  }
+
+  // Advance each component's block row for the next call.
+  if (m_comps_in_scan == 1)
+  {
+    m_block_y_mcu[m_comp_list[0]]++;
+  }
+  else
+  {
+    for (int scan_comp = 0; scan_comp < m_comps_in_scan; scan_comp++)
+    {
+      const int comp = m_comp_list[scan_comp];
+      m_block_y_mcu[comp] += m_comp_v_samp[comp];
+    }
+  }
+}
+
+// Restart interval processing.
+// Finds the next restart marker, checks that it is the one expected in sequence, then resets the prediction state and restarts the bit buffer.
+void jpeg_decoder::process_restart()
+{
+  // Align to a byte boundary
+  // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
+  //get_bits_no_markers(m_bits_left & 7);
+
+  // Let's scan a little bit to find the marker, but not _too_ far.
+  // 1536 is a "fudge factor" that determines how much to scan.
+  int budget = 1536;
+  int marker = 0;
+
+  // Step 1: find a 0xFF byte.
+  while ((budget > 0) && (get_char() != 0xFF))
+    budget--;
+  if (budget == 0)
+    stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+  // Step 2: skip any further 0xFF bytes; the first other byte is the marker code.
+  while (budget > 0)
+  {
+    marker = get_char();
+    if (marker != 0xFF)
+      break;
+    budget--;
+  }
+  if (budget == 0)
+    stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+  // Is it the expected marker? If not, something bad happened.
+  if (marker != (m_next_restart_num + M_RST0))
+    stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+  // Reset each component's DC prediction values.
+  memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+  m_eob_run = 0;
+  m_restarts_left = m_restart_interval;
+  m_next_restart_num = (m_next_restart_num + 1) & 7;
+
+  // Get the bit buffer going again...
+  m_bits_left = 16;
+  get_bits_no_markers(16);
+  get_bits_no_markers(16);
+}
+
+static inline int dequantize_ac(int c, int q) { return c * q; } // product of a decoded coefficient value and its quantizer entry
+
+// Decodes and dequantizes the next row of coefficients; each MCU is transformed as soon as its blocks are decoded.
+void jpeg_decoder::decode_next_row()
+{
+  int row_block = 0; // NOTE(review): incremented below but never read in this function
+
+  for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+  {
+    if ((m_restart_interval) && (m_restarts_left == 0)) // restart intervals enabled and the current one is used up
+      process_restart();
+
+    jpgd_block_t* p = m_pMCU_coefficients; // p steps through the MCU's blocks, 64 coefficients each
+    for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
+    {
+      int component_id = m_mcu_org[mcu_block];
+      jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
+
+      int r, s;
+      s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r); // NOTE(review): r is filled in by huff_decode, presumably with the extra bits that follow the code - confirm
+      s = JPGD_HUFF_EXTEND(r, s);
+
+      m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]); // the decoded value is a difference from this component's previous DC value
+
+      p[0] = static_cast<jpgd_block_t>(s * q[0]);
+
+      int prev_num_set = m_mcu_block_max_zag[mcu_block]; // count left in this slot by the previous MCU; positions below it that are skipped now must be zeroed
+
+      huff_tables *pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
+
+      int k;
+      for (k = 1; k < 64; k++)
+      {
+        int extra_bits;
+        s = huff_decode(pH, extra_bits);
+
+        r = s >> 4; // upper nibble: number of positions to skip before the next coefficient
+        s &= 15; // lower nibble: zero means "no coefficient follows" (see the else branch)
+
+        if (s)
+        {
+          if (r)
+          {
+            if ((k + r) > 63)
+              stop_decoding(JPGD_DECODE_ERROR);
+
+            if (k < prev_num_set)
+            {
+              int n = JPGD_MIN(r, prev_num_set - k);
+              int kt = k;
+              while (n--)
+                p[g_ZAG[kt++]] = 0;
+            }
+
+            k += r;
+          }
+          // NOTE(review): JPGD_HUFF_EXTEND is a macro defined elsewhere; presumably it turns the extra bits into a signed value of size s - confirm
+          s = JPGD_HUFF_EXTEND(extra_bits, s);
+
+          JPGD_ASSERT(k < 64);
+
+          p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
+        }
+        else
+        {
+          if (r == 15) // s == 0 with r == 15: a run of 16 skipped positions
+          {
+            if ((k + 16) > 64)
+              stop_decoding(JPGD_DECODE_ERROR);
+
+            if (k < prev_num_set)
+            {
+              int n = JPGD_MIN(16, prev_num_set - k);
+              int kt = k;
+              while (n--)
+              {
+                JPGD_ASSERT(kt <= 63);
+                p[g_ZAG[kt++]] = 0;
+              }
+            }
+
+            k += 16 - 1; // - 1 because the loop counter is k
+            JPGD_ASSERT(p[g_ZAG[k]] == 0);
+          }
+          else
+            break; // s == 0 with any other r ends the block
+        }
+      }
+
+      if (k < prev_num_set) // clear whatever the previous MCU left beyond the last position written now
+      {
+        int kt = k;
+        while (kt < prev_num_set)
+          p[g_ZAG[kt++]] = 0;
+      }
+
+      m_mcu_block_max_zag[mcu_block] = k; // remembered for the transform and for the next MCU's clean-up
+
+      row_block++;
+    }
+
+    if (m_freq_domain_chroma_upsample)
+      transform_mcu_expand(mcu_row);
+    else
+      transform_mcu(mcu_row);
+
+    m_restarts_left--;
+  }
+}
+
+// YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB: fills m_pScan_line_0 with 4-byte pixels.
+void jpeg_decoder::H1V1Convert()
+{
+  // Line of the current MCU row that is being emitted.
+  const int line = m_max_mcu_y_size - m_mcu_lines_left;
+  const uint8 *pSrc = m_pSample_buf + line * 8;
+  uint8 *pDst = m_pScan_line_0;
+
+  // Every MCU contributes three 64-sample blocks, read here as Y, Cb and Cr.
+  for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++, pSrc += 64 * 3)
+  {
+    for (int x = 0; x < 8; x++, pDst += 4)
+    {
+      const int luma = pSrc[x];
+      const int cb = pSrc[64 + x];
+      const int cr = pSrc[128 + x];
+      // Green offset: sum of both chroma table entries, shifted down by 16 bits.
+      const int green_ofs = (m_crg[cr] + m_cbg[cb]) >> 16;
+
+      pDst[0] = clamp(luma + m_crr[cr]);
+      pDst[1] = clamp(luma + green_ofs);
+      pDst[2] = clamp(luma + m_cbb[cb]);
+      pDst[3] = 255;
+    }
+  }
+}
+
+// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB. Writes one line of 4-byte pixels to m_pScan_line_0.
+void jpeg_decoder::H2V1Convert()
+{
+  int row = m_max_mcu_y_size - m_mcu_lines_left; // line index inside the current MCU row
+  uint8 *d0 = m_pScan_line_0;
+  uint8 *y = m_pSample_buf + row * 8;
+  uint8 *c = m_pSample_buf + 2*64 + row * 8; // chroma follows the two 64-sample luma blocks
+  // Per MCU: 2 luma blocks, each paired with 4 chroma samples that colour 2 pixels apiece (16 pixels).
+  for (int i = m_max_mcus_per_row; i > 0; i--)
+  {
+    for (int l = 0; l < 2; l++) // one pass per luma block
+    {
+      for (int j = 0; j < 4; j++)
+      {
+        int cb = c[0];
+        int cr = c[64]; // the Cr block sits 64 samples after Cb
+        // Chroma contributions are computed once and shared by the two pixels below.
+        int rc = m_crr[cr];
+        int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+        int bc = m_cbb[cb];
+
+        int yy = y[j<<1];
+        d0[0] = clamp(yy+rc);
+        d0[1] = clamp(yy+gc);
+        d0[2] = clamp(yy+bc);
+        d0[3] = 255;
+
+        yy = y[(j<<1)+1];
+        d0[4] = clamp(yy+rc);
+        d0[5] = clamp(yy+gc);
+        d0[6] = clamp(yy+bc);
+        d0[7] = 255;
+
+        d0 += 8;
+
+        c++;
+      }
+      y += 64; // on to the next luma block of this MCU
+    }
+    // Both pointers end up exactly one MCU (4 blocks of 64 samples) further along.
+    y += 64*4 - 64*2;
+    c += 64*4 - 8;
+  }
+}
+
+// YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB. Produces two lines per call: m_pScan_line_0 and m_pScan_line_1.
+void jpeg_decoder::H1V2Convert()
+{
+  int row = m_max_mcu_y_size - m_mcu_lines_left;
+  uint8 *d0 = m_pScan_line_0;
+  uint8 *d1 = m_pScan_line_1;
+  uint8 *y;
+  uint8 *c;
+  // Rows 0-7 are read from the first luma block, rows 8-15 from the second one.
+  if (row < 8)
+    y = m_pSample_buf + row * 8;
+  else
+    y = m_pSample_buf + 64*1 + (row & 7) * 8;
+  // One chroma row (row >> 1) is shared by the two luma rows converted here.
+  c = m_pSample_buf + 64*2 + (row >> 1) * 8;
+
+  for (int i = m_max_mcus_per_row; i > 0; i--)
+  {
+    for (int j = 0; j < 8; j++)
+    {
+      int cb = c[0+j];
+      int cr = c[64+j];
+
+      int rc = m_crr[cr];
+      int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+      int bc = m_cbb[cb];
+
+      int yy = y[j];
+      d0[0] = clamp(yy+rc);
+      d0[1] = clamp(yy+gc);
+      d0[2] = clamp(yy+bc);
+      d0[3] = 255;
+      // Second output line: the luma row directly below, same chroma.
+      yy = y[8+j];
+      d1[0] = clamp(yy+rc);
+      d1[1] = clamp(yy+gc);
+      d1[2] = clamp(yy+bc);
+      d1[3] = 255;
+
+      d0 += 4;
+      d1 += 4;
+    }
+
+    y += 64*4;
+    c += 64*4;
+  }
+}
+
+// YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB. Produces two lines per call: m_pScan_line_0 and m_pScan_line_1.
+void jpeg_decoder::H2V2Convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;
+    uint8 *d0 = m_pScan_line_0;
+    uint8 *d1 = m_pScan_line_1;
+    uint8 *y;
+    uint8 *c;
+    // Rows 0-7 are read from the first two luma blocks, rows 8-15 from the last two.
+    if (row < 8)
+        y = m_pSample_buf + row * 8;
+    else
+        y = m_pSample_buf + 64*2 + (row & 7) * 8;
+    // Chroma blocks follow the four luma blocks; one chroma row serves two luma rows.
+    c = m_pSample_buf + 64*4 + (row >> 1) * 8;
+
+    for (int i = m_max_mcus_per_row; i > 0; i--)
+    {
+        for (int l = 0; l < 2; l++) // first, then second luma block of the selected pair
+        {
+            for (int j = 0; j < 8; j += 2) // each chroma sample covers a 2x2 pixel square
+            {
+                int cb = c[0];
+                int cr = c[64];
+
+                int rc = m_crr[cr];
+                int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+                int bc = m_cbb[cb];
+
+                int yy = y[j];
+                d0[0] = clamp(yy+rc);
+                d0[1] = clamp(yy+gc);
+                d0[2] = clamp(yy+bc);
+                d0[3] = 255;
+
+                yy = y[j+1];
+                d0[4] = clamp(yy+rc);
+                d0[5] = clamp(yy+gc);
+                d0[6] = clamp(yy+bc);
+                d0[7] = 255;
+                // The same two columns of the luma row below go to the second scan line.
+                yy = y[j+8];
+                d1[0] = clamp(yy+rc);
+                d1[1] = clamp(yy+gc);
+                d1[2] = clamp(yy+bc);
+                d1[3] = 255;
+
+                yy = y[j+8+1];
+                d1[4] = clamp(yy+rc);
+                d1[5] = clamp(yy+gc);
+                d1[6] = clamp(yy+bc);
+                d1[7] = 255;
+
+                d0 += 8;
+                d1 += 8;
+
+                c++;
+            }
+            y += 64;
+        }
+        // Net advance per MCU is 6 blocks of 64 samples for both pointers.
+        y += 64*6 - 64*2;
+        c += 64*6 - 8;
+    }
+}
+
+// Y (1 block per MCU) to 8-bit grayscale: copies one 8-sample row per MCU into m_pScan_line_0.
+void jpeg_decoder::gray_convert()
+{
+  int row = m_max_mcu_y_size - m_mcu_lines_left; // line index inside the current MCU row
+  uint8 *d = m_pScan_line_0;
+  const uint8 *s = m_pSample_buf + row * 8;
+
+  for (int i = m_max_mcus_per_row; i > 0; i--)
+  {
+    // Byte-wise copy of the 8 samples. The former "*(uint *)d = *(uint *)s" stores broke
+    // strict aliasing and relied on 4-byte alignment and a 4-byte uint.
+    memcpy(d, s, 8);
+    s += 64;
+    d += 8;
+  }
+}
+
+// Converts one line to 4-byte pixels when m_freq_domain_chroma_upsample is set (see decode()).
+// Y, Cb and Cr are read at matching positions; each plane spans m_expanded_blocks_per_component blocks.
+void jpeg_decoder::expanded_convert()
+{
+  const int line = m_max_mcu_y_size - m_mcu_lines_left;
+  const int cb_plane = 64 * m_expanded_blocks_per_component;
+  const int cr_plane = 64 * m_expanded_blocks_per_component * 2;
+  // (line / 8) picks the vertical block row, (line & 7) the sample row inside it.
+  const uint8* pMcu = m_pSample_buf + (line / 8) * 64 * m_comp_h_samp[0] + (line & 7) * 8;
+  uint8* pOut = m_pScan_line_0;
+
+  for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++, pMcu += 64 * m_expanded_blocks_per_mcu)
+  {
+    for (int x0 = 0; x0 < m_max_mcu_x_size; x0 += 8)
+    {
+      // Each group of 8 pixels lives in its own 64-sample block.
+      const uint8* pY = pMcu + x0 * 8;
+      const uint8* pCb = pY + cb_plane;
+      const uint8* pCr = pY + cr_plane;
+
+      for (int x = 0; x < 8; x++, pOut += 4)
+      {
+        const int luma = pY[x];
+        const int cb = pCb[x];
+        const int cr = pCr[x];
+        pOut[0] = clamp(luma + m_crr[cr]);
+        pOut[1] = clamp(luma + ((m_crg[cr] + m_cbg[cb]) >> 16));
+        pOut[2] = clamp(luma + m_cbb[cb]);
+        pOut[3] = 255;
+      }
+    }
+  }
+}
+
+// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
+void jpeg_decoder::find_eoi()
+{
+  if (!m_progressive_flag) // the marker search is skipped for progressive images
+  {
+    // Attempt to read the EOI marker.
+    //get_bits_no_markers(m_bits_left & 7);
+
+    // Prime the bit buffer
+    m_bits_left = 16;
+    get_bits(16);
+    get_bits(16);
+
+    // The next marker _should_ be EOI
+    process_markers();
+  }
+  // Bytes still waiting unread in the input buffer are not counted as consumed.
+  m_total_bytes_read -= m_in_buf_left;
+}
+// Hands out the next scan line: *pScan_line points at an internal line buffer, *pScan_line_len gets its byte length. Returns JPGD_SUCCESS, JPGD_DONE or JPGD_FAILED.
+int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len)
+{
+  if ((m_error_code) || (!m_ready_flag))
+    return JPGD_FAILED;
+
+  if (m_total_lines_left == 0)
+    return JPGD_DONE;
+  // No converted lines pending from the current MCU row: decode (or load) the next row first.
+  if (m_mcu_lines_left == 0)
+  {
+    if (setjmp(m_jmp_state)) // non-zero only when control returns here through a longjmp on m_jmp_state
+      return JPGD_FAILED;
+
+    if (m_progressive_flag)
+      load_next_row();
+    else
+      decode_next_row();
+
+    // Find the EOI marker if that was the last row.
+    if (m_total_lines_left <= m_max_mcu_y_size)
+      find_eoi();
+
+    m_mcu_lines_left = m_max_mcu_y_size;
+  }
+
+  if (m_freq_domain_chroma_upsample)
+  {
+    expanded_convert();
+    *pScan_line = m_pScan_line_0;
+  }
+  else
+  {
+    switch (m_scan_type)
+    {
+      case JPGD_YH2V2:
+      {
+        if ((m_mcu_lines_left & 1) == 0)
+        {
+          H2V2Convert();
+          *pScan_line = m_pScan_line_0;
+        }
+        else
+          *pScan_line = m_pScan_line_1;
+        // (odd count: the second line was already filled by the previous call's conversion)
+        break;
+      }
+      case JPGD_YH2V1:
+      {
+        H2V1Convert();
+        *pScan_line = m_pScan_line_0;
+        break;
+      }
+      case JPGD_YH1V2:
+      {
+        if ((m_mcu_lines_left & 1) == 0)
+        {
+          H1V2Convert();
+          *pScan_line = m_pScan_line_0;
+        }
+        else
+          *pScan_line = m_pScan_line_1;
+        // (same two-lines-per-conversion scheme as JPGD_YH2V2)
+        break;
+      }
+      case JPGD_YH1V1:
+      {
+        H1V1Convert();
+        *pScan_line = m_pScan_line_0;
+        break;
+      }
+      case JPGD_GRAYSCALE:
+      {
+        gray_convert();
+        *pScan_line = m_pScan_line_0;
+
+        break;
+      }
+    }
+  }
+
+  *pScan_line_len = m_real_dest_bytes_per_scan_line;
+
+  m_mcu_lines_left--;
+  m_total_lines_left--;
+
+  return JPGD_SUCCESS;
+}
+
+// Creates the tables needed for efficient Huffman decoding (fills *pH from m_huff_num[index] / m_huff_val[index]).
+void jpeg_decoder::make_huff_table(int index, huff_tables *pH)
+{
+  int p, i, l, si;
+  uint8 huffsize[257];
+  uint huffcode[257];
+  uint code;
+  uint subtree;
+  int code_size;
+  int lastp;
+  int nextfreeentry;
+  int currententry;
+
+  pH->ac_table = m_huff_ac[index] != 0;
+
+  p = 0;
+  // Expand the per-length counts (code lengths 1..16) into one code length per symbol.
+  for (l = 1; l <= 16; l++)
+  {
+    for (i = 1; i <= m_huff_num[index][l]; i++)
+      huffsize[p++] = static_cast<uint8>(l);
+  }
+
+  huffsize[p] = 0; // a zero length terminates the list
+
+  lastp = p;
+  // Hand out consecutive code values per length; the running code doubles whenever the length grows by one.
+  code = 0;
+  si = huffsize[0];
+  p = 0;
+
+  while (huffsize[p])
+  {
+    while (huffsize[p] == si)
+    {
+      huffcode[p++] = code;
+      code++;
+    }
+
+    code <<= 1;
+    si++;
+  }
+
+  memset(pH->look_up, 0, sizeof(pH->look_up));
+  memset(pH->look_up2, 0, sizeof(pH->look_up2));
+  memset(pH->tree, 0, sizeof(pH->tree));
+  memset(pH->code_size, 0, sizeof(pH->code_size));
+
+  nextfreeentry = -1; // tree links are negative numbers; the slot index is -entry - 1 (see below)
+
+  p = 0;
+
+  while (p < lastp)
+  {
+    i = m_huff_val[index][p];
+    code = huffcode[p];
+    code_size = huffsize[p];
+
+    pH->code_size[i] = static_cast<uint8>(code_size);
+    // Codes of up to 8 bits: every 8-bit pattern that starts with the code maps directly to the symbol.
+    if (code_size <= 8)
+    {
+      code <<= (8 - code_size);
+
+      for (l = 1 << (8 - code_size); l > 0; l--)
+      {
+        JPGD_ASSERT(i < 256);
+
+        pH->look_up[code] = i;
+        // look_up2 also packs the bit count to consume (from bit 8) and, if they fit in the same 8-bit window, the extra bits (flag 0x8000, value from bit 16).
+        bool has_extrabits = false;
+                int extra_bits = 0;
+        int num_extra_bits = i & 15;
+
+        int bits_to_fetch = code_size;
+        if (num_extra_bits)
+        {
+          int total_codesize = code_size + num_extra_bits;
+          if (total_codesize <= 8)
+          {
+            has_extrabits = true;
+            extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
+            JPGD_ASSERT(extra_bits <= 0x7FFF);
+            bits_to_fetch += num_extra_bits;
+          }
+        }
+
+        if (!has_extrabits)
+          pH->look_up2[code] = i | (bits_to_fetch << 8);
+        else
+          pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
+
+        code++;
+      }
+    }
+    else // longer codes: the first 8 bits select a look_up slot, the remaining bits walk pH->tree
+    {
+      subtree = (code >> (code_size - 8)) & 0xFF;
+
+      currententry = pH->look_up[subtree];
+
+      if (currententry == 0) // first code with this 8-bit prefix: reserve a pair of tree slots
+      {
+        pH->look_up[subtree] = currententry = nextfreeentry;
+        pH->look_up2[subtree] = currententry = nextfreeentry;
+
+        nextfreeentry -= 2;
+      }
+
+      code <<= (16 - (code_size - 8));
+      // Follow or create one link per remaining code bit; a 0 bit selects the second slot of the pair.
+      for (l = code_size; l > 9; l--)
+      {
+        if ((code & 0x8000) == 0)
+          currententry--;
+
+        if (pH->tree[-currententry - 1] == 0)
+        {
+          pH->tree[-currententry - 1] = nextfreeentry;
+
+          currententry = nextfreeentry;
+
+          nextfreeentry -= 2;
+        }
+        else
+          currententry = pH->tree[-currententry - 1];
+
+        code <<= 1;
+      }
+
+      if ((code & 0x8000) == 0)
+        currententry--;
+
+      pH->tree[-currententry - 1] = i; // final slot holds the symbol itself
+    }
+
+    p++;
+  }
+}
+
+// Stops decoding with JPGD_UNDEFINED_QUANT_TABLE unless every component of this scan has its quantization table.
+void jpeg_decoder::check_quant_tables()
+{
+  // m_comp_list maps scan positions to component ids; m_comp_quant maps those to table slots.
+  for (int slot = 0; slot < m_comps_in_scan; slot++)
+    if (!m_quant[m_comp_quant[m_comp_list[slot]]]) stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
+}
+
+// Checks that every Huffman table the current scan refers to has been defined,
+// then (re)builds the decoding tables for all defined Huffman tables.
+void jpeg_decoder::check_huff_tables()
+{
+  const bool needs_dc = (m_spectral_start == 0);
+  const bool needs_ac = (m_spectral_end > 0);
+
+  for (int n = 0; n < m_comps_in_scan; n++)
+  {
+    const int comp = m_comp_list[n];
+    if (needs_dc && !m_huff_num[m_comp_dc_tab[comp]]) stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+    if (needs_ac && !m_huff_num[m_comp_ac_tab[comp]]) stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+  }
+
+  for (int t = 0; t < JPGD_MAX_HUFF_TABLES; t++)
+  {
+    if (!m_huff_num[t]) continue;
+    if (!m_pHuff_tabs[t]) m_pHuff_tabs[t] = (huff_tables *)alloc(sizeof(huff_tables));
+    make_huff_table(t, m_pHuff_tabs[t]);
+  }
+}
+
+// Determines the component order inside each MCU.
+// Also calcs how many MCU's are on each row, etc.
+// Results go to m_comp_h_blocks/m_comp_v_blocks, m_mcus_per_row/m_mcus_per_col, m_mcu_org and m_blocks_per_mcu.
+void jpeg_decoder::calc_mcu_block_order()
+{
+  // Largest horizontal and vertical sampling factors over all frame components.
+  int h_max = 0;
+  int v_max = 0;
+
+  for (int c = 0; c < m_comps_in_frame; c++)
+  {
+    if (h_max < m_comp_h_samp[c])
+      h_max = m_comp_h_samp[c];
+
+    if (v_max < m_comp_v_samp[c])
+      v_max = m_comp_v_samp[c];
+  }
+
+  // Per-component block grid: the image size is scaled by the component's share of the
+  // maximum sampling factor (rounding up), then rounded up to whole 8x8 blocks.
+  for (int c = 0; c < m_comps_in_frame; c++)
+  {
+    m_comp_h_blocks[c] = ((((m_image_x_size * m_comp_h_samp[c]) + (h_max - 1)) / h_max) + 7) / 8;
+    m_comp_v_blocks[c] = ((((m_image_y_size * m_comp_v_samp[c]) + (v_max - 1)) / v_max) + 7) / 8;
+  }
+
+  if (m_comps_in_scan == 1)
+  {
+    // Single-component scan: each MCU is one block of that component,
+    // so the MCU grid is the component's own block grid.
+    const int only_id = m_comp_list[0];
+
+    m_mcus_per_row = m_comp_h_blocks[only_id];
+    m_mcus_per_col = m_comp_v_blocks[only_id];
+
+    m_mcu_org[0] = only_id;
+    m_blocks_per_mcu = 1;
+
+    return;
+  }
+
+  // Multi-component scan: the MCU grid follows the maximum sampling factors.
+  m_mcus_per_row = (((m_image_x_size + 7) / 8) + (h_max - 1)) / h_max;
+  m_mcus_per_col = (((m_image_y_size + 7) / 8) + (v_max - 1)) / v_max;
+
+  // Every scanned component adds h_samp * v_samp consecutive entries to the MCU block order.
+  m_blocks_per_mcu = 0;
+
+  for (int n = 0; n < m_comps_in_scan; n++)
+  {
+    const int id = m_comp_list[n];
+
+    for (int left = m_comp_h_samp[id] * m_comp_v_samp[id]; left != 0; left--)
+      m_mcu_org[m_blocks_per_mcu++] = id;
+  }
+}
+
+// Starts a new scan. Returns JPGD_FALSE if locate_sos_marker() reports none, JPGD_TRUE once the scan state is set up.
+int jpeg_decoder::init_scan()
+{
+  if (!locate_sos_marker())
+    return JPGD_FALSE;
+
+  calc_mcu_block_order();
+
+  check_huff_tables();
+
+  check_quant_tables();
+  // Every component starts the scan with a last-DC value of zero.
+  memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+
+  m_eob_run = 0;
+  // Restart counters are only armed when a restart interval is in effect.
+  if (m_restart_interval)
+  {
+    m_restarts_left = m_restart_interval;
+    m_next_restart_num = 0;
+  }
+
+  fix_in_buffer();
+
+  return JPGD_TRUE;
+}
+
+// Starts a frame. Determines if the number of components or sampling factors
+// are supported, then sizes and allocates the line, coefficient and sample buffers.
+void jpeg_decoder::init_frame()
+{
+  int i;
+
+  if (m_comps_in_frame == 1)
+  {
+    if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
+      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+    m_scan_type = JPGD_GRAYSCALE;
+    m_max_blocks_per_mcu = 1;
+    m_max_mcu_x_size = 8;
+    m_max_mcu_y_size = 8;
+  }
+  else if (m_comps_in_frame == 3)
+  {
+    if ( ((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
+         ((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)) )
+      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+    // Components 1 and 2 were required to be 1x1 above; component 0's factors select the layout.
+    if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
+    {
+      m_scan_type = JPGD_YH1V1;
+
+      m_max_blocks_per_mcu = 3;
+      m_max_mcu_x_size = 8;
+      m_max_mcu_y_size = 8;
+    }
+    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
+    {
+      m_scan_type = JPGD_YH2V1;
+      m_max_blocks_per_mcu = 4;
+      m_max_mcu_x_size = 16;
+      m_max_mcu_y_size = 8;
+    }
+    else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
+    {
+      m_scan_type = JPGD_YH1V2;
+      m_max_blocks_per_mcu = 4;
+      m_max_mcu_x_size = 8;
+      m_max_mcu_y_size = 16;
+    }
+    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
+    {
+      m_scan_type = JPGD_YH2V2;
+      m_max_blocks_per_mcu = 6;
+      m_max_mcu_x_size = 16;
+      m_max_mcu_y_size = 16;
+    }
+    else
+      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+  }
+  else
+    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+  m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
+  m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
+
+  // These values are for the *destination* pixels: after conversion.
+  if (m_scan_type == JPGD_GRAYSCALE)
+    m_dest_bytes_per_pixel = 1;
+  else
+    m_dest_bytes_per_pixel = 4;
+  // Buffer pitch: the width rounded up to a multiple of 16 pixels (the 0xFFF0 mask keeps bits 4-15 only).
+  m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
+  // Unpadded line length; this is the value decode() reports to callers.
+  m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
+
+  // Initialize two scan line buffers.
+  m_pScan_line_0 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
+  if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2)) // only the layouts converted two lines at a time
+    m_pScan_line_1 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
+
+  m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
+
+  // Should never happen
+  if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
+    stop_decoding(JPGD_ASSERTION_ERROR);
+
+  // Allocate the coefficient buffer, enough for one MCU
+  m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
+
+  for (i = 0; i < m_max_blocks_per_mcu; i++)
+    m_mcu_block_max_zag[i] = 64;
+  // Block counts for the expanded layout, where every component gets as many blocks as component 0.
+  m_expanded_blocks_per_component = m_comp_h_samp[0] * m_comp_v_samp[0];
+  m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
+  m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
+    // Freq. domain chroma upsampling is only supported for H2V2 subsampling factor (the most common one I've seen).
+  m_freq_domain_chroma_upsample = false;
+#if JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING
+  m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3);
+#endif
+
+  if (m_freq_domain_chroma_upsample)
+    m_pSample_buf = (uint8 *)alloc(m_expanded_blocks_per_row * 64);
+  else
+    m_pSample_buf = (uint8 *)alloc(m_max_blocks_per_row * 64);
+
+  m_total_lines_left = m_image_y_size;
+
+  m_mcu_lines_left = 0; // zero makes the first decode() call decode an MCU row
+
+  create_look_ups();
+}
+
+// Creates a coefficient buffer of block_num_x * block_num_y blocks, each holding
+// block_len_x * block_len_y coefficients. The coeff_buf methods originally kept the
+// coefficients in a cached "virtual" file (EMS, XMS or a disk file); now the whole
+// thing is simply stored in RAM.
+jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
+{
+  coeff_buf* const pBuf = (coeff_buf*)alloc(sizeof(coeff_buf));
+  // Record the geometry first; block_size is the byte size of a single block.
+  pBuf->block_len_x = block_len_x;
+  pBuf->block_len_y = block_len_y;
+  pBuf->block_num_x = block_num_x;
+  pBuf->block_num_y = block_num_y;
+  pBuf->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);
+  pBuf->pData = (uint8 *)alloc(pBuf->block_size * block_num_x * block_num_y, true);
+  return pBuf;
+}
+// Returns the address of block (block_x, block_y): blocks are block_size bytes each and stored row by row, block_num_x per row.
+inline jpgd_block_t *jpeg_decoder::coeff_buf_getp(coeff_buf *cb, int block_x, int block_y)
+{
+  JPGD_ASSERT((block_x < cb->block_num_x) && (block_y < cb->block_num_y));
+  return (jpgd_block_t *)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x));
+}
+
+// The following methods decode the various types of m_blocks encountered
+// in progressively encoded images.
+// DC first pass: decodes the DC difference, adds it to the component's last DC value and stores the sum scaled by 2^m_successive_low.
+void jpeg_decoder::decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+  int s, r;
+  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+  if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0)
+  {
+    r = pD->get_bits_no_markers(s);
+    s = JPGD_HUFF_EXTEND(r, s);
+  }
+
+  pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
+  // Multiply instead of "s << m_successive_low": s may be negative, and left-shifting a negative int is undefined before C++20.
+  p[0] = static_cast<jpgd_block_t>(s * (1 << pD->m_successive_low));
+}
+
+// DC refinement pass: reads one bit per block; when it is set, ORs bit m_successive_low into the stored DC coefficient.
+void jpeg_decoder::decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+  // A zero bit leaves the coefficient untouched.
+  if (!pD->get_bits_no_markers(1))
+    return;
+  jpgd_block_t *pCoeff = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+  pCoeff[0] |= (1 << pD->m_successive_low);
+}
+
+// AC first pass for one block: decodes run/size pairs for coefficients m_spectral_start..m_spectral_end (zig-zag order).
+void jpeg_decoder::decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+  int k, s, r;
+  // Inside an end-of-band run announced by an earlier block: use up one block of it and decode nothing.
+  if (pD->m_eob_run)
+  {
+    pD->m_eob_run--;
+    return;
+  }
+
+  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+  for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
+  {
+    s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+    // High nibble: run of coefficients to skip; low nibble: bit count of the value that follows.
+    r = s >> 4;
+    s &= 15;
+
+    if (s)
+    {
+      if ((k += r) > 63)
+        pD->stop_decoding(JPGD_DECODE_ERROR);
+
+      r = pD->get_bits_no_markers(s);
+      s = JPGD_HUFF_EXTEND(r, s);
+      // Multiply rather than shift: s may be negative, and left-shifting a negative int is undefined before C++20.
+      p[g_ZAG[k]] = static_cast<jpgd_block_t>(s * (1 << pD->m_successive_low));
+    }
+    else
+    {
+      if (r == 15)
+      {
+        if ((k += 15) > 63) // together with the loop's k++ this skips 16 coefficients
+          pD->stop_decoding(JPGD_DECODE_ERROR);
+      }
+      else
+      {
+        pD->m_eob_run = 1 << r; // run length: 2^r, plus r extra bits when r is non-zero
+
+        if (r)
+          pD->m_eob_run += pD->get_bits_no_markers(r);
+
+        pD->m_eob_run--; // the current block is the first block of the run
+        break;
+      }
+    }
+  }
+}
+
+// AC refinement pass for one block: adds one more bit of precision to already non-zero coefficients and places newly non-zero ones.
+void jpeg_decoder::decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+  int s, k, r;
+  int p1 = 1 << pD->m_successive_low;
+  // -p1 has the value the former "(-1) << m_successive_low" was meant to produce, without left-shifting a negative int (undefined before C++20).
+  int m1 = -p1;
+  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+  JPGD_ASSERT(pD->m_spectral_end <= 63);
+
+  k = pD->m_spectral_start;
+
+  if (pD->m_eob_run == 0)
+  {
+    for ( ; k <= pD->m_spectral_end; k++)
+    {
+      s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+
+      r = s >> 4;
+      s &= 15;
+
+      if (s)
+      {
+        if (s != 1) // a newly non-zero coefficient can only be +/- one unit in this pass
+          pD->stop_decoding(JPGD_DECODE_ERROR);
+
+        if (pD->get_bits_no_markers(1)) // sign bit: 1 selects +p1, 0 selects m1
+          s = p1;
+        else
+          s = m1;
+      }
+      else
+      {
+        if (r != 15)
+        {
+          pD->m_eob_run = 1 << r;
+
+          if (r)
+            pD->m_eob_run += pD->get_bits_no_markers(r);
+
+          break;
+        }
+      }
+      // Walk forward: each already non-zero coefficient consumes one correction bit; zero coefficients count down the run r.
+      do
+      {
+        jpgd_block_t *this_coef = p + g_ZAG[k & 63];
+
+        if (*this_coef != 0)
+        {
+          if (pD->get_bits_no_markers(1))
+          {
+            if ((*this_coef & p1) == 0)
+            {
+              if (*this_coef >= 0)
+                *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+              else
+                *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+            }
+          }
+        }
+        else
+        {
+          if (--r < 0)
+            break;
+        }
+
+        k++;
+
+      } while (k <= pD->m_spectral_end);
+
+      if ((s) && (k < 64))
+      {
+        p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
+      }
+    }
+  }
+  // Inside an end-of-band run only correction bits for already non-zero coefficients are read.
+  if (pD->m_eob_run > 0)
+  {
+    for ( ; k <= pD->m_spectral_end; k++)
+    {
+      jpgd_block_t *this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
+
+      if (*this_coef != 0)
+      {
+        if (pD->get_bits_no_markers(1))
+        {
+          if ((*this_coef & p1) == 0)
+          {
+            if (*this_coef >= 0)
+              *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+            else
+              *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+          }
+        }
+      }
+    }
+
+    pD->m_eob_run--;
+  }
+}
+
+// Decode a scan in a progressively encoded image: calls decode_block_func once for every block of the scan.
+void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
+{
+  int mcu_row, mcu_col, mcu_block;
+  int block_x_mcu[JPGD_MAX_COMPONENTS], m_block_y_mcu[JPGD_MAX_COMPONENTS];
+  // NOTE: m_block_y_mcu is a local despite its prefix. mcu_col counts MCU rows (outer loop), mcu_row the MCUs inside one row.
+  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+
+  for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++)
+  {
+    int component_num, component_id;
+
+    memset(block_x_mcu, 0, sizeof(block_x_mcu));
+
+    for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+    {
+      int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
+
+      if ((m_restart_interval) && (m_restarts_left == 0))
+        process_restart();
+
+      for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+      {
+        component_id = m_mcu_org[mcu_block];
+
+        decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+        // Multi-component scans step through the component's h_samp x v_samp blocks of this MCU before moving its origin right.
+        if (m_comps_in_scan == 1)
+          block_x_mcu[component_id]++;
+        else
+        {
+          if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
+          {
+            block_x_mcu_ofs = 0;
+
+            if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
+            {
+              block_y_mcu_ofs = 0;
+              block_x_mcu[component_id] += m_comp_h_samp[component_id];
+            }
+          }
+        }
+      }
+
+      m_restarts_left--;
+    }
+    // Row finished: move the vertical block origin of every scanned component down.
+    if (m_comps_in_scan == 1)
+      m_block_y_mcu[m_comp_list[0]]++;
+    else
+    {
+      for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+      {
+        component_id = m_comp_list[component_num];
+        m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+      }
+    }
+  }
+}
+
+// Decode a progressively encoded image: allocates per-component coefficient buffers and decodes every scan into them.
+void jpeg_decoder::init_progressive()
+{
+  int i;
+
+  if (m_comps_in_frame == 4)
+    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+  // Allocate the coefficient buffers.
+  for (i = 0; i < m_comps_in_frame; i++)
+  {
+    m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1);
+    m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8);
+  }
+  // One iteration per scan; the loop ends when init_scan() finds no further scan.
+  for ( ; ; )
+  {
+    int dc_only_scan, refinement_scan;
+    pDecode_block_func decode_block_func;
+
+    if (!init_scan())
+      break;
+
+    dc_only_scan = (m_spectral_start == 0);
+    refinement_scan = (m_successive_high != 0);
+
+    if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
+      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+    if (dc_only_scan)
+    {
+      if (m_spectral_end)
+        stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+    }
+    else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
+      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+    if ((refinement_scan) && (m_successive_low != m_successive_high - 1))
+      stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
+    // Pick the block decoder: DC or AC scan, first pass or refinement.
+    if (dc_only_scan)
+    {
+      if (refinement_scan)
+        decode_block_func = decode_block_dc_refine;
+      else
+        decode_block_func = decode_block_dc_first;
+    }
+    else
+    {
+      if (refinement_scan)
+        decode_block_func = decode_block_ac_refine;
+      else
+        decode_block_func = decode_block_ac_first;
+    }
+
+    decode_scan(decode_block_func);
+    // Same bit-buffer priming sequence as in find_eoi().
+    m_bits_left = 16;
+    get_bits(16);
+    get_bits(16);
+  }
+  // From here on all frame components are treated as a single scan covering the whole frame.
+  m_comps_in_scan = m_comps_in_frame;
+
+  for (i = 0; i < m_comps_in_frame; i++)
+    m_comp_list[i] = i;
+
+  calc_mcu_block_order();
+}
+// Sequential setup: a first scan must be found, otherwise decoding stops with JPGD_UNEXPECTED_MARKER.
+void jpeg_decoder::init_sequential()
+{
+  if (!init_scan())
+    stop_decoding(JPGD_UNEXPECTED_MARKER);
+}
+
+// Sets up the frame, then runs the sequential or the progressive scan initialisation.
+void jpeg_decoder::decode_start()
+{
+  init_frame();
+  if (!m_progressive_flag)
+    init_sequential();
+  else
+    init_progressive();
+}
+// Runs init(pStream) followed by locate_sof_marker(); called by the constructor.
+void jpeg_decoder::decode_init(jpeg_decoder_stream *pStream)
+{
+  init(pStream);
+  locate_sof_marker();
+}
+// Constructs the decoder by running decode_init(pStream). NOTE(review): failures presumably surface through get_error_code(), as checked in decompress_jpeg_image_from_stream().
+jpeg_decoder::jpeg_decoder(jpeg_decoder_stream *pStream)
+{
+  if (setjmp(m_jmp_state)) // non-zero only when control returns here through a longjmp on m_jmp_state
+    return;
+  decode_init(pStream);
+}
+
+// Prepares the decoder for decode() calls. Returns JPGD_SUCCESS when ready (or already ready), JPGD_FAILED otherwise.
+int jpeg_decoder::begin_decoding()
+{
+  if (!m_ready_flag)
+  {
+    // A previously recorded error is final.
+    if (m_error_code)
+      return JPGD_FAILED;
+    // A non-zero setjmp result means the setup below jumped back through m_jmp_state.
+    if (setjmp(m_jmp_state))
+      return JPGD_FAILED;
+
+    decode_start();
+    m_ready_flag = true;
+  }
+  return JPGD_SUCCESS;
+}
+// Destructor: delegates all cleanup to free_all_blocks().
+jpeg_decoder::~jpeg_decoder()
+{
+  free_all_blocks();
+}
+
+// Starts with no file attached and both status flags cleared.
+jpeg_decoder_file_stream::jpeg_decoder_file_stream()
+  : m_pFile(NULL),
+    m_eof_flag(false),
+    m_error_flag(false)
+{}
+
+void jpeg_decoder_file_stream::close()
+{
+  if (m_pFile)
+  {
+    fclose(m_pFile);
+    m_pFile = NULL;
+  }
+
+  m_eof_flag = false;
+  m_error_flag = false;
+}
+// Destructor: closes the file if it is still open (see close()).
+jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
+{
+  close();
+}
+// Opens Pfilename for binary reading, closing any previously opened file first. Returns true on success.
+bool jpeg_decoder_file_stream::open(const char *Pfilename)
+{
+  close();
+
+  m_eof_flag = false; // (close() has already cleared both flags)
+  m_error_flag = false;
+  // MSVC builds use fopen_s, everything else plain fopen.
+#if defined(_MSC_VER)
+  m_pFile = NULL;
+  fopen_s(&m_pFile, Pfilename, "rb");
+#else
+  m_pFile = fopen(Pfilename, "rb");
+#endif
+  return m_pFile != NULL;
+}
+
+// Reads up to max_bytes_to_read bytes into pBuf. Returns the byte count, or -1 when no file is open or a
+// read error occurred (now or earlier). *pEOF_flag is set to true at end of file and left untouched otherwise.
+int jpeg_decoder_file_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+{
+  if (m_pFile == NULL)
+    return -1;
+
+  // Once EOF was reached, every later call reports it again with 0 bytes.
+  if (m_eof_flag)
+  {
+    *pEOF_flag = true;
+    return 0;
+  }
+  if (m_error_flag)
+    return -1;
+
+  const int got = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
+  if (got >= max_bytes_to_read)
+    return got;
+  // Short read: either a stream error or the end of the file.
+  if (ferror(m_pFile))
+  {
+    m_error_flag = true;
+    return -1;
+  }
+  m_eof_flag = true;
+  *pEOF_flag = true;
+  return got;
+}
+// Attaches an in-memory JPEG (the pointer is stored, the data is not copied) and rewinds the read offset. Always returns true.
+bool jpeg_decoder_mem_stream::open(const uint8 *pSrc_data, uint size)
+{
+  close();
+  m_pSrc_data = pSrc_data;
+  m_ofs = 0;
+  m_size = size;
+  return true;
+}
+
+// Copies up to max_bytes_to_read bytes from the attached buffer into pBuf and advances the read offset.
+// Returns the number of bytes copied (-1 if no buffer is attached); *pEOF_flag is true when the request was truncated.
+int jpeg_decoder_mem_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+{
+  *pEOF_flag = false;
+  if (m_pSrc_data == NULL)
+    return -1;
+
+  const uint left = m_size - m_ofs;
+  const bool truncated = (uint)max_bytes_to_read > left;
+  if (truncated)
+    max_bytes_to_read = left;
+  *pEOF_flag = truncated;
+
+  const int count = max_bytes_to_read;
+  memcpy(pBuf, m_pSrc_data + m_ofs, count);
+  m_ofs += count;
+  return count;
+}
+// Decodes a whole image from pStream into a new jpgd_malloc'ed buffer of width * height * req_comps bytes (req_comps: 1, 3 or 4); returns NULL on bad arguments or any failure.
+unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps)
+{
+  if (!actual_comps)
+    return NULL;
+  *actual_comps = 0;
+
+  if ((!pStream) || (!width) || (!height) || (!req_comps))
+    return NULL;
+
+  if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4))
+    return NULL;
+
+  jpeg_decoder decoder(pStream);
+  if (decoder.get_error_code() != JPGD_SUCCESS)
+    return NULL;
+  // Dimensions and component count are reported to the caller even if the decoding below fails.
+  const int image_width = decoder.get_width(), image_height = decoder.get_height();
+  *width = image_width;
+  *height = image_height;
+  *actual_comps = decoder.get_num_components();
+
+  if (decoder.begin_decoding() != JPGD_SUCCESS)
+    return NULL;
+
+  const int dst_bpl = image_width * req_comps;
+
+  uint8 *pImage_data = (uint8*)jpgd_malloc(dst_bpl * image_height); // NOTE(review): presumably released by the caller with the matching free routine — confirm
+  if (!pImage_data)
+    return NULL;
+
+  for (int y = 0; y < image_height; y++)
+  {
+    const uint8* pScan_line;
+    uint scan_line_len;
+    if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
+    {
+      jpgd_free(pImage_data);
+      return NULL;
+    }
+
+    uint8 *pDst = pImage_data + y * dst_bpl;
+    // NOTE: with "#if 1" below, 3-component images requested as 4 bytes/pixel are written in B,G,R,A order; the same case named in the memcpy condition is then never reached.
+#if 1
+    //Return as BGRA
+    if ((req_comps == 4) && (decoder.get_num_components() == 3)) {
+      for (int x = 0; x < image_width; x++)
+      {
+        pDst[0] = pScan_line[x*4+2];
+        pDst[1] = pScan_line[x*4+1];
+        pDst[2] = pScan_line[x*4+0];
+        pDst[3] = 255;
+        pDst += 4;
+      }
+    } else
+#endif
+    if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3)))
+      memcpy(pDst, pScan_line, dst_bpl);
+    else if (decoder.get_num_components() == 1)
+    {
+      if (req_comps == 3)
+      {
+        for (int x = 0; x < image_width; x++)
+        {
+          uint8 luma = pScan_line[x];
+          pDst[0] = luma;
+          pDst[1] = luma;
+          pDst[2] = luma;
+          pDst += 3;
+        }
+      }
+      else
+      {
+        for (int x = 0; x < image_width; x++)
+        {
+          uint8 luma = pScan_line[x];
+          pDst[0] = luma;
+          pDst[1] = luma;
+          pDst[2] = luma;
+          pDst[3] = 255;
+          pDst += 4;
+        }
+      }
+    }
+    else if (decoder.get_num_components() == 3)
+    {
+      if (req_comps == 1)
+      {
+        const int YR = 19595, YG = 38470, YB = 7471; // integer luma weights for R, G and B; they sum to 65536, matching the >> 16 below
+        for (int x = 0; x < image_width; x++)
+        {
+          int r = pScan_line[x*4+0];
+          int g = pScan_line[x*4+1];
+          int b = pScan_line[x*4+2];
+          *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
+        }
+      }
+      else
+      {
+        for (int x = 0; x < image_width; x++)
+        {
+          pDst[0] = pScan_line[x*4+0];
+          pDst[1] = pScan_line[x*4+1];
+          pDst[2] = pScan_line[x*4+2];
+          pDst += 3;
+        }
+      }
+    }
+  }
+
+  return pImage_data;
+}
+// Decodes a JPEG held in memory: wraps the buffer in a jpeg_decoder_mem_stream and forwards to decompress_jpeg_image_from_stream().
+unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps)
+{
+  jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size);
+  return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps);
+}
+
+// Decodes the JPEG file pSrc_filename; returns NULL if the file cannot be opened or decoding fails.
+unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps)
+{
+  jpgd::jpeg_decoder_file_stream src;
+  const bool opened = src.open(pSrc_filename);
+  return opened ? decompress_jpeg_image_from_stream(&src, width, height, actual_comps, req_comps) : NULL;
+}
+
+} // namespace jpgd
diff --git a/src/loaders/jpg/jpgd.h b/src/loaders/jpg/jpgd.h
new file mode 100644
index 0000000..46069a1
--- /dev/null
+++ b/src/loaders/jpg/jpgd.h
@@ -0,0 +1,319 @@
+// jpgd.h - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+#ifndef JPEG_DECODER_H
+#define JPEG_DECODER_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <setjmp.h>
+
+#ifdef _MSC_VER
+  #define JPGD_NORETURN __declspec(noreturn) 
+#elif defined(__GNUC__)
+  #define JPGD_NORETURN __attribute__ ((noreturn))
+#else
+  #define JPGD_NORETURN
+#endif
+
+namespace jpgd
+{
+  typedef unsigned char  uint8;
+  typedef   signed short int16;
+  typedef unsigned short uint16;
+  typedef unsigned int   uint;
+  typedef   signed int   int32;
+
+  // Loads a JPEG image from a memory buffer or a file.
+  // req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
+  // On return, width/height will be set to the image's dimensions, and actual_comps will be set to the either 1 (grayscale) or 3 (RGB).
+  // Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
+  // Requesting a 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
+  unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps);
+  unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps);
+
+  // Success/failure error codes.
+  enum jpgd_status
+  {
+    JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
+    JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE, 
+    JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS, 
+    JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
+    JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
+    JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
+    JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
+    JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
+    JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM
+  };
+    
+  // Input stream interface.
+  // Derive from this class to read input data from sources other than files or memory. Set m_eof_flag to true when no more data is available.
+  // The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
+  // It the input stream contains data after the JPEG stream's EOI (end of image) marker it will probably be pulled into the internal buffer.
+  // Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
+  class jpeg_decoder_stream
+  {
+  public:
+    jpeg_decoder_stream() { }
+    virtual ~jpeg_decoder_stream() { }
+
+    // The read() method is called when the internal input buffer is empty.
+    // Parameters:
+    // pBuf - input buffer
+    // max_bytes_to_read - maximum bytes that can be written to pBuf
+    // pEOF_flag - set this to true if at end of stream (no more bytes remaining)
+    // Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
+    // Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
+    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag) = 0;
+  };
+
+  // stdio FILE stream class.
+  class jpeg_decoder_file_stream : public jpeg_decoder_stream
+  {
+    jpeg_decoder_file_stream(const jpeg_decoder_file_stream &);
+    jpeg_decoder_file_stream &operator =(const jpeg_decoder_file_stream &);
+
+    FILE *m_pFile;
+    bool m_eof_flag, m_error_flag;
+
+  public:
+    jpeg_decoder_file_stream();
+    virtual ~jpeg_decoder_file_stream();
+    
+    bool open(const char *Pfilename);
+    void close();
+
+    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
+  };
+
+  // Memory stream class.
+  class jpeg_decoder_mem_stream : public jpeg_decoder_stream
+  {
+    const uint8 *m_pSrc_data;
+    uint m_ofs, m_size;
+
+  public:
+    jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
+    jpeg_decoder_mem_stream(const uint8 *pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
+
+    virtual ~jpeg_decoder_mem_stream() { }
+
+    bool open(const uint8 *pSrc_data, uint size);
+    void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
+    
+    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
+  };
+
+  // Loads JPEG file from a jpeg_decoder_stream.
+  unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps);
+
+  enum 
+  { 
+    JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4, 
+    JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384 
+  };
+          
+  typedef int16 jpgd_quant_t;
+  typedef int16 jpgd_block_t;
+
+  class jpeg_decoder
+  {
+  public:
+    // Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
+    // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
+    jpeg_decoder(jpeg_decoder_stream *pStream);
+
+    ~jpeg_decoder();
+
+    // Call this method after constructing the object to begin decompression.
+    // If JPGD_SUCCESS is returned you may then call decode() on each scanline.
+    int begin_decoding();
+
+    // Returns the next scan line.
+    // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). 
+    // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
+    // Returns JPGD_SUCCESS if a scan line has been returned.
+    // Returns JPGD_DONE if all scan lines have been returned.
+    // Returns JPGD_FAILED if an error occurred. Call get_error_code() for a more info.
+    int decode(const void** pScan_line, uint* pScan_line_len);
+    
+    inline jpgd_status get_error_code() const { return m_error_code; }
+
+    inline int get_width() const { return m_image_x_size; }
+    inline int get_height() const { return m_image_y_size; }
+
+    inline int get_num_components() const { return m_comps_in_frame; }
+
+    inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
+    inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
+
+    // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
+    inline int get_total_bytes_read() const { return m_total_bytes_read; }
+    
+  private:
+    jpeg_decoder(const jpeg_decoder &);
+    jpeg_decoder &operator =(const jpeg_decoder &);
+
+    typedef void (*pDecode_block_func)(jpeg_decoder *, int, int, int);
+
+    struct huff_tables
+    {
+      bool ac_table;
+      uint  look_up[256];
+      uint  look_up2[256];
+      uint8 code_size[256];
+      uint  tree[512];
+    };
+
+    struct coeff_buf
+    {
+      uint8 *pData;
+      int block_num_x, block_num_y;
+      int block_len_x, block_len_y;
+      int block_size;
+    };
+
+    struct mem_block
+    {
+      mem_block *m_pNext;
+      size_t m_used_count;
+      size_t m_size;
+      char m_data[1];
+    };
+
+    jmp_buf m_jmp_state;
+    mem_block *m_pMem_blocks;
+    int m_image_x_size;
+    int m_image_y_size;
+    jpeg_decoder_stream *m_pStream;
+    int m_progressive_flag;
+    uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
+    uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
+    uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
+    jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
+    int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
+    int m_comps_in_frame;                         // # of components in frame
+    int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
+    int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
+    int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
+    int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
+    int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
+    int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
+    int m_comps_in_scan;                          // # of components in scan
+    int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
+    int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
+    int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
+    int m_spectral_start;                         // spectral selection start
+    int m_spectral_end;                           // spectral selection end
+    int m_successive_low;                         // successive approximation low
+    int m_successive_high;                        // successive approximation high
+    int m_max_mcu_x_size;                         // MCU's max. X size in pixels
+    int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
+    int m_blocks_per_mcu;
+    int m_max_blocks_per_row;
+    int m_mcus_per_row, m_mcus_per_col;
+    int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
+    int m_total_lines_left;                       // total # lines left in image
+    int m_mcu_lines_left;                         // total # lines left in this MCU
+    int m_real_dest_bytes_per_scan_line;
+    int m_dest_bytes_per_scan_line;               // rounded up
+    int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
+    huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
+    coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
+    coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
+    int m_eob_run;
+    int m_block_y_mcu[JPGD_MAX_COMPONENTS];
+    uint8* m_pIn_buf_ofs;
+    int m_in_buf_left;
+    int m_tem_flag;
+    bool m_eof_flag;
+    uint8 m_in_buf_pad_start[128];
+    uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
+    uint8 m_in_buf_pad_end[128];
+    int m_bits_left;
+    uint m_bit_buf;
+    int m_restart_interval;
+    int m_restarts_left;
+    int m_next_restart_num;
+    int m_max_mcus_per_row;
+    int m_max_blocks_per_mcu;
+    int m_expanded_blocks_per_mcu;
+    int m_expanded_blocks_per_row;
+    int m_expanded_blocks_per_component;
+    bool  m_freq_domain_chroma_upsample;
+    int m_max_mcus_per_col;
+    uint m_last_dc_val[JPGD_MAX_COMPONENTS];
+    jpgd_block_t* m_pMCU_coefficients;
+    int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
+    uint8* m_pSample_buf;
+    int m_crr[256];
+    int m_cbb[256];
+    int m_crg[256];
+    int m_cbg[256];
+    uint8* m_pScan_line_0;
+    uint8* m_pScan_line_1;
+    jpgd_status m_error_code;
+    bool m_ready_flag;
+    int m_total_bytes_read;
+
+    void free_all_blocks();
+    JPGD_NORETURN void stop_decoding(jpgd_status status);
+    void *alloc(size_t n, bool zero = false);
+    void word_clear(void *p, uint16 c, uint n);
+    void prep_in_buffer();
+    void read_dht_marker();
+    void read_dqt_marker();
+    void read_sof_marker();
+    void skip_variable_marker();
+    void read_dri_marker();
+    void read_sos_marker();
+    int next_marker();
+    int process_markers();
+    void locate_soi_marker();
+    void locate_sof_marker();
+    int locate_sos_marker();
+    void init(jpeg_decoder_stream * pStream);
+    void create_look_ups();
+    void fix_in_buffer();
+    void transform_mcu(int mcu_row);
+    void transform_mcu_expand(int mcu_row);
+    coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
+    inline jpgd_block_t *coeff_buf_getp(coeff_buf *cb, int block_x, int block_y);
+    void load_next_row();
+    void decode_next_row();
+    void make_huff_table(int index, huff_tables *pH);
+    void check_quant_tables();
+    void check_huff_tables();
+    void calc_mcu_block_order();
+    int init_scan();
+    void init_frame();
+    void process_restart();
+    void decode_scan(pDecode_block_func decode_block_func);
+    void init_progressive();
+    void init_sequential();
+    void decode_start();
+    void decode_init(jpeg_decoder_stream * pStream);
+    void H2V2Convert();
+    void H2V1Convert();
+    void H1V2Convert();
+    void H1V1Convert();
+    void gray_convert();
+    void expanded_convert();
+    void find_eoi();
+    inline uint get_char();
+    inline uint get_char(bool *pPadding_flag);
+    inline void stuff_char(uint8 q);
+    inline uint8 get_octet();
+    inline uint get_bits(int num_bits);
+    inline uint get_bits_no_markers(int numbits);
+    inline int huff_decode(huff_tables *pH);
+    inline int huff_decode(huff_tables *pH, int& extrabits);
+    static inline uint8 clamp(int i);
+    static void decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+  };
+  
+} // namespace jpgd
+
+#endif // JPEG_DECODER_H
diff --git a/src/loaders/jpg/meson.build b/src/loaders/jpg/meson.build
index 9784848..087e36b 100644
--- a/src/loaders/jpg/meson.build
+++ b/src/loaders/jpg/meson.build
@@ -1,5 +1,7 @@
 source_file = [
+   'jpgd.h',
    'tvgJpgLoader.h',
+   'jpgd.cpp',
    'tvgJpgLoader.cpp',
 ]
 
diff --git a/src/loaders/jpg/tvgJpgLoader.cpp b/src/loaders/jpg/tvgJpgLoader.cpp
index 850d79b..511a888 100644
--- a/src/loaders/jpg/tvgJpgLoader.cpp
+++ b/src/loaders/jpg/tvgJpgLoader.cpp
@@ -20,6 +20,7 @@
  * SOFTWARE.
  */
 
+#include "jpgd.h"
 #include "tvgLoader.h"
 #include "tvgJpgLoader.h"
 
@@ -32,53 +33,51 @@
 /* External Class Implementation                                        */
 /************************************************************************/
 
-JpgLoader::JpgLoader()
-{
-    //TODO:
-}
-
 
 JpgLoader::~JpgLoader()
 {
-    //TODO:
+    free(image);
+    image = NULL;
 }
 
-
 bool JpgLoader::open(const string& path)
 {
-    //TODO:
+    int width, height, actual_comps;
+    image = jpgd::decompress_jpeg_image_from_file(path.c_str(), &width, &height, &actual_comps, 4);
+    if (!image) return false;
 
-    return false;
-}
+    vw = w = width;
+    vh = h = height;
 
+    return true;
+}
 
 bool JpgLoader::open(const char* data, uint32_t size, bool copy)
 {
-    //TODO:
+    int width, height, actual_comps;
+    image = jpgd::decompress_jpeg_image_from_memory((const unsigned char *)data, size, &width, &height, &actual_comps, 4);
+    if (!image) return false;
 
-    return false;
+    vw = w = width;
+    vh = h = height;
+
+    return true;
 }
 
 
 bool JpgLoader::read()
 {
-    //TODO:
-
-    return false;
+    return true;
 }
 
 
 bool JpgLoader::close()
 {
-    //TODO:
-
-    return false;
+    return true;
 }
 
 
 const uint32_t* JpgLoader::pixels()
 {
-    //TODO:
-
-    return nullptr;
-}
\ No newline at end of file
+    return (const uint32_t*)image;
+}
diff --git a/src/loaders/jpg/tvgJpgLoader.h b/src/loaders/jpg/tvgJpgLoader.h
index c654d64..e12a907 100644
--- a/src/loaders/jpg/tvgJpgLoader.h
+++ b/src/loaders/jpg/tvgJpgLoader.h
@@ -26,7 +26,6 @@
 class JpgLoader : public LoadModule
 {
 public:
-    JpgLoader();
     ~JpgLoader();
 
     using LoadModule::open;
@@ -36,6 +35,9 @@ public:
     bool close() override;
 
     const uint32_t* pixels() override;
+
+private:
+    unsigned char *image = nullptr;
 };
 
 #endif //_TVG_JPG_LOADER_H_
-- 
2.7.4


From e43f396ef38b358f8fdb5d6f8e4ead673c9d4c0a Mon Sep 17 00:00:00 2001
From: Hermet Park <hermetpark@gmail.com>
Date: Tue, 26 Oct 2021 15:52:24 +0900
Subject: [PATCH 11/16] Update CONTRIBUTING.md

---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index fe702b4..5713ff5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -19,7 +19,7 @@ Hermet Park (hermet) is the lead maintainer. Also there are designated maintaine
 After updating ThorVG code, please verify your change doesn't break the library. We suggest unit-tests using the Catch2 framework. You can easily run it with build commands as the following: <br />
 <br/>
 `
-$meson . build -Dtests=true -Dloaders="all" -Dsavers="tvg" -Dbindings="capi" -Dtools="svg2png, svg2tvg" -Dlog=true
+$meson . build -Dtests=true -Dloaders="all" -Dsavers="tvg" -Dbindings="capi" -Dtools="all" -Dlog=true
 `
 <br />
 `
-- 
2.7.4


From f0eb141af5d0f59fe5815cd50cee6914e176c203 Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Tue, 26 Oct 2021 02:13:57 +0200
Subject: [PATCH 12/16] example: gradtransf.svg

This file makes it possible to check the correctness of gradient
transformations and units.
Also removed a duplicate file (ietf.svg.tvg).
---
 src/examples/images/gradtransf.svg |  64 +++++++++++++++++++++++++++++++++++++
 src/examples/images/gradtransf.tvg | Bin 0 -> 905 bytes
 src/examples/images/ietf.svg.tvg   | Bin 469 -> 0 bytes
 3 files changed, 64 insertions(+)
 create mode 100644 src/examples/images/gradtransf.svg
 create mode 100644 src/examples/images/gradtransf.tvg
 delete mode 100644 src/examples/images/ietf.svg.tvg

diff --git a/src/examples/images/gradtransf.svg b/src/examples/images/gradtransf.svg
new file mode 100644
index 0000000..4efc1b8
--- /dev/null
+++ b/src/examples/images/gradtransf.svg
@@ -0,0 +1,64 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="30 50 180 180">
+  <defs>
+<!-- LINEAR GRAD -->
+<!-- Object bbox -->
+    <linearGradient id="linearGradient1" x1="10%" y1="10%" x2="25%" y2="5%" gradientUnits="objectBoundingBox" spreadMethod="reflect" gradientTransform="matrix(4, 0, 0, 0.5, 0, 0)">
+      <stop style="stop-color:#ff0000;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#0000ff;stop-opacity:1;" offset="1"/>
+    </linearGradient>
+
+    <linearGradient id="linearGradient2" x1="0.1" y1="0.1" x2="0.25" y2="0.05" gradientUnits="objectBoundingBox" spreadMethod="reflect" gradientTransform="matrix(0.5, 0, 0, 4, 0, 0)">
+      <stop style="stop-color:#ff0000;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#0000ff;stop-opacity:1;" offset="1"/>
+    </linearGradient>
+
+<!-- User space -->
+    <linearGradient id="linearGradient3" x1="4%" y1="4%" x2="10%" y2="15%" gradientUnits="userSpaceOnUse" spreadMethod="reflect" gradientTransform="matrix(4, 1, 0.2, 0.5, 0, 0)">
+      <stop style="stop-color:#ff0000;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#0000ff;stop-opacity:1;" offset="1"/>
+    </linearGradient>
+
+    <linearGradient id="linearGradient4" x1="7" y1="7" x2="10" y2="40" gradientUnits="userSpaceOnUse" spreadMethod="reflect" gradientTransform="matrix(4, 3, 0, 0.5, 0, 0)">
+      <stop style="stop-color:#ff0000;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#0000ff;stop-opacity:1;" offset="1"/>
+    </linearGradient>
+
+<!-- RADIAL GRAD -->
+<!-- Object bbox -->
+    <radialGradient id="radialGradient1" r="10%" cx="100%" cy="50%" gradientUnits="objectBoundingBox" spreadMethod="reflect" gradientTransform="matrix(5.4, 1.1, 1, 0.5, 0, 0.3)">
+      <stop style="stop-color:#ff00ff;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#00ff00;stop-opacity:1;" offset="1"/>
+    </radialGradient>
+
+    <radialGradient id="radialGradient2" r="0.1" cx="1" cy="0.5" gradientUnits="objectBoundingBox" spreadMethod="reflect" gradientTransform="matrix(0.5, -0.3, -0.3, 4, 0, 0)">
+      <stop style="stop-color:#ff00ff;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#00ff00;stop-opacity:1;" offset="1"/>
+    </radialGradient>
+
+<!-- User space -->
+    <radialGradient id="radialGradient3" r="7%" cx="10%" cy="150%" gradientUnits="userSpaceOnUse" spreadMethod="reflect" gradientTransform="matrix(4, 0.4, 0.9, 0.5, 0, -10)">
+      <stop style="stop-color:#ff00ff;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#00ff00;stop-opacity:1;" offset="1"/>
+    </radialGradient>
+
+    <radialGradient id="radialGradient4" r="20" cx="20" cy="350" gradientUnits="userSpaceOnUse" spreadMethod="reflect" gradientTransform="matrix(5, 1.1, 0, 0.5, 0, -60)">
+      <stop style="stop-color:#ff00ff;stop-opacity:1;" offset="0"/>
+      <stop style="stop-color:#00ff00;stop-opacity:1;" offset="1"/>
+    </radialGradient>
+  </defs>
+
+  <g transform="matrix(0.9, -0.25, -0.1, 0.8, -12, 80)">
+    <g transform="translate(20, -5) scale(0.5, 1)">
+      <rect x="50" y="0" width="100" height="50" fill="url(#linearGradient1)" transform="matrix(0.5, 1, 1.4, 1.4, 0, -100)"/>
+      <rect x="200" y="0" width="100" height="60" fill="url(#linearGradient2)" transform="matrix(1.1, 0.2, 0.4, 1, -110, -20)"/>
+      <rect x="50" y="150" width="100" height="70" fill="url(#linearGradient3)" transform="matrix(0.9, 1.1, 1.4, 0.8, -100, -90)"/>
+      <rect x="200" y="150" width="100" height="70" fill="url(#linearGradient4)" transform="matrix(0.6, -0.1, -0.1, 1.3, 0, -60)"/>
+    </g>
+    <g transform="translate(120, 70) scale(0.5, 0.8) rotate(-25)">
+      <circle cx="50" cy="50" r="80" fill="url(#radialGradient1)" transform="matrix(0.5, -0.1, -0.1, 0.9, 50, 30)"/>
+      <circle cx="200" cy="50" r="50" fill="url(#radialGradient2)" transform="matrix(0.6, -0.1, 0.1, 1.3, 20, 50)"/>
+      <circle cx="50" cy="150" r="70" fill="url(#radialGradient3)" transform="matrix(0.8, 0.1, 0.1, 0.6, 30, 100)"/>
+      <circle cx="200" cy="150" r="70" fill="url(#radialGradient4)" transform="matrix(0.5, -0.3, 0.3, 0.5, 50, 160)"/>
+    </g>
+  </g>
+</svg>
diff --git a/src/examples/images/gradtransf.tvg b/src/examples/images/gradtransf.tvg
new file mode 100644
index 0000000000000000000000000000000000000000..3418b5348e027dc4ce88bef0d4e16f975f97987a
GIT binary patch
literal 905
zcmV;419tpWXm4^>M=&rjG%zp#05n4Y05n4Z3<dxIXafKMJsSW3{tg%b0H7bFfdGWS
zI0y#9!GUmw3<LpWprwcaK>ZjK;9vkT4IK{rCvqU5A0qz&X>qW?K>{5G#5{;ZNJsz{
zE;__HkS5QX0BA-aaqwT!9!SFJ^Dz)hux8A7{3x_8&Pf1lgxPdxvf#ml3qc-~_7hl;
zf#H4v39_-{N3A0Pz%_|dB}<nuW%A5<(4x)00O0&Yaxt*bgEMB#^O4}%s2&3y34<2}
z7mtBJtXeXwwd9Yk1HTSrcrZ>65JNO&8YIMwnI{)p31K_XV_1i9<ytLTcVNqbFKNdV
zF!Sfmn-`56=W?+pfaQh<@=$Qe$5RGY-3Sw>hsX~;6x=NZpdl+~09*}%Rw&vw016QS
zEx34kV2L5K0n~m=^1SYpD|@>uxV(Tu?xMep`)fJ926}F&fl}b&5g!nn!Iyy;2}TNB
zJn&^hL#)Ck81wq;ry8$VI4BGTLgImlW|%->2qdzAMK+FTbIYcYis~$`xPY+ZAb|#I
zprZkh9Duli;0eZ&E_nH=$$<nI0)-QFp$LE~eWG%RC!+-OCoX{+(@HG0B(q5|nH;gq
zHm}SwPA#W=6M!b61K=GAtneWfa`@;0l^jwLqK7>6@Fj#L2Kd2*L8SPJGp<eqEyaOW
z42hec^brDuOR~gIza8Vs?Z=da9EeB&jHE3|B^f-3hfc=WYal&_@W6suHp8_`0C2@a
zhfRc-wbp@<5$vL5i4lW^Ws4g^SYih<t~s1^<+a&oZ5qs>JbN%D5Fc*%CJ7(dabpr8
zWDy4vUlIwWgde2wq7q59q7+3HS#$`VArMPp2mv-x;>NbxBdN#!P7QL%gIdk+)g>_!
zf}1E}`3V#$g&@HdZ#b^@B6x&w2NfQMxH#hmnfqxu5aOs|vjs5<#$=QS0>w?^boLlz
zm}SmcFpY)ClMr@v@j(Cv*6Cpr1W*y83mF9QC5$H=@qrLEgu&M)eJ$lTBwqm-VhA&k
zVdmf*6F${c9vxN{;v<us>7tEIcH{1!4jIB{z6~6x@0*9n`y!gXb#4d=t_b1qi>f5t
zOOd_&n{mg5j&<Wa<;bT8AF5zy2p<TL<`^L~$YB;=@JRpzA55V_9<M_(DF7o50s+VT
f`q|;2A%MG$HvoJCAou`;BM|@^Y+e2$=51mg4vBeM

literal 0
HcmV?d00001

diff --git a/src/examples/images/ietf.svg.tvg b/src/examples/images/ietf.svg.tvg
deleted file mode 100644
index dc7c1f3981dbf673bbe6804e662e800a8f70c83d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 469
zcmV;`0V@7fXm4^>M=&rjG%zp#053xT0AxY|e**vjv;hDBn+*T}{`3q00HB|=00V~*
z7zm(OuVVjz5IYEf$3hqj0|?LvAmhf40yt^}`7xwNktIi-B)RhBKsZ7c0vQMiCIB!&
zT-Jn!69AB!HwVJ1*>fPWpFD*Q+&K`_(ShiO%0xI2C;+HRc|vmv01{6?JmK*K$Fu7|
z5H*2G9Xk+5&94K|)LaXot=XCg<npBCmM5#c1NYh;i03Z=W_$;dD|}cm09u6w3>o+q
zfCI?@=0<MJuCRy8lvP}w>$$N6(L7%c3_G(XikhhbV5#YIpzEMDR|7CwyJphZHJkE&
zikl~Z%eJl7l&!S2kbdqK2Jx^^io%5he?XZCKtPo30;X8M{x9Lef7YZx?6t7MLJAl~
z#+N@I<UnvzvGF8OGe6s!XLG^|u>WdetS6qX0$?fbE(3z62rlanfE6O-i9*$Qg5a{;
zGy{OL1amWxvcLd<;Wgs?8_2|kP{d5Cu=)XGq4KT+03s<CQeYtj;4^8Xfq;-|K>)HU
zjwgJ03UVMMYrAa07M$b>$|ZS1;W8uxFk&GPwhN%29cC2IJRn+FsDOpUBdNw3U%2eL
LHU}bOsD<PNWSzvz

-- 
2.7.4


From e649f684b8dcd7aefac6b212356284cb5261908b Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Tue, 26 Oct 2021 11:11:56 +0200
Subject: [PATCH 13/16] capi: tvg_paint_transform -> tvg_paint_set_transform

---
 src/bindings/capi/thorvg_capi.h | 2 +-
 src/bindings/capi/tvgCapi.cpp   | 2 +-
 src/examples/Capi.cpp           | 2 +-
 test/capi/capiPaint.cpp         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/bindings/capi/thorvg_capi.h b/src/bindings/capi/thorvg_capi.h
index b6c7ab4..b9f2667 100644
--- a/src/bindings/capi/thorvg_capi.h
+++ b/src/bindings/capi/thorvg_capi.h
@@ -780,7 +780,7 @@ TVG_EXPORT Tvg_Result tvg_paint_translate(Tvg_Paint* paint, float x, float y);
 * \retval TVG_RESULT_INVALID_ARGUMENT A @c nullptr is passed as the argument.
 * \retval TVG_RESULT_FAILED_ALLOCATION An internal error with memory allocation.
 */
-TVG_EXPORT Tvg_Result tvg_paint_transform(Tvg_Paint* paint, const Tvg_Matrix* m);
+TVG_EXPORT Tvg_Result tvg_paint_set_transform(Tvg_Paint* paint, const Tvg_Matrix* m);
 
 
 /*!
diff --git a/src/bindings/capi/tvgCapi.cpp b/src/bindings/capi/tvgCapi.cpp
index 860472a..7c2fd12 100644
--- a/src/bindings/capi/tvgCapi.cpp
+++ b/src/bindings/capi/tvgCapi.cpp
@@ -162,7 +162,7 @@ TVG_EXPORT Tvg_Result tvg_paint_translate(Tvg_Paint* paint, float x, float y)
 }
 
 
-TVG_EXPORT Tvg_Result tvg_paint_transform(Tvg_Paint* paint, const Tvg_Matrix* m)
+TVG_EXPORT Tvg_Result tvg_paint_set_transform(Tvg_Paint* paint, const Tvg_Matrix* m)
 {
     if (!paint || !m) return TVG_RESULT_INVALID_ARGUMENT;
     return (Tvg_Result) reinterpret_cast<Paint*>(paint)->transform(*(reinterpret_cast<const Matrix*>(m)));
diff --git a/src/examples/Capi.cpp b/src/examples/Capi.cpp
index 946dc8f..67649b1 100644
--- a/src/examples/Capi.cpp
+++ b/src/examples/Capi.cpp
@@ -191,7 +191,7 @@ void testCapi()
         tvg_picture_get_size(pict, &w, &h);
         tvg_picture_set_size(pict, w/2, h/2);
         Tvg_Matrix m = {0.8f, 0.0f, 400.0f, 0.0f, 0.8f, 400.0f, 0.0f, 0.0f, 1.0f};
-        tvg_paint_transform(pict, &m);
+        tvg_paint_set_transform(pict, &m);
 
         // Set a composite shape
         Tvg_Paint* comp = tvg_shape_new();
diff --git a/test/capi/capiPaint.cpp b/test/capi/capiPaint.cpp
index 383e045..d8feff2 100644
--- a/test/capi/capiPaint.cpp
+++ b/test/capi/capiPaint.cpp
@@ -31,7 +31,7 @@ TEST_CASE("Paint Transform", "[capiPaint]")
 
     Tvg_Matrix matrix_set = {1, 0, 0, 0, 1, 0, 0, 0, 1}, matrix_get;
 
-    REQUIRE(tvg_paint_transform(paint, &matrix_set) == TVG_RESULT_SUCCESS);
+    REQUIRE(tvg_paint_set_transform(paint, &matrix_set) == TVG_RESULT_SUCCESS);
     REQUIRE(tvg_paint_get_transform(paint, &matrix_get) == TVG_RESULT_SUCCESS);
     REQUIRE(matrix_get.e11 == Approx(matrix_set.e11).margin(0.000001));
     REQUIRE(matrix_get.e12 == Approx(matrix_set.e12).margin(0.000001));
-- 
2.7.4


From 6a6ccf869614c3fee4173e522fdcde534ecb7eee Mon Sep 17 00:00:00 2001
From: Hermet Park <chuneon.park@samsung.com>
Date: Tue, 26 Oct 2021 19:37:06 +0900
Subject: [PATCH 14/16] jpg_loader: code refactoring

Revise the code to conform to the conventions of the tvg code base.
No logical changes.

 - modified file names
 - added the samsung copyright
 - removed unnecessary typedefs
 - ++indentation
 - --LOC
---
 src/loaders/jpg/jpgd.cpp         | 3185 --------------------------------------
 src/loaders/jpg/jpgd.h           |  319 ----
 src/loaders/jpg/meson.build      |    4 +-
 src/loaders/jpg/tvgJpgLoader.cpp |   14 +-
 src/loaders/jpg/tvgJpgd.cpp      | 3014 ++++++++++++++++++++++++++++++++++++
 src/loaders/jpg/tvgJpgd.h        |   36 +
 6 files changed, 3059 insertions(+), 3513 deletions(-)
 delete mode 100644 src/loaders/jpg/jpgd.cpp
 delete mode 100644 src/loaders/jpg/jpgd.h
 create mode 100644 src/loaders/jpg/tvgJpgd.cpp
 create mode 100644 src/loaders/jpg/tvgJpgd.h

diff --git a/src/loaders/jpg/jpgd.cpp b/src/loaders/jpg/jpgd.cpp
deleted file mode 100644
index bd40cd3..0000000
--- a/src/loaders/jpg/jpgd.cpp
+++ /dev/null
@@ -1,3185 +0,0 @@
-// jpgd.cpp - C++ class for JPEG decompression.
-// Public domain, Rich Geldreich <richgel99@gmail.com>
-// Alex Evans: Linear memory allocator (taken from jpge.h).
-// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings (all looked harmless)
-//
-// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
-//
-// Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
-// Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
-// http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
-
-#include "jpgd.h"
-#include <string.h>
-
-#include <assert.h>
-#define JPGD_ASSERT(x) assert(x)
-
-#ifdef _MSC_VER
-#pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
-#endif
-
-// Set to 1 to enable freq. domain chroma upsampling on images using H2V2 subsampling (0=faster nearest neighbor sampling).
-// This is slower, but results in higher quality on images with highly saturated colors.
-#define JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING 1
-
-#define JPGD_TRUE (1)
-#define JPGD_FALSE (0)
-
-#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b))
-#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b))
-
-namespace jpgd {
-
-static inline void *jpgd_malloc(size_t nSize) { return malloc(nSize); }
-static inline void jpgd_free(void *p) { free(p); }
-
-// DCT coefficients are stored in this sequence.
-static int g_ZAG[64] = {  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
-
-enum JPEG_MARKER
-{
-  M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8,
-  M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC,
-  M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7,
-  M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF,
-  M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0
-};
-
-enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
-
-#define CONST_BITS  13
-#define PASS1_BITS  2
-#define SCALEDONE ((int32)1)
-
-#define FIX_0_298631336  ((int32)2446)        /* FIX(0.298631336) */
-#define FIX_0_390180644  ((int32)3196)        /* FIX(0.390180644) */
-#define FIX_0_541196100  ((int32)4433)        /* FIX(0.541196100) */
-#define FIX_0_765366865  ((int32)6270)        /* FIX(0.765366865) */
-#define FIX_0_899976223  ((int32)7373)        /* FIX(0.899976223) */
-#define FIX_1_175875602  ((int32)9633)        /* FIX(1.175875602) */
-#define FIX_1_501321110  ((int32)12299)       /* FIX(1.501321110) */
-#define FIX_1_847759065  ((int32)15137)       /* FIX(1.847759065) */
-#define FIX_1_961570560  ((int32)16069)       /* FIX(1.961570560) */
-#define FIX_2_053119869  ((int32)16819)       /* FIX(2.053119869) */
-#define FIX_2_562915447  ((int32)20995)       /* FIX(2.562915447) */
-#define FIX_3_072711026  ((int32)25172)       /* FIX(3.072711026) */
-
-#define DESCALE(x,n)  (((x) + (SCALEDONE << ((n)-1))) >> (n))
-#define DESCALE_ZEROSHIFT(x,n)  (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n))
-
-#define MULTIPLY(var, cnst)  ((var) * (cnst))
-
-#define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i))
-
-// Compiler creates a fast path 1D IDCT for X non-zero columns
-template <int NONZERO_COLS>
-struct Row
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-    // ACCESS_COL() will be optimized at compile time to either an array access, or 0.
-    #define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
-
-    const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
-
-    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-
-    const int tmp0 = (ACCESS_COL(0) + ACCESS_COL(4)) << CONST_BITS;
-    const int tmp1 = (ACCESS_COL(0) - ACCESS_COL(4)) << CONST_BITS;
-
-    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
-
-    const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
-
-    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
-    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
-
-    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
-    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
-    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
-    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
-
-    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
-    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
-    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
-    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
-
-    pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS);
-    pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
-    pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
-    pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
-    pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
-    pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
-    pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
-    pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
-  }
-};
-
-template <>
-struct Row<0>
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-#ifdef _MSC_VER
-    pTemp; pSrc;
-#endif
-  }
-};
-
-template <>
-struct Row<1>
-{
-  static void idct(int* pTemp, const jpgd_block_t* pSrc)
-  {
-    const int dcval = (pSrc[0] << PASS1_BITS);
-
-    pTemp[0] = dcval;
-    pTemp[1] = dcval;
-    pTemp[2] = dcval;
-    pTemp[3] = dcval;
-    pTemp[4] = dcval;
-    pTemp[5] = dcval;
-    pTemp[6] = dcval;
-    pTemp[7] = dcval;
-  }
-};
-
-// Compiler creates a fast path 1D IDCT for X non-zero rows
-template <int NONZERO_ROWS>
-struct Col
-{
-  static void idct(uint8* pDst_ptr, const int* pTemp)
-  {
-    // ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
-    #define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
-
-    const int z2 = ACCESS_ROW(2);
-    const int z3 = ACCESS_ROW(6);
-
-    const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
-    const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
-    const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
-
-    const int tmp0 = (ACCESS_ROW(0) + ACCESS_ROW(4)) << CONST_BITS;
-    const int tmp1 = (ACCESS_ROW(0) - ACCESS_ROW(4)) << CONST_BITS;
-
-    const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
-
-    const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
-
-    const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
-    const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
-
-    const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
-    const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
-    const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
-    const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
-
-    const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
-    const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
-    const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
-    const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
-
-    int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*0] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*7] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*1] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*6] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*2] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*5] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*3] = (uint8)CLAMP(i);
-
-    i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
-    pDst_ptr[8*4] = (uint8)CLAMP(i);
-  }
-};
-
-template <>
-struct Col<1>
-{
-  static void idct(uint8* pDst_ptr, const int* pTemp)
-  {
-    int dcval = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3);
-    const uint8 dcval_clamped = (uint8)CLAMP(dcval);
-    pDst_ptr[0*8] = dcval_clamped;
-    pDst_ptr[1*8] = dcval_clamped;
-    pDst_ptr[2*8] = dcval_clamped;
-    pDst_ptr[3*8] = dcval_clamped;
-    pDst_ptr[4*8] = dcval_clamped;
-    pDst_ptr[5*8] = dcval_clamped;
-    pDst_ptr[6*8] = dcval_clamped;
-    pDst_ptr[7*8] = dcval_clamped;
-  }
-};
-
-static const uint8 s_idct_row_table[] =
-{
-  1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
-  4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
-  6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
-  6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
-  8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
-  8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
-  8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
-  8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
-};
-
-static const uint8 s_idct_col_table[] = { 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };
-
-void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag)
-{
-  JPGD_ASSERT(block_max_zag >= 1);
-  JPGD_ASSERT(block_max_zag <= 64);
-
-  if (block_max_zag <= 1)
-  {
-    int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
-    k = CLAMP(k);
-    k = k | (k<<8);
-    k = k | (k<<16);
-
-    for (int i = 8; i > 0; i--)
-    {
-      *(int*)&pDst_ptr[0] = k;
-      *(int*)&pDst_ptr[4] = k;
-      pDst_ptr += 8;
-    }
-    return;
-  }
-
-  int temp[64];
-
-  const jpgd_block_t* pSrc = pSrc_ptr;
-  int* pTemp = temp;
-
-  const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
-  int i;
-  for (i = 8; i > 0; i--, pRow_tab++)
-  {
-    switch (*pRow_tab)
-    {
-      case 0: Row<0>::idct(pTemp, pSrc); break;
-      case 1: Row<1>::idct(pTemp, pSrc); break;
-      case 2: Row<2>::idct(pTemp, pSrc); break;
-      case 3: Row<3>::idct(pTemp, pSrc); break;
-      case 4: Row<4>::idct(pTemp, pSrc); break;
-      case 5: Row<5>::idct(pTemp, pSrc); break;
-      case 6: Row<6>::idct(pTemp, pSrc); break;
-      case 7: Row<7>::idct(pTemp, pSrc); break;
-      case 8: Row<8>::idct(pTemp, pSrc); break;
-    }
-
-    pSrc += 8;
-    pTemp += 8;
-  }
-
-  pTemp = temp;
-
-  const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
-  for (i = 8; i > 0; i--)
-  {
-    switch (nonzero_rows)
-    {
-      case 1: Col<1>::idct(pDst_ptr, pTemp); break;
-      case 2: Col<2>::idct(pDst_ptr, pTemp); break;
-      case 3: Col<3>::idct(pDst_ptr, pTemp); break;
-      case 4: Col<4>::idct(pDst_ptr, pTemp); break;
-      case 5: Col<5>::idct(pDst_ptr, pTemp); break;
-      case 6: Col<6>::idct(pDst_ptr, pTemp); break;
-      case 7: Col<7>::idct(pDst_ptr, pTemp); break;
-      case 8: Col<8>::idct(pDst_ptr, pTemp); break;
-    }
-
-    pTemp++;
-    pDst_ptr++;
-  }
-}
-
-void idct_4x4(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr)
-{
-  int temp[64];
-  int* pTemp = temp;
-  const jpgd_block_t* pSrc = pSrc_ptr;
-
-  for (int i = 4; i > 0; i--)
-  {
-    Row<4>::idct(pTemp, pSrc);
-    pSrc += 8;
-    pTemp += 8;
-  }
-
-  pTemp = temp;
-  for (int i = 8; i > 0; i--)
-  {
-    Col<4>::idct(pDst_ptr, pTemp);
-    pTemp++;
-    pDst_ptr++;
-  }
-}
-
-// Retrieve one character from the input stream.
-inline uint jpeg_decoder::get_char()
-{
-  // Any bytes remaining in buffer?
-  if (!m_in_buf_left)
-  {
-    // Try to get more bytes.
-    prep_in_buffer();
-    // Still nothing to get?
-    if (!m_in_buf_left)
-    {
-      // Pad the end of the stream with 0xFF 0xD9 (EOI marker)
-      int t = m_tem_flag;
-      m_tem_flag ^= 1;
-      if (t)
-        return 0xD9;
-      else
-        return 0xFF;
-    }
-  }
-
-  uint c = *m_pIn_buf_ofs++;
-  m_in_buf_left--;
-
-  return c;
-}
-
-// Same as previous method, except can indicate if the character is a pad character or not.
-inline uint jpeg_decoder::get_char(bool *pPadding_flag)
-{
-  if (!m_in_buf_left)
-  {
-    prep_in_buffer();
-    if (!m_in_buf_left)
-    {
-      *pPadding_flag = true;
-      int t = m_tem_flag;
-      m_tem_flag ^= 1;
-      if (t)
-        return 0xD9;
-      else
-        return 0xFF;
-    }
-  }
-
-  *pPadding_flag = false;
-
-  uint c = *m_pIn_buf_ofs++;
-  m_in_buf_left--;
-
-  return c;
-}
-
-// Inserts a previously retrieved character back into the input buffer.
-inline void jpeg_decoder::stuff_char(uint8 q)
-{
-  *(--m_pIn_buf_ofs) = q;
-  m_in_buf_left++;
-}
-
-// Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
-inline uint8 jpeg_decoder::get_octet()
-{
-  bool padding_flag;
-  int c = get_char(&padding_flag);
-
-  if (c == 0xFF)
-  {
-    if (padding_flag)
-      return 0xFF;
-
-    c = get_char(&padding_flag);
-    if (padding_flag)
-    {
-      stuff_char(0xFF);
-      return 0xFF;
-    }
-
-    if (c == 0x00)
-      return 0xFF;
-    else
-    {
-      stuff_char(static_cast<uint8>(c));
-      stuff_char(0xFF);
-      return 0xFF;
-    }
-  }
-
-  return static_cast<uint8>(c);
-}
-
-// Retrieves a variable number of bits from the input stream. Does not recognize markers.
-inline uint jpeg_decoder::get_bits(int num_bits)
-{
-  if (!num_bits)
-    return 0;
-
-  uint i = m_bit_buf >> (32 - num_bits);
-
-  if ((m_bits_left -= num_bits) <= 0)
-  {
-    m_bit_buf <<= (num_bits += m_bits_left);
-
-    uint c1 = get_char();
-    uint c2 = get_char();
-    m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2;
-
-    m_bit_buf <<= -m_bits_left;
-
-    m_bits_left += 16;
-
-    JPGD_ASSERT(m_bits_left >= 0);
-  }
-  else
-    m_bit_buf <<= num_bits;
-
-  return i;
-}
-
-// Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
-inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
-{
-  if (!num_bits)
-    return 0;
-
-  uint i = m_bit_buf >> (32 - num_bits);
-
-  if ((m_bits_left -= num_bits) <= 0)
-  {
-    m_bit_buf <<= (num_bits += m_bits_left);
-
-    if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF))
-    {
-      uint c1 = get_octet();
-      uint c2 = get_octet();
-      m_bit_buf |= (c1 << 8) | c2;
-    }
-    else
-    {
-      m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];
-      m_in_buf_left -= 2;
-      m_pIn_buf_ofs += 2;
-    }
-
-    m_bit_buf <<= -m_bits_left;
-
-    m_bits_left += 16;
-
-    JPGD_ASSERT(m_bits_left >= 0);
-  }
-  else
-    m_bit_buf <<= num_bits;
-
-  return i;
-}
-
-// Decodes a Huffman encoded symbol.
-inline int jpeg_decoder::huff_decode(huff_tables *pH)
-{
-  int symbol;
-
-  // Check first 8-bits: do we have a complete symbol?
-  if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
-  {
-    // Decode more bits, use a tree traversal to find symbol.
-    int ofs = 23;
-    do
-    {
-      symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))];
-      ofs--;
-    } while (symbol < 0);
-
-    get_bits_no_markers(8 + (23 - ofs));
-  }
-  else
-    get_bits_no_markers(pH->code_size[symbol]);
-
-  return symbol;
-}
-
-// Decodes a Huffman encoded symbol.
-inline int jpeg_decoder::huff_decode(huff_tables *pH, int& extra_bits)
-{
-  int symbol;
-
-  // Check first 8-bits: do we have a complete symbol?
-  if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
-  {
-    // Use a tree traversal to find symbol.
-    int ofs = 23;
-    do
-    {
-      symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))];
-      ofs--;
-    } while (symbol < 0);
-
-    get_bits_no_markers(8 + (23 - ofs));
-
-    extra_bits = get_bits_no_markers(symbol & 0xF);
-  }
-  else
-  {
-    JPGD_ASSERT(((symbol >> 8) & 31) == pH->code_size[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0));
-
-    if (symbol & 0x8000)
-    {
-      get_bits_no_markers((symbol >> 8) & 31);
-      extra_bits = symbol >> 16;
-    }
-    else
-    {
-      int code_size = (symbol >> 8) & 31;
-      int num_extra_bits = symbol & 0xF;
-      int bits = code_size + num_extra_bits;
-      if (bits <= (m_bits_left + 16))
-        extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);
-      else
-      {
-        get_bits_no_markers(code_size);
-        extra_bits = get_bits_no_markers(num_extra_bits);
-      }
-    }
-
-    symbol &= 0xFF;
-  }
-
-  return symbol;
-}
-
-// Tables and macro used to fully decode the DPCM differences.
-static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
-static const int s_extend_offset[16] = { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1, ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1, ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1, ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
-static const int s_extend_mask[] = { 0, (1<<0), (1<<1), (1<<2), (1<<3), (1<<4), (1<<5), (1<<6), (1<<7), (1<<8), (1<<9), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15), (1<<16) };
-// The logical AND's in this macro are to shut up static code analysis (aren't really necessary - couldn't find another way to do this)
-#define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[s & 15]) ? ((x) + s_extend_offset[s & 15]) : (x))
-
-// Clamps a value between 0-255.
-inline uint8 jpeg_decoder::clamp(int i)
-{
-  if (static_cast<uint>(i) > 255)
-    i = (((~i) >> 31) & 0xFF);
-
-  return static_cast<uint8>(i);
-}
-
-namespace DCT_Upsample
-{
-  struct Matrix44
-  {
-    typedef int Element_Type;
-    enum { NUM_ROWS = 4, NUM_COLS = 4 };
-
-    Element_Type v[NUM_ROWS][NUM_COLS];
-
-    inline int rows() const { return NUM_ROWS; }
-    inline int cols() const { return NUM_COLS; }
-
-    inline const Element_Type & at(int r, int c) const { return v[r][c]; }
-    inline       Element_Type & at(int r, int c)       { return v[r][c]; }
-
-    inline Matrix44() { }
-
-    inline Matrix44& operator += (const Matrix44& a)
-    {
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        at(r, 0) += a.at(r, 0);
-        at(r, 1) += a.at(r, 1);
-        at(r, 2) += a.at(r, 2);
-        at(r, 3) += a.at(r, 3);
-      }
-      return *this;
-    }
-
-    inline Matrix44& operator -= (const Matrix44& a)
-    {
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        at(r, 0) -= a.at(r, 0);
-        at(r, 1) -= a.at(r, 1);
-        at(r, 2) -= a.at(r, 2);
-        at(r, 3) -= a.at(r, 3);
-      }
-      return *this;
-    }
-
-    friend inline Matrix44 operator + (const Matrix44& a, const Matrix44& b)
-    {
-      Matrix44 ret;
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        ret.at(r, 0) = a.at(r, 0) + b.at(r, 0);
-        ret.at(r, 1) = a.at(r, 1) + b.at(r, 1);
-        ret.at(r, 2) = a.at(r, 2) + b.at(r, 2);
-        ret.at(r, 3) = a.at(r, 3) + b.at(r, 3);
-      }
-      return ret;
-    }
-
-    friend inline Matrix44 operator - (const Matrix44& a, const Matrix44& b)
-    {
-      Matrix44 ret;
-      for (int r = 0; r < NUM_ROWS; r++)
-      {
-        ret.at(r, 0) = a.at(r, 0) - b.at(r, 0);
-        ret.at(r, 1) = a.at(r, 1) - b.at(r, 1);
-        ret.at(r, 2) = a.at(r, 2) - b.at(r, 2);
-        ret.at(r, 3) = a.at(r, 3) - b.at(r, 3);
-      }
-      return ret;
-    }
-
-    static inline void add_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
-    {
-      for (int r = 0; r < 4; r++)
-      {
-        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) + b.at(r, 0));
-        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) + b.at(r, 1));
-        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) + b.at(r, 2));
-        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) + b.at(r, 3));
-      }
-    }
-
-    static inline void sub_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
-    {
-      for (int r = 0; r < 4; r++)
-      {
-        pDst[0*8 + r] = static_cast<jpgd_block_t>(a.at(r, 0) - b.at(r, 0));
-        pDst[1*8 + r] = static_cast<jpgd_block_t>(a.at(r, 1) - b.at(r, 1));
-        pDst[2*8 + r] = static_cast<jpgd_block_t>(a.at(r, 2) - b.at(r, 2));
-        pDst[3*8 + r] = static_cast<jpgd_block_t>(a.at(r, 3) - b.at(r, 3));
-      }
-    }
-  };
-
-  const int FRACT_BITS = 10;
-  const int SCALE = 1 << FRACT_BITS;
-
-  typedef int Temp_Type;
-  #define D(i) (((i) + (SCALE >> 1)) >> FRACT_BITS)
-  #define F(i) ((int)((i) * SCALE + .5f))
-
-  // Any decent C++ compiler will optimize this at compile time to a 0, or an array access.
-  #define AT(c, r) ((((c)>=NUM_COLS)||((r)>=NUM_ROWS)) ? 0 : pSrc[(c)+(r)*8])
-
-  // NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix
-  template<int NUM_ROWS, int NUM_COLS>
-  struct P_Q
-  {
-    static void calc(Matrix44& P, Matrix44& Q, const jpgd_block_t* pSrc)
-    {
-      // 4x8 = 4x8 times 8x8, matrix 0 is constant
-      const Temp_Type X000 = AT(0, 0);
-      const Temp_Type X001 = AT(0, 1);
-      const Temp_Type X002 = AT(0, 2);
-      const Temp_Type X003 = AT(0, 3);
-      const Temp_Type X004 = AT(0, 4);
-      const Temp_Type X005 = AT(0, 5);
-      const Temp_Type X006 = AT(0, 6);
-      const Temp_Type X007 = AT(0, 7);
-      const Temp_Type X010 = D(F(0.415735f) * AT(1, 0) + F(0.791065f) * AT(3, 0) + F(-0.352443f) * AT(5, 0) + F(0.277785f) * AT(7, 0));
-      const Temp_Type X011 = D(F(0.415735f) * AT(1, 1) + F(0.791065f) * AT(3, 1) + F(-0.352443f) * AT(5, 1) + F(0.277785f) * AT(7, 1));
-      const Temp_Type X012 = D(F(0.415735f) * AT(1, 2) + F(0.791065f) * AT(3, 2) + F(-0.352443f) * AT(5, 2) + F(0.277785f) * AT(7, 2));
-      const Temp_Type X013 = D(F(0.415735f) * AT(1, 3) + F(0.791065f) * AT(3, 3) + F(-0.352443f) * AT(5, 3) + F(0.277785f) * AT(7, 3));
-      const Temp_Type X014 = D(F(0.415735f) * AT(1, 4) + F(0.791065f) * AT(3, 4) + F(-0.352443f) * AT(5, 4) + F(0.277785f) * AT(7, 4));
-      const Temp_Type X015 = D(F(0.415735f) * AT(1, 5) + F(0.791065f) * AT(3, 5) + F(-0.352443f) * AT(5, 5) + F(0.277785f) * AT(7, 5));
-      const Temp_Type X016 = D(F(0.415735f) * AT(1, 6) + F(0.791065f) * AT(3, 6) + F(-0.352443f) * AT(5, 6) + F(0.277785f) * AT(7, 6));
-      const Temp_Type X017 = D(F(0.415735f) * AT(1, 7) + F(0.791065f) * AT(3, 7) + F(-0.352443f) * AT(5, 7) + F(0.277785f) * AT(7, 7));
-      const Temp_Type X020 = AT(4, 0);
-      const Temp_Type X021 = AT(4, 1);
-      const Temp_Type X022 = AT(4, 2);
-      const Temp_Type X023 = AT(4, 3);
-      const Temp_Type X024 = AT(4, 4);
-      const Temp_Type X025 = AT(4, 5);
-      const Temp_Type X026 = AT(4, 6);
-      const Temp_Type X027 = AT(4, 7);
-      const Temp_Type X030 = D(F(0.022887f) * AT(1, 0) + F(-0.097545f) * AT(3, 0) + F(0.490393f) * AT(5, 0) + F(0.865723f) * AT(7, 0));
-      const Temp_Type X031 = D(F(0.022887f) * AT(1, 1) + F(-0.097545f) * AT(3, 1) + F(0.490393f) * AT(5, 1) + F(0.865723f) * AT(7, 1));
-      const Temp_Type X032 = D(F(0.022887f) * AT(1, 2) + F(-0.097545f) * AT(3, 2) + F(0.490393f) * AT(5, 2) + F(0.865723f) * AT(7, 2));
-      const Temp_Type X033 = D(F(0.022887f) * AT(1, 3) + F(-0.097545f) * AT(3, 3) + F(0.490393f) * AT(5, 3) + F(0.865723f) * AT(7, 3));
-      const Temp_Type X034 = D(F(0.022887f) * AT(1, 4) + F(-0.097545f) * AT(3, 4) + F(0.490393f) * AT(5, 4) + F(0.865723f) * AT(7, 4));
-      const Temp_Type X035 = D(F(0.022887f) * AT(1, 5) + F(-0.097545f) * AT(3, 5) + F(0.490393f) * AT(5, 5) + F(0.865723f) * AT(7, 5));
-      const Temp_Type X036 = D(F(0.022887f) * AT(1, 6) + F(-0.097545f) * AT(3, 6) + F(0.490393f) * AT(5, 6) + F(0.865723f) * AT(7, 6));
-      const Temp_Type X037 = D(F(0.022887f) * AT(1, 7) + F(-0.097545f) * AT(3, 7) + F(0.490393f) * AT(5, 7) + F(0.865723f) * AT(7, 7));
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      P.at(0, 0) = X000;
-      P.at(0, 1) = D(X001 * F(0.415735f) + X003 * F(0.791065f) + X005 * F(-0.352443f) + X007 * F(0.277785f));
-      P.at(0, 2) = X004;
-      P.at(0, 3) = D(X001 * F(0.022887f) + X003 * F(-0.097545f) + X005 * F(0.490393f) + X007 * F(0.865723f));
-      P.at(1, 0) = X010;
-      P.at(1, 1) = D(X011 * F(0.415735f) + X013 * F(0.791065f) + X015 * F(-0.352443f) + X017 * F(0.277785f));
-      P.at(1, 2) = X014;
-      P.at(1, 3) = D(X011 * F(0.022887f) + X013 * F(-0.097545f) + X015 * F(0.490393f) + X017 * F(0.865723f));
-      P.at(2, 0) = X020;
-      P.at(2, 1) = D(X021 * F(0.415735f) + X023 * F(0.791065f) + X025 * F(-0.352443f) + X027 * F(0.277785f));
-      P.at(2, 2) = X024;
-      P.at(2, 3) = D(X021 * F(0.022887f) + X023 * F(-0.097545f) + X025 * F(0.490393f) + X027 * F(0.865723f));
-      P.at(3, 0) = X030;
-      P.at(3, 1) = D(X031 * F(0.415735f) + X033 * F(0.791065f) + X035 * F(-0.352443f) + X037 * F(0.277785f));
-      P.at(3, 2) = X034;
-      P.at(3, 3) = D(X031 * F(0.022887f) + X033 * F(-0.097545f) + X035 * F(0.490393f) + X037 * F(0.865723f));
-      // 40 muls 24 adds
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      Q.at(0, 0) = D(X001 * F(0.906127f) + X003 * F(-0.318190f) + X005 * F(0.212608f) + X007 * F(-0.180240f));
-      Q.at(0, 1) = X002;
-      Q.at(0, 2) = D(X001 * F(-0.074658f) + X003 * F(0.513280f) + X005 * F(0.768178f) + X007 * F(-0.375330f));
-      Q.at(0, 3) = X006;
-      Q.at(1, 0) = D(X011 * F(0.906127f) + X013 * F(-0.318190f) + X015 * F(0.212608f) + X017 * F(-0.180240f));
-      Q.at(1, 1) = X012;
-      Q.at(1, 2) = D(X011 * F(-0.074658f) + X013 * F(0.513280f) + X015 * F(0.768178f) + X017 * F(-0.375330f));
-      Q.at(1, 3) = X016;
-      Q.at(2, 0) = D(X021 * F(0.906127f) + X023 * F(-0.318190f) + X025 * F(0.212608f) + X027 * F(-0.180240f));
-      Q.at(2, 1) = X022;
-      Q.at(2, 2) = D(X021 * F(-0.074658f) + X023 * F(0.513280f) + X025 * F(0.768178f) + X027 * F(-0.375330f));
-      Q.at(2, 3) = X026;
-      Q.at(3, 0) = D(X031 * F(0.906127f) + X033 * F(-0.318190f) + X035 * F(0.212608f) + X037 * F(-0.180240f));
-      Q.at(3, 1) = X032;
-      Q.at(3, 2) = D(X031 * F(-0.074658f) + X033 * F(0.513280f) + X035 * F(0.768178f) + X037 * F(-0.375330f));
-      Q.at(3, 3) = X036;
-      // 40 muls 24 adds
-    }
-  };
-
-  template<int NUM_ROWS, int NUM_COLS>
-  struct R_S
-  {
-    static void calc(Matrix44& R, Matrix44& S, const jpgd_block_t* pSrc)
-    {
-      // 4x8 = 4x8 times 8x8, matrix 0 is constant
-      const Temp_Type X100 = D(F(0.906127f) * AT(1, 0) + F(-0.318190f) * AT(3, 0) + F(0.212608f) * AT(5, 0) + F(-0.180240f) * AT(7, 0));
-      const Temp_Type X101 = D(F(0.906127f) * AT(1, 1) + F(-0.318190f) * AT(3, 1) + F(0.212608f) * AT(5, 1) + F(-0.180240f) * AT(7, 1));
-      const Temp_Type X102 = D(F(0.906127f) * AT(1, 2) + F(-0.318190f) * AT(3, 2) + F(0.212608f) * AT(5, 2) + F(-0.180240f) * AT(7, 2));
-      const Temp_Type X103 = D(F(0.906127f) * AT(1, 3) + F(-0.318190f) * AT(3, 3) + F(0.212608f) * AT(5, 3) + F(-0.180240f) * AT(7, 3));
-      const Temp_Type X104 = D(F(0.906127f) * AT(1, 4) + F(-0.318190f) * AT(3, 4) + F(0.212608f) * AT(5, 4) + F(-0.180240f) * AT(7, 4));
-      const Temp_Type X105 = D(F(0.906127f) * AT(1, 5) + F(-0.318190f) * AT(3, 5) + F(0.212608f) * AT(5, 5) + F(-0.180240f) * AT(7, 5));
-      const Temp_Type X106 = D(F(0.906127f) * AT(1, 6) + F(-0.318190f) * AT(3, 6) + F(0.212608f) * AT(5, 6) + F(-0.180240f) * AT(7, 6));
-      const Temp_Type X107 = D(F(0.906127f) * AT(1, 7) + F(-0.318190f) * AT(3, 7) + F(0.212608f) * AT(5, 7) + F(-0.180240f) * AT(7, 7));
-      const Temp_Type X110 = AT(2, 0);
-      const Temp_Type X111 = AT(2, 1);
-      const Temp_Type X112 = AT(2, 2);
-      const Temp_Type X113 = AT(2, 3);
-      const Temp_Type X114 = AT(2, 4);
-      const Temp_Type X115 = AT(2, 5);
-      const Temp_Type X116 = AT(2, 6);
-      const Temp_Type X117 = AT(2, 7);
-      const Temp_Type X120 = D(F(-0.074658f) * AT(1, 0) + F(0.513280f) * AT(3, 0) + F(0.768178f) * AT(5, 0) + F(-0.375330f) * AT(7, 0));
-      const Temp_Type X121 = D(F(-0.074658f) * AT(1, 1) + F(0.513280f) * AT(3, 1) + F(0.768178f) * AT(5, 1) + F(-0.375330f) * AT(7, 1));
-      const Temp_Type X122 = D(F(-0.074658f) * AT(1, 2) + F(0.513280f) * AT(3, 2) + F(0.768178f) * AT(5, 2) + F(-0.375330f) * AT(7, 2));
-      const Temp_Type X123 = D(F(-0.074658f) * AT(1, 3) + F(0.513280f) * AT(3, 3) + F(0.768178f) * AT(5, 3) + F(-0.375330f) * AT(7, 3));
-      const Temp_Type X124 = D(F(-0.074658f) * AT(1, 4) + F(0.513280f) * AT(3, 4) + F(0.768178f) * AT(5, 4) + F(-0.375330f) * AT(7, 4));
-      const Temp_Type X125 = D(F(-0.074658f) * AT(1, 5) + F(0.513280f) * AT(3, 5) + F(0.768178f) * AT(5, 5) + F(-0.375330f) * AT(7, 5));
-      const Temp_Type X126 = D(F(-0.074658f) * AT(1, 6) + F(0.513280f) * AT(3, 6) + F(0.768178f) * AT(5, 6) + F(-0.375330f) * AT(7, 6));
-      const Temp_Type X127 = D(F(-0.074658f) * AT(1, 7) + F(0.513280f) * AT(3, 7) + F(0.768178f) * AT(5, 7) + F(-0.375330f) * AT(7, 7));
-      const Temp_Type X130 = AT(6, 0);
-      const Temp_Type X131 = AT(6, 1);
-      const Temp_Type X132 = AT(6, 2);
-      const Temp_Type X133 = AT(6, 3);
-      const Temp_Type X134 = AT(6, 4);
-      const Temp_Type X135 = AT(6, 5);
-      const Temp_Type X136 = AT(6, 6);
-      const Temp_Type X137 = AT(6, 7);
-      // 80 muls 48 adds
-
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      R.at(0, 0) = X100;
-      R.at(0, 1) = D(X101 * F(0.415735f) + X103 * F(0.791065f) + X105 * F(-0.352443f) + X107 * F(0.277785f));
-      R.at(0, 2) = X104;
-      R.at(0, 3) = D(X101 * F(0.022887f) + X103 * F(-0.097545f) + X105 * F(0.490393f) + X107 * F(0.865723f));
-      R.at(1, 0) = X110;
-      R.at(1, 1) = D(X111 * F(0.415735f) + X113 * F(0.791065f) + X115 * F(-0.352443f) + X117 * F(0.277785f));
-      R.at(1, 2) = X114;
-      R.at(1, 3) = D(X111 * F(0.022887f) + X113 * F(-0.097545f) + X115 * F(0.490393f) + X117 * F(0.865723f));
-      R.at(2, 0) = X120;
-      R.at(2, 1) = D(X121 * F(0.415735f) + X123 * F(0.791065f) + X125 * F(-0.352443f) + X127 * F(0.277785f));
-      R.at(2, 2) = X124;
-      R.at(2, 3) = D(X121 * F(0.022887f) + X123 * F(-0.097545f) + X125 * F(0.490393f) + X127 * F(0.865723f));
-      R.at(3, 0) = X130;
-      R.at(3, 1) = D(X131 * F(0.415735f) + X133 * F(0.791065f) + X135 * F(-0.352443f) + X137 * F(0.277785f));
-      R.at(3, 2) = X134;
-      R.at(3, 3) = D(X131 * F(0.022887f) + X133 * F(-0.097545f) + X135 * F(0.490393f) + X137 * F(0.865723f));
-      // 40 muls 24 adds
-      // 4x4 = 4x8 times 8x4, matrix 1 is constant
-      S.at(0, 0) = D(X101 * F(0.906127f) + X103 * F(-0.318190f) + X105 * F(0.212608f) + X107 * F(-0.180240f));
-      S.at(0, 1) = X102;
-      S.at(0, 2) = D(X101 * F(-0.074658f) + X103 * F(0.513280f) + X105 * F(0.768178f) + X107 * F(-0.375330f));
-      S.at(0, 3) = X106;
-      S.at(1, 0) = D(X111 * F(0.906127f) + X113 * F(-0.318190f) + X115 * F(0.212608f) + X117 * F(-0.180240f));
-      S.at(1, 1) = X112;
-      S.at(1, 2) = D(X111 * F(-0.074658f) + X113 * F(0.513280f) + X115 * F(0.768178f) + X117 * F(-0.375330f));
-      S.at(1, 3) = X116;
-      S.at(2, 0) = D(X121 * F(0.906127f) + X123 * F(-0.318190f) + X125 * F(0.212608f) + X127 * F(-0.180240f));
-      S.at(2, 1) = X122;
-      S.at(2, 2) = D(X121 * F(-0.074658f) + X123 * F(0.513280f) + X125 * F(0.768178f) + X127 * F(-0.375330f));
-      S.at(2, 3) = X126;
-      S.at(3, 0) = D(X131 * F(0.906127f) + X133 * F(-0.318190f) + X135 * F(0.212608f) + X137 * F(-0.180240f));
-      S.at(3, 1) = X132;
-      S.at(3, 2) = D(X131 * F(-0.074658f) + X133 * F(0.513280f) + X135 * F(0.768178f) + X137 * F(-0.375330f));
-      S.at(3, 3) = X136;
-      // 40 muls 24 adds
-    }
-  };
-} // end namespace DCT_Upsample
-
-// Unconditionally frees all allocated m_blocks.
-void jpeg_decoder::free_all_blocks()
-{
-  m_pStream = NULL;
-  for (mem_block *b = m_pMem_blocks; b; )
-  {
-    mem_block *n = b->m_pNext;
-    jpgd_free(b);
-    b = n;
-  }
-  m_pMem_blocks = NULL;
-}
-
-// This method handles all errors. It will never return.
-// It could easily be changed to use C++ exceptions.
-JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
-{
-  m_error_code = status;
-  free_all_blocks();
-  longjmp(m_jmp_state, status);
-}
-
-void *jpeg_decoder::alloc(size_t nSize, bool zero)
-{
-  nSize = (JPGD_MAX(nSize, 1) + 3) & ~3;
-  char *rv = NULL;
-  for (mem_block *b = m_pMem_blocks; b; b = b->m_pNext)
-  {
-    if ((b->m_used_count + nSize) <= b->m_size)
-    {
-      rv = b->m_data + b->m_used_count;
-      b->m_used_count += nSize;
-      break;
-    }
-  }
-  if (!rv)
-  {
-    int capacity = JPGD_MAX(32768 - 256, (nSize + 2047) & ~2047);
-    mem_block *b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
-    if (!b) { stop_decoding(JPGD_NOTENOUGHMEM); }
-    b->m_pNext = m_pMem_blocks; m_pMem_blocks = b;
-    b->m_used_count = nSize;
-    b->m_size = capacity;
-    rv = b->m_data;
-  }
-  if (zero) memset(rv, 0, nSize);
-  return rv;
-}
-
-void jpeg_decoder::word_clear(void *p, uint16 c, uint n)
-{
-  uint8 *pD = (uint8*)p;
-  const uint8 l = c & 0xFF, h = (c >> 8) & 0xFF;
-  while (n)
-  {
-    pD[0] = l; pD[1] = h; pD += 2;
-    n--;
-  }
-}
-
-// Refill the input buffer.
-// This method will sit in a loop until (A) the buffer is full or (B)
-// the stream's read() method reports and end of file condition.
-void jpeg_decoder::prep_in_buffer()
-{
-  m_in_buf_left = 0;
-  m_pIn_buf_ofs = m_in_buf;
-
-  if (m_eof_flag)
-    return;
-
-  do
-  {
-    int bytes_read = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
-    if (bytes_read == -1)
-      stop_decoding(JPGD_STREAM_READ);
-
-    m_in_buf_left += bytes_read;
-  } while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag));
-
-  m_total_bytes_read += m_in_buf_left;
-
-  // Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
-  // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
-  word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
-}
-
-// Read a Huffman code table.
-void jpeg_decoder::read_dht_marker()
-{
-  int i, index, count;
-  uint8 huff_num[17];
-  uint8 huff_val[256];
-
-  uint num_left = get_bits(16);
-
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_DHT_MARKER);
-
-  num_left -= 2;
-
-  while (num_left)
-  {
-    index = get_bits(8);
-
-    huff_num[0] = 0;
-
-    count = 0;
-
-    for (i = 1; i <= 16; i++)
-    {
-      huff_num[i] = static_cast<uint8>(get_bits(8));
-      count += huff_num[i];
-    }
-
-    if (count > 255)
-      stop_decoding(JPGD_BAD_DHT_COUNTS);
-
-    for (i = 0; i < count; i++)
-      huff_val[i] = static_cast<uint8>(get_bits(8));
-
-    i = 1 + 16 + count;
-
-    if (num_left < (uint)i)
-      stop_decoding(JPGD_BAD_DHT_MARKER);
-
-    num_left -= i;
-
-    if ((index & 0x10) > 0x10)
-      stop_decoding(JPGD_BAD_DHT_INDEX);
-
-    index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
-
-    if (index >= JPGD_MAX_HUFF_TABLES)
-      stop_decoding(JPGD_BAD_DHT_INDEX);
-
-    if (!m_huff_num[index])
-      m_huff_num[index] = (uint8 *)alloc(17);
-
-    if (!m_huff_val[index])
-      m_huff_val[index] = (uint8 *)alloc(256);
-
-    m_huff_ac[index] = (index & 0x10) != 0;
-    memcpy(m_huff_num[index], huff_num, 17);
-    memcpy(m_huff_val[index], huff_val, 256);
-  }
-}
-
-// Read a quantization table.
-void jpeg_decoder::read_dqt_marker()
-{
-  int n, i, prec;
-  uint num_left;
-  uint temp;
-
-  num_left = get_bits(16);
-
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_DQT_MARKER);
-
-  num_left -= 2;
-
-  while (num_left)
-  {
-    n = get_bits(8);
-    prec = n >> 4;
-    n &= 0x0F;
-
-    if (n >= JPGD_MAX_QUANT_TABLES)
-      stop_decoding(JPGD_BAD_DQT_TABLE);
-
-    if (!m_quant[n])
-      m_quant[n] = (jpgd_quant_t *)alloc(64 * sizeof(jpgd_quant_t));
-
-    // read quantization entries, in zag order
-    for (i = 0; i < 64; i++)
-    {
-      temp = get_bits(8);
-
-      if (prec)
-        temp = (temp << 8) + get_bits(8);
-
-            m_quant[n][i] = static_cast<jpgd_quant_t>(temp);
-    }
-
-    i = 64 + 1;
-
-    if (prec)
-      i += 64;
-
-    if (num_left < (uint)i)
-      stop_decoding(JPGD_BAD_DQT_LENGTH);
-
-    num_left -= i;
-  }
-}
-
-// Read the start of frame (SOF) marker.
-void jpeg_decoder::read_sof_marker()
-{
-  int i;
-  uint num_left;
-
-  num_left = get_bits(16);
-
-  if (get_bits(8) != 8)   /* precision: sorry, only 8-bit precision is supported right now */
-    stop_decoding(JPGD_BAD_PRECISION);
-
-  m_image_y_size = get_bits(16);
-
-  if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
-    stop_decoding(JPGD_BAD_HEIGHT);
-
-  m_image_x_size = get_bits(16);
-
-  if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
-    stop_decoding(JPGD_BAD_WIDTH);
-
-  m_comps_in_frame = get_bits(8);
-
-  if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
-    stop_decoding(JPGD_TOO_MANY_COMPONENTS);
-
-  if (num_left != (uint)(m_comps_in_frame * 3 + 8))
-    stop_decoding(JPGD_BAD_SOF_LENGTH);
-
-  for (i = 0; i < m_comps_in_frame; i++)
-  {
-    m_comp_ident[i]  = get_bits(8);
-    m_comp_h_samp[i] = get_bits(4);
-    m_comp_v_samp[i] = get_bits(4);
-    m_comp_quant[i]  = get_bits(8);
-  }
-}
-
-// Used to skip unrecognized markers.
-void jpeg_decoder::skip_variable_marker()
-{
-  uint num_left;
-
-  num_left = get_bits(16);
-
-  if (num_left < 2)
-    stop_decoding(JPGD_BAD_VARIABLE_MARKER);
-
-  num_left -= 2;
-
-  while (num_left)
-  {
-    get_bits(8);
-    num_left--;
-  }
-}
-
-// Read a define restart interval (DRI) marker.
-void jpeg_decoder::read_dri_marker()
-{
-  if (get_bits(16) != 4)
-    stop_decoding(JPGD_BAD_DRI_LENGTH);
-
-  m_restart_interval = get_bits(16);
-}
-
-// Read a start of scan (SOS) marker.
-void jpeg_decoder::read_sos_marker()
-{
-  uint num_left;
-  int i, ci, n, c, cc;
-
-  num_left = get_bits(16);
-
-  n = get_bits(8);
-
-  m_comps_in_scan = n;
-
-  num_left -= 3;
-
-  if ( (num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) )
-    stop_decoding(JPGD_BAD_SOS_LENGTH);
-
-  for (i = 0; i < n; i++)
-  {
-    cc = get_bits(8);
-    c = get_bits(8);
-    num_left -= 2;
-
-    for (ci = 0; ci < m_comps_in_frame; ci++)
-      if (cc == m_comp_ident[ci])
-        break;
-
-    if (ci >= m_comps_in_frame)
-      stop_decoding(JPGD_BAD_SOS_COMP_ID);
-
-    m_comp_list[i]    = ci;
-    m_comp_dc_tab[ci] = (c >> 4) & 15;
-    m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
-  }
-
-  m_spectral_start  = get_bits(8);
-  m_spectral_end    = get_bits(8);
-  m_successive_high = get_bits(4);
-  m_successive_low  = get_bits(4);
-
-  if (!m_progressive_flag)
-  {
-    m_spectral_start = 0;
-    m_spectral_end = 63;
-  }
-
-  num_left -= 3;
-
-  while (num_left)                  /* read past whatever is num_left */
-  {
-    get_bits(8);
-    num_left--;
-  }
-}
-
-// Finds the next marker.
-int jpeg_decoder::next_marker()
-{
-  uint c, bytes;
-
-  bytes = 0;
-
-  do
-  {
-    do
-    {
-      bytes++;
-      c = get_bits(8);
-    } while (c != 0xFF);
-
-    do
-    {
-      c = get_bits(8);
-    } while (c == 0xFF);
-
-  } while (c == 0);
-
-  // If bytes > 0 here, there where extra bytes before the marker (not good).
-
-  return c;
-}
-
-// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is
-// encountered.
-int jpeg_decoder::process_markers()
-{
-  int c;
-
-  for ( ; ; )
-  {
-    c = next_marker();
-
-    switch (c)
-    {
-      case M_SOF0:
-      case M_SOF1:
-      case M_SOF2:
-      case M_SOF3:
-      case M_SOF5:
-      case M_SOF6:
-      case M_SOF7:
-//      case M_JPG:
-      case M_SOF9:
-      case M_SOF10:
-      case M_SOF11:
-      case M_SOF13:
-      case M_SOF14:
-      case M_SOF15:
-      case M_SOI:
-      case M_EOI:
-      case M_SOS:
-      {
-        return c;
-      }
-      case M_DHT:
-      {
-        read_dht_marker();
-        break;
-      }
-      // No arithmitic support - dumb patents!
-      case M_DAC:
-      {
-        stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
-        break;
-      }
-      case M_DQT:
-      {
-        read_dqt_marker();
-        break;
-      }
-      case M_DRI:
-      {
-        read_dri_marker();
-        break;
-      }
-      //case M_APP0:  /* no need to read the JFIF marker */
-
-      case M_JPG:
-      case M_RST0:    /* no parameters */
-      case M_RST1:
-      case M_RST2:
-      case M_RST3:
-      case M_RST4:
-      case M_RST5:
-      case M_RST6:
-      case M_RST7:
-      case M_TEM:
-      {
-        stop_decoding(JPGD_UNEXPECTED_MARKER);
-        break;
-      }
-      default:    /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */
-      {
-        skip_variable_marker();
-        break;
-      }
-    }
-  }
-}
-
-// Finds the start of image (SOI) marker.
-// This code is rather defensive: it only checks the first 512 bytes to avoid
-// false positives.
-void jpeg_decoder::locate_soi_marker()
-{
-  uint lastchar, thischar;
-  uint bytesleft;
-
-  lastchar = get_bits(8);
-
-  thischar = get_bits(8);
-
-  /* ok if it's a normal JPEG file without a special header */
-
-  if ((lastchar == 0xFF) && (thischar == M_SOI))
-    return;
-
-  bytesleft = 4096; //512;
-
-  for ( ; ; )
-  {
-    if (--bytesleft == 0)
-      stop_decoding(JPGD_NOT_JPEG);
-
-    lastchar = thischar;
-
-    thischar = get_bits(8);
-
-    if (lastchar == 0xFF)
-    {
-      if (thischar == M_SOI)
-        break;
-      else if (thischar == M_EOI) // get_bits will keep returning M_EOI if we read past the end
-        stop_decoding(JPGD_NOT_JPEG);
-    }
-  }
-
-  // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
-  thischar = (m_bit_buf >> 24) & 0xFF;
-
-  if (thischar != 0xFF)
-    stop_decoding(JPGD_NOT_JPEG);
-}
-
-// Find a start of frame (SOF) marker.
-void jpeg_decoder::locate_sof_marker()
-{
-  locate_soi_marker();
-
-  int c = process_markers();
-
-  switch (c)
-  {
-    case M_SOF2:
-      m_progressive_flag = JPGD_TRUE;
-    case M_SOF0:  /* baseline DCT */
-    case M_SOF1:  /* extended sequential DCT */
-    {
-      read_sof_marker();
-      break;
-    }
-    case M_SOF9:  /* Arithmitic coding */
-    {
-      stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
-      break;
-    }
-    default:
-    {
-      stop_decoding(JPGD_UNSUPPORTED_MARKER);
-      break;
-    }
-  }
-}
-
-// Find a start of scan (SOS) marker.
-int jpeg_decoder::locate_sos_marker()
-{
-  int c;
-
-  c = process_markers();
-
-  if (c == M_EOI)
-    return JPGD_FALSE;
-  else if (c != M_SOS)
-    stop_decoding(JPGD_UNEXPECTED_MARKER);
-
-  read_sos_marker();
-
-  return JPGD_TRUE;
-}
-
-// Reset everything to default/uninitialized state.
-void jpeg_decoder::init(jpeg_decoder_stream *pStream)
-{
-  m_pMem_blocks = NULL;
-  m_error_code = JPGD_SUCCESS;
-  m_ready_flag = false;
-  m_image_x_size = m_image_y_size = 0;
-  m_pStream = pStream;
-  m_progressive_flag = JPGD_FALSE;
-
-  memset(m_huff_ac, 0, sizeof(m_huff_ac));
-  memset(m_huff_num, 0, sizeof(m_huff_num));
-  memset(m_huff_val, 0, sizeof(m_huff_val));
-  memset(m_quant, 0, sizeof(m_quant));
-
-  m_scan_type = 0;
-  m_comps_in_frame = 0;
-
-  memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
-  memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
-  memset(m_comp_quant, 0, sizeof(m_comp_quant));
-  memset(m_comp_ident, 0, sizeof(m_comp_ident));
-  memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
-  memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
-
-  m_comps_in_scan = 0;
-  memset(m_comp_list, 0, sizeof(m_comp_list));
-  memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
-  memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
-
-  m_spectral_start = 0;
-  m_spectral_end = 0;
-  m_successive_low = 0;
-  m_successive_high = 0;
-  m_max_mcu_x_size = 0;
-  m_max_mcu_y_size = 0;
-  m_blocks_per_mcu = 0;
-  m_max_blocks_per_row = 0;
-  m_mcus_per_row = 0;
-  m_mcus_per_col = 0;
-  m_expanded_blocks_per_component = 0;
-  m_expanded_blocks_per_mcu = 0;
-  m_expanded_blocks_per_row = 0;
-  m_freq_domain_chroma_upsample = false;
-
-  memset(m_mcu_org, 0, sizeof(m_mcu_org));
-
-  m_total_lines_left = 0;
-  m_mcu_lines_left = 0;
-  m_real_dest_bytes_per_scan_line = 0;
-  m_dest_bytes_per_scan_line = 0;
-  m_dest_bytes_per_pixel = 0;
-
-  memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
-
-  memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
-  memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  m_eob_run = 0;
-
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  m_pIn_buf_ofs = m_in_buf;
-  m_in_buf_left = 0;
-  m_eof_flag = false;
-  m_tem_flag = 0;
-
-  memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
-  memset(m_in_buf, 0, sizeof(m_in_buf));
-  memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
-
-  m_restart_interval = 0;
-  m_restarts_left    = 0;
-  m_next_restart_num = 0;
-
-  m_max_mcus_per_row = 0;
-  m_max_blocks_per_mcu = 0;
-  m_max_mcus_per_col = 0;
-
-  memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
-  m_pMCU_coefficients = NULL;
-  m_pSample_buf = NULL;
-
-  m_total_bytes_read = 0;
-
-  m_pScan_line_0 = NULL;
-  m_pScan_line_1 = NULL;
-
-  // Ready the input buffer.
-  prep_in_buffer();
-
-  // Prime the bit buffer.
-  m_bits_left = 16;
-  m_bit_buf = 0;
-
-  get_bits(16);
-  get_bits(16);
-
-  for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++)
-    m_mcu_block_max_zag[i] = 64;
-}
-
-#define SCALEBITS 16
-#define ONE_HALF  ((int) 1 << (SCALEBITS-1))
-#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f))
-
-// Create a few tables that allow us to quickly convert YCbCr to RGB.
-void jpeg_decoder::create_look_ups()
-{
-  for (int i = 0; i <= 255; i++)
-  {
-    int k = i - 128;
-    m_crr[i] = ( FIX(1.40200f)  * k + ONE_HALF) >> SCALEBITS;
-    m_cbb[i] = ( FIX(1.77200f)  * k + ONE_HALF) >> SCALEBITS;
-    m_crg[i] = (-FIX(0.71414f)) * k;
-    m_cbg[i] = (-FIX(0.34414f)) * k + ONE_HALF;
-  }
-}
-
-// This method throws back into the stream any bytes that where read
-// into the bit buffer during initial marker scanning.
-void jpeg_decoder::fix_in_buffer()
-{
-  // In case any 0xFF's where pulled into the buffer during marker scanning.
-  JPGD_ASSERT((m_bits_left & 7) == 0);
-
-  if (m_bits_left == 16)
-    stuff_char( (uint8)(m_bit_buf & 0xFF));
-
-  if (m_bits_left >= 8)
-    stuff_char( (uint8)((m_bit_buf >> 8) & 0xFF));
-
-  stuff_char((uint8)((m_bit_buf >> 16) & 0xFF));
-  stuff_char((uint8)((m_bit_buf >> 24) & 0xFF));
-
-  m_bits_left = 16;
-  get_bits_no_markers(16);
-  get_bits_no_markers(16);
-}
-
-void jpeg_decoder::transform_mcu(int mcu_row)
-{
-  jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
-  uint8* pDst_ptr = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
-
-  for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-  {
-    idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
-    pSrc_ptr += 64;
-    pDst_ptr += 64;
-  }
-}
-
-static const uint8 s_max_rc[64] =
-{
-  17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
-  102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
-  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
-  136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
-};
-
-void jpeg_decoder::transform_mcu_expand(int mcu_row)
-{
-  jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
-  uint8* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
-
-  // Y IDCT
-    int mcu_block;
-  for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++)
-  {
-    idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
-    pSrc_ptr += 64;
-    pDst_ptr += 64;
-  }
-
-  // Chroma IDCT, with upsampling
-    jpgd_block_t temp_block[64];
-
-  for (int i = 0; i < 2; i++)
-  {
-    DCT_Upsample::Matrix44 P, Q, R, S;
-
-    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] >= 1);
-    JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] <= 64);
-
-    int max_zag = m_mcu_block_max_zag[mcu_block++] - 1; 
-    if (max_zag <= 0) max_zag = 0; // should never happen, only here to shut up static analysis
-    switch (s_max_rc[max_zag])
-    {
-    case 1*16+1:
-      DCT_Upsample::P_Q<1, 1>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<1, 1>::calc(R, S, pSrc_ptr);
-      break;
-    case 1*16+2:
-      DCT_Upsample::P_Q<1, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<1, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 2*16+2:
-      DCT_Upsample::P_Q<2, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<2, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+2:
-      DCT_Upsample::P_Q<3, 2>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 2>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+3:
-      DCT_Upsample::P_Q<3, 3>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 3>::calc(R, S, pSrc_ptr);
-      break;
-    case 3*16+4:
-      DCT_Upsample::P_Q<3, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<3, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 4*16+4:
-      DCT_Upsample::P_Q<4, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<4, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+4:
-      DCT_Upsample::P_Q<5, 4>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 4>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+5:
-      DCT_Upsample::P_Q<5, 5>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 5>::calc(R, S, pSrc_ptr);
-      break;
-    case 5*16+6:
-      DCT_Upsample::P_Q<5, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<5, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 6*16+6:
-      DCT_Upsample::P_Q<6, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<6, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+6:
-      DCT_Upsample::P_Q<7, 6>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 6>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+7:
-      DCT_Upsample::P_Q<7, 7>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 7>::calc(R, S, pSrc_ptr);
-      break;
-    case 7*16+8:
-      DCT_Upsample::P_Q<7, 8>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<7, 8>::calc(R, S, pSrc_ptr);
-      break;
-    case 8*16+8:
-      DCT_Upsample::P_Q<8, 8>::calc(P, Q, pSrc_ptr);
-      DCT_Upsample::R_S<8, 8>::calc(R, S, pSrc_ptr);
-      break;
-    default:
-      JPGD_ASSERT(false);
-    }
-
-    DCT_Upsample::Matrix44 a(P + Q); P -= Q;
-    DCT_Upsample::Matrix44& b = P;
-    DCT_Upsample::Matrix44 c(R + S); R -= S;
-    DCT_Upsample::Matrix44& d = R;
-
-    DCT_Upsample::Matrix44::add_and_store(temp_block, a, c);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::sub_and_store(temp_block, a, c);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::add_and_store(temp_block, b, d);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    DCT_Upsample::Matrix44::sub_and_store(temp_block, b, d);
-    idct_4x4(temp_block, pDst_ptr);
-    pDst_ptr += 64;
-
-    pSrc_ptr += 64;
-  }
-}
-
-// Loads and dequantizes the next row of (already decoded) coefficients.
-// Progressive images only.
-void jpeg_decoder::load_next_row()
-{
-  int i;
-  jpgd_block_t *p;
-  jpgd_quant_t *q;
-  int mcu_row, mcu_block, row_block = 0;
-  int component_num, component_id;
-  int block_x_mcu[JPGD_MAX_COMPONENTS];
-
-  memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
-
-  for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-  {
-    int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
-
-    for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-    {
-      component_id = m_mcu_org[mcu_block];
-      q = m_quant[m_comp_quant[component_id]];
-
-      p = m_pMCU_coefficients + 64 * mcu_block;
-
-      jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-      jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-      p[0] = pDC[0];
-      memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t));
-
-      for (i = 63; i > 0; i--)
-        if (p[g_ZAG[i]])
-          break;
-
-      m_mcu_block_max_zag[mcu_block] = i + 1;
-
-      for ( ; i >= 0; i--)
-                if (p[g_ZAG[i]])
-                    p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]);
-
-      row_block++;
-
-      if (m_comps_in_scan == 1)
-        block_x_mcu[component_id]++;
-      else
-      {
-        if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
-        {
-          block_x_mcu_ofs = 0;
-
-          if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
-          {
-            block_y_mcu_ofs = 0;
-
-            block_x_mcu[component_id] += m_comp_h_samp[component_id];
-          }
-        }
-      }
-    }
-
-    if (m_freq_domain_chroma_upsample)
-      transform_mcu_expand(mcu_row);
-    else
-      transform_mcu(mcu_row);
-  }
-
-  if (m_comps_in_scan == 1)
-    m_block_y_mcu[m_comp_list[0]]++;
-  else
-  {
-    for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-    {
-      component_id = m_comp_list[component_num];
-
-      m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
-    }
-  }
-}
-
-// Restart interval processing.
-void jpeg_decoder::process_restart()
-{
-  int i;
-  int c = 0;
-
-  // Align to a byte boundry
-  // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
-  //get_bits_no_markers(m_bits_left & 7);
-
-  // Let's scan a little bit to find the marker, but not _too_ far.
-  // 1536 is a "fudge factor" that determines how much to scan.
-  for (i = 1536; i > 0; i--)
-    if (get_char() == 0xFF)
-      break;
-
-  if (i == 0)
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  for ( ; i > 0; i--)
-    if ((c = get_char()) != 0xFF)
-      break;
-
-  if (i == 0)
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  // Is it the expected marker? If not, something bad happened.
-  if (c != (m_next_restart_num + M_RST0))
-    stop_decoding(JPGD_BAD_RESTART_MARKER);
-
-  // Reset each component's DC prediction values.
-  memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
-
-  m_eob_run = 0;
-
-  m_restarts_left = m_restart_interval;
-
-  m_next_restart_num = (m_next_restart_num + 1) & 7;
-
-  // Get the bit buffer going again...
-
-  m_bits_left = 16;
-  get_bits_no_markers(16);
-  get_bits_no_markers(16);
-}
-
-static inline int dequantize_ac(int c, int q) { c *= q; return c; }
-
-// Decodes and dequantizes the next row of coefficients.
-void jpeg_decoder::decode_next_row()
-{
-  int row_block = 0;
-
-  for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-  {
-    if ((m_restart_interval) && (m_restarts_left == 0))
-      process_restart();
-
-    jpgd_block_t* p = m_pMCU_coefficients;
-    for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
-    {
-      int component_id = m_mcu_org[mcu_block];
-      jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
-
-      int r, s;
-      s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r);
-      s = JPGD_HUFF_EXTEND(r, s);
-
-      m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]);
-
-      p[0] = static_cast<jpgd_block_t>(s * q[0]);
-
-      int prev_num_set = m_mcu_block_max_zag[mcu_block];
-
-      huff_tables *pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
-
-      int k;
-      for (k = 1; k < 64; k++)
-      {
-        int extra_bits;
-        s = huff_decode(pH, extra_bits);
-
-        r = s >> 4;
-        s &= 15;
-
-        if (s)
-        {
-          if (r)
-          {
-            if ((k + r) > 63)
-              stop_decoding(JPGD_DECODE_ERROR);
-
-            if (k < prev_num_set)
-            {
-              int n = JPGD_MIN(r, prev_num_set - k);
-              int kt = k;
-              while (n--)
-                p[g_ZAG[kt++]] = 0;
-            }
-
-            k += r;
-          }
-          
-          s = JPGD_HUFF_EXTEND(extra_bits, s);
-
-          JPGD_ASSERT(k < 64);
-
-          p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
-        }
-        else
-        {
-          if (r == 15)
-          {
-            if ((k + 16) > 64)
-              stop_decoding(JPGD_DECODE_ERROR);
-
-            if (k < prev_num_set)
-            {
-              int n = JPGD_MIN(16, prev_num_set - k);
-              int kt = k;
-              while (n--)
-              {
-                JPGD_ASSERT(kt <= 63);
-                p[g_ZAG[kt++]] = 0;
-              }
-            }
-
-            k += 16 - 1; // - 1 because the loop counter is k
-            JPGD_ASSERT(p[g_ZAG[k]] == 0);
-          }
-          else
-            break;
-        }
-      }
-
-      if (k < prev_num_set)
-      {
-        int kt = k;
-        while (kt < prev_num_set)
-          p[g_ZAG[kt++]] = 0;
-      }
-
-      m_mcu_block_max_zag[mcu_block] = k;
-
-      row_block++;
-    }
-
-    if (m_freq_domain_chroma_upsample)
-      transform_mcu_expand(mcu_row);
-    else
-      transform_mcu(mcu_row);
-
-    m_restarts_left--;
-  }
-}
-
-// YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB
-void jpeg_decoder::H1V1Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d = m_pScan_line_0;
-  uint8 *s = m_pSample_buf + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int j = 0; j < 8; j++)
-    {
-      int y = s[j];
-      int cb = s[64+j];
-      int cr = s[128+j];
-
-      d[0] = clamp(y + m_crr[cr]);
-      d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
-      d[2] = clamp(y + m_cbb[cb]);
-      d[3] = 255;
-
-      d += 4;
-    }
-
-    s += 64*3;
-  }
-}
-
-// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB
-void jpeg_decoder::H2V1Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d0 = m_pScan_line_0;
-  uint8 *y = m_pSample_buf + row * 8;
-  uint8 *c = m_pSample_buf + 2*64 + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int l = 0; l < 2; l++)
-    {
-      for (int j = 0; j < 4; j++)
-      {
-        int cb = c[0];
-        int cr = c[64];
-
-        int rc = m_crr[cr];
-        int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
-        int bc = m_cbb[cb];
-
-        int yy = y[j<<1];
-        d0[0] = clamp(yy+rc);
-        d0[1] = clamp(yy+gc);
-        d0[2] = clamp(yy+bc);
-        d0[3] = 255;
-
-        yy = y[(j<<1)+1];
-        d0[4] = clamp(yy+rc);
-        d0[5] = clamp(yy+gc);
-        d0[6] = clamp(yy+bc);
-        d0[7] = 255;
-
-        d0 += 8;
-
-        c++;
-      }
-      y += 64;
-    }
-
-    y += 64*4 - 64*2;
-    c += 64*4 - 8;
-  }
-}
-
-// YCbCr H2V1 (1x2:1:1, 4 m_blocks per MCU) to RGB
-void jpeg_decoder::H1V2Convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d0 = m_pScan_line_0;
-  uint8 *d1 = m_pScan_line_1;
-  uint8 *y;
-  uint8 *c;
-
-  if (row < 8)
-    y = m_pSample_buf + row * 8;
-  else
-    y = m_pSample_buf + 64*1 + (row & 7) * 8;
-
-  c = m_pSample_buf + 64*2 + (row >> 1) * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int j = 0; j < 8; j++)
-    {
-      int cb = c[0+j];
-      int cr = c[64+j];
-
-      int rc = m_crr[cr];
-      int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
-      int bc = m_cbb[cb];
-
-      int yy = y[j];
-      d0[0] = clamp(yy+rc);
-      d0[1] = clamp(yy+gc);
-      d0[2] = clamp(yy+bc);
-      d0[3] = 255;
-
-      yy = y[8+j];
-      d1[0] = clamp(yy+rc);
-      d1[1] = clamp(yy+gc);
-      d1[2] = clamp(yy+bc);
-      d1[3] = 255;
-
-      d0 += 4;
-      d1 += 4;
-    }
-
-    y += 64*4;
-    c += 64*4;
-  }
-}
-
-// YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB
-void jpeg_decoder::H2V2Convert()
-{
-    int row = m_max_mcu_y_size - m_mcu_lines_left;
-    uint8 *d0 = m_pScan_line_0;
-    uint8 *d1 = m_pScan_line_1;
-    uint8 *y;
-    uint8 *c;
-
-    if (row < 8)
-        y = m_pSample_buf + row * 8;
-    else
-        y = m_pSample_buf + 64*2 + (row & 7) * 8;
-
-    c = m_pSample_buf + 64*4 + (row >> 1) * 8;
-
-    for (int i = m_max_mcus_per_row; i > 0; i--)
-    {
-        for (int l = 0; l < 2; l++)
-        {
-            for (int j = 0; j < 8; j += 2)
-            {
-                int cb = c[0];
-                int cr = c[64];
-
-                int rc = m_crr[cr];
-                int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
-                int bc = m_cbb[cb];
-
-                int yy = y[j];
-                d0[0] = clamp(yy+rc);
-                d0[1] = clamp(yy+gc);
-                d0[2] = clamp(yy+bc);
-                d0[3] = 255;
-
-                yy = y[j+1];
-                d0[4] = clamp(yy+rc);
-                d0[5] = clamp(yy+gc);
-                d0[6] = clamp(yy+bc);
-                d0[7] = 255;
-
-                yy = y[j+8];
-                d1[0] = clamp(yy+rc);
-                d1[1] = clamp(yy+gc);
-                d1[2] = clamp(yy+bc);
-                d1[3] = 255;
-
-                yy = y[j+8+1];
-                d1[4] = clamp(yy+rc);
-                d1[5] = clamp(yy+gc);
-                d1[6] = clamp(yy+bc);
-                d1[7] = 255;
-
-                d0 += 8;
-                d1 += 8;
-
-                c++;
-            }
-            y += 64;
-        }
-
-        y += 64*6 - 64*2;
-        c += 64*6 - 8;
-    }
-}
-
-// Y (1 block per MCU) to 8-bit grayscale
-void jpeg_decoder::gray_convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-  uint8 *d = m_pScan_line_0;
-  uint8 *s = m_pSample_buf + row * 8;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    *(uint *)d = *(uint *)s;
-    *(uint *)(&d[4]) = *(uint *)(&s[4]);
-
-    s += 64;
-    d += 8;
-  }
-}
-
-void jpeg_decoder::expanded_convert()
-{
-  int row = m_max_mcu_y_size - m_mcu_lines_left;
-
-  uint8* Py = m_pSample_buf + (row / 8) * 64 * m_comp_h_samp[0] + (row & 7) * 8;
-
-  uint8* d = m_pScan_line_0;
-
-  for (int i = m_max_mcus_per_row; i > 0; i--)
-  {
-    for (int k = 0; k < m_max_mcu_x_size; k += 8)
-    {
-      const int Y_ofs = k * 8;
-      const int Cb_ofs = Y_ofs + 64 * m_expanded_blocks_per_component;
-      const int Cr_ofs = Y_ofs + 64 * m_expanded_blocks_per_component * 2;
-      for (int j = 0; j < 8; j++)
-      {
-        int y = Py[Y_ofs + j];
-        int cb = Py[Cb_ofs + j];
-        int cr = Py[Cr_ofs + j];
-
-        d[0] = clamp(y + m_crr[cr]);
-        d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
-        d[2] = clamp(y + m_cbb[cb]);
-        d[3] = 255;
-
-        d += 4;
-      }
-    }
-
-    Py += 64 * m_expanded_blocks_per_mcu;
-  }
-}
-
-// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
-void jpeg_decoder::find_eoi()
-{
-  if (!m_progressive_flag)
-  {
-    // Attempt to read the EOI marker.
-    //get_bits_no_markers(m_bits_left & 7);
-
-    // Prime the bit buffer
-    m_bits_left = 16;
-    get_bits(16);
-    get_bits(16);
-
-    // The next marker _should_ be EOI
-    process_markers();
-  }
-
-  m_total_bytes_read -= m_in_buf_left;
-}
-
-int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len)
-{
-  if ((m_error_code) || (!m_ready_flag))
-    return JPGD_FAILED;
-
-  if (m_total_lines_left == 0)
-    return JPGD_DONE;
-
-  if (m_mcu_lines_left == 0)
-  {
-    if (setjmp(m_jmp_state))
-      return JPGD_FAILED;
-
-    if (m_progressive_flag)
-      load_next_row();
-    else
-      decode_next_row();
-
-    // Find the EOI marker if that was the last row.
-    if (m_total_lines_left <= m_max_mcu_y_size)
-      find_eoi();
-
-    m_mcu_lines_left = m_max_mcu_y_size;
-  }
-
-  if (m_freq_domain_chroma_upsample)
-  {
-    expanded_convert();
-    *pScan_line = m_pScan_line_0;
-  }
-  else
-  {
-    switch (m_scan_type)
-    {
-      case JPGD_YH2V2:
-      {
-        if ((m_mcu_lines_left & 1) == 0)
-        {
-          H2V2Convert();
-          *pScan_line = m_pScan_line_0;
-        }
-        else
-          *pScan_line = m_pScan_line_1;
-
-        break;
-      }
-      case JPGD_YH2V1:
-      {
-        H2V1Convert();
-        *pScan_line = m_pScan_line_0;
-        break;
-      }
-      case JPGD_YH1V2:
-      {
-        if ((m_mcu_lines_left & 1) == 0)
-        {
-          H1V2Convert();
-          *pScan_line = m_pScan_line_0;
-        }
-        else
-          *pScan_line = m_pScan_line_1;
-
-        break;
-      }
-      case JPGD_YH1V1:
-      {
-        H1V1Convert();
-        *pScan_line = m_pScan_line_0;
-        break;
-      }
-      case JPGD_GRAYSCALE:
-      {
-        gray_convert();
-        *pScan_line = m_pScan_line_0;
-
-        break;
-      }
-    }
-  }
-
-  *pScan_line_len = m_real_dest_bytes_per_scan_line;
-
-  m_mcu_lines_left--;
-  m_total_lines_left--;
-
-  return JPGD_SUCCESS;
-}
-
-// Creates the tables needed for efficient Huffman decoding.
-void jpeg_decoder::make_huff_table(int index, huff_tables *pH)
-{
-  int p, i, l, si;
-  uint8 huffsize[257];
-  uint huffcode[257];
-  uint code;
-  uint subtree;
-  int code_size;
-  int lastp;
-  int nextfreeentry;
-  int currententry;
-
-  pH->ac_table = m_huff_ac[index] != 0;
-
-  p = 0;
-
-  for (l = 1; l <= 16; l++)
-  {
-    for (i = 1; i <= m_huff_num[index][l]; i++)
-      huffsize[p++] = static_cast<uint8>(l);
-  }
-
-  huffsize[p] = 0;
-
-  lastp = p;
-
-  code = 0;
-  si = huffsize[0];
-  p = 0;
-
-  while (huffsize[p])
-  {
-    while (huffsize[p] == si)
-    {
-      huffcode[p++] = code;
-      code++;
-    }
-
-    code <<= 1;
-    si++;
-  }
-
-  memset(pH->look_up, 0, sizeof(pH->look_up));
-  memset(pH->look_up2, 0, sizeof(pH->look_up2));
-  memset(pH->tree, 0, sizeof(pH->tree));
-  memset(pH->code_size, 0, sizeof(pH->code_size));
-
-  nextfreeentry = -1;
-
-  p = 0;
-
-  while (p < lastp)
-  {
-    i = m_huff_val[index][p];
-    code = huffcode[p];
-    code_size = huffsize[p];
-
-    pH->code_size[i] = static_cast<uint8>(code_size);
-
-    if (code_size <= 8)
-    {
-      code <<= (8 - code_size);
-
-      for (l = 1 << (8 - code_size); l > 0; l--)
-      {
-        JPGD_ASSERT(i < 256);
-
-        pH->look_up[code] = i;
-
-        bool has_extrabits = false;
-                int extra_bits = 0;
-        int num_extra_bits = i & 15;
-
-        int bits_to_fetch = code_size;
-        if (num_extra_bits)
-        {
-          int total_codesize = code_size + num_extra_bits;
-          if (total_codesize <= 8)
-          {
-            has_extrabits = true;
-            extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
-            JPGD_ASSERT(extra_bits <= 0x7FFF);
-            bits_to_fetch += num_extra_bits;
-          }
-        }
-
-        if (!has_extrabits)
-          pH->look_up2[code] = i | (bits_to_fetch << 8);
-        else
-          pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
-
-        code++;
-      }
-    }
-    else
-    {
-      subtree = (code >> (code_size - 8)) & 0xFF;
-
-      currententry = pH->look_up[subtree];
-
-      if (currententry == 0)
-      {
-        pH->look_up[subtree] = currententry = nextfreeentry;
-        pH->look_up2[subtree] = currententry = nextfreeentry;
-
-        nextfreeentry -= 2;
-      }
-
-      code <<= (16 - (code_size - 8));
-
-      for (l = code_size; l > 9; l--)
-      {
-        if ((code & 0x8000) == 0)
-          currententry--;
-
-        if (pH->tree[-currententry - 1] == 0)
-        {
-          pH->tree[-currententry - 1] = nextfreeentry;
-
-          currententry = nextfreeentry;
-
-          nextfreeentry -= 2;
-        }
-        else
-          currententry = pH->tree[-currententry - 1];
-
-        code <<= 1;
-      }
-
-      if ((code & 0x8000) == 0)
-        currententry--;
-
-      pH->tree[-currententry - 1] = i;
-    }
-
-    p++;
-  }
-}
-
-// Verifies the quantization tables needed for this scan are available.
-void jpeg_decoder::check_quant_tables()
-{
-  for (int i = 0; i < m_comps_in_scan; i++)
-    if (m_quant[m_comp_quant[m_comp_list[i]]] == NULL)
-      stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
-}
-
-// Verifies that all the Huffman tables needed for this scan are available.
-void jpeg_decoder::check_huff_tables()
-{
-  for (int i = 0; i < m_comps_in_scan; i++)
-  {
-    if ((m_spectral_start == 0) && (m_huff_num[m_comp_dc_tab[m_comp_list[i]]] == NULL))
-      stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
-
-    if ((m_spectral_end > 0) && (m_huff_num[m_comp_ac_tab[m_comp_list[i]]] == NULL))
-      stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
-  }
-
-  for (int i = 0; i < JPGD_MAX_HUFF_TABLES; i++)
-    if (m_huff_num[i])
-    {
-      if (!m_pHuff_tabs[i])
-        m_pHuff_tabs[i] = (huff_tables *)alloc(sizeof(huff_tables));
-
-      make_huff_table(i, m_pHuff_tabs[i]);
-    }
-}
-
-// Determines the component order inside each MCU.
-// Also calcs how many MCU's are on each row, etc.
-void jpeg_decoder::calc_mcu_block_order()
-{
-  int component_num, component_id;
-  int max_h_samp = 0, max_v_samp = 0;
-
-  for (component_id = 0; component_id < m_comps_in_frame; component_id++)
-  {
-    if (m_comp_h_samp[component_id] > max_h_samp)
-      max_h_samp = m_comp_h_samp[component_id];
-
-    if (m_comp_v_samp[component_id] > max_v_samp)
-      max_v_samp = m_comp_v_samp[component_id];
-  }
-
-  for (component_id = 0; component_id < m_comps_in_frame; component_id++)
-  {
-    m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8;
-    m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8;
-  }
-
-  if (m_comps_in_scan == 1)
-  {
-    m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]];
-    m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]];
-  }
-  else
-  {
-    m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp;
-    m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp;
-  }
-
-  if (m_comps_in_scan == 1)
-  {
-    m_mcu_org[0] = m_comp_list[0];
-
-    m_blocks_per_mcu = 1;
-  }
-  else
-  {
-    m_blocks_per_mcu = 0;
-
-    for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-    {
-      int num_blocks;
-
-      component_id = m_comp_list[component_num];
-
-      num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id];
-
-      while (num_blocks--)
-        m_mcu_org[m_blocks_per_mcu++] = component_id;
-    }
-  }
-}
-
-// Starts a new scan.
-int jpeg_decoder::init_scan()
-{
-  if (!locate_sos_marker())
-    return JPGD_FALSE;
-
-  calc_mcu_block_order();
-
-  check_huff_tables();
-
-  check_quant_tables();
-
-  memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
-
-  m_eob_run = 0;
-
-  if (m_restart_interval)
-  {
-    m_restarts_left = m_restart_interval;
-    m_next_restart_num = 0;
-  }
-
-  fix_in_buffer();
-
-  return JPGD_TRUE;
-}
-
-// Starts a frame. Determines if the number of components or sampling factors
-// are supported.
-void jpeg_decoder::init_frame()
-{
-  int i;
-
-  if (m_comps_in_frame == 1)
-  {
-    if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-
-    m_scan_type = JPGD_GRAYSCALE;
-    m_max_blocks_per_mcu = 1;
-    m_max_mcu_x_size = 8;
-    m_max_mcu_y_size = 8;
-  }
-  else if (m_comps_in_frame == 3)
-  {
-    if ( ((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
-         ((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)) )
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-
-    if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
-    {
-      m_scan_type = JPGD_YH1V1;
-
-      m_max_blocks_per_mcu = 3;
-      m_max_mcu_x_size = 8;
-      m_max_mcu_y_size = 8;
-    }
-    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
-    {
-      m_scan_type = JPGD_YH2V1;
-      m_max_blocks_per_mcu = 4;
-      m_max_mcu_x_size = 16;
-      m_max_mcu_y_size = 8;
-    }
-    else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
-    {
-      m_scan_type = JPGD_YH1V2;
-      m_max_blocks_per_mcu = 4;
-      m_max_mcu_x_size = 8;
-      m_max_mcu_y_size = 16;
-    }
-    else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
-    {
-      m_scan_type = JPGD_YH2V2;
-      m_max_blocks_per_mcu = 6;
-      m_max_mcu_x_size = 16;
-      m_max_mcu_y_size = 16;
-    }
-    else
-      stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
-  }
-  else
-    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
-
-  m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
-  m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
-
-  // These values are for the *destination* pixels: after conversion.
-  if (m_scan_type == JPGD_GRAYSCALE)
-    m_dest_bytes_per_pixel = 1;
-  else
-    m_dest_bytes_per_pixel = 4;
-
-  m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;
-
-  m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
-
-  // Initialize two scan line buffers.
-  m_pScan_line_0 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
-  if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2))
-    m_pScan_line_1 = (uint8 *)alloc(m_dest_bytes_per_scan_line, true);
-
-  m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
-
-  // Should never happen
-  if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
-    stop_decoding(JPGD_ASSERTION_ERROR);
-
-  // Allocate the coefficient buffer, enough for one MCU
-  m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
-
-  for (i = 0; i < m_max_blocks_per_mcu; i++)
-    m_mcu_block_max_zag[i] = 64;
-
-  m_expanded_blocks_per_component = m_comp_h_samp[0] * m_comp_v_samp[0];
-  m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
-  m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
-    // Freq. domain chroma upsampling is only supported for H2V2 subsampling factor (the most common one I've seen).
-  m_freq_domain_chroma_upsample = false;
-#if JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING
-  m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3);
-#endif
-
-  if (m_freq_domain_chroma_upsample)
-    m_pSample_buf = (uint8 *)alloc(m_expanded_blocks_per_row * 64);
-  else
-    m_pSample_buf = (uint8 *)alloc(m_max_blocks_per_row * 64);
-
-  m_total_lines_left = m_image_y_size;
-
-  m_mcu_lines_left = 0;
-
-  create_look_ups();
-}
-
-// The coeff_buf series of methods originally stored the coefficients
-// into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
-// was used to make this process more efficient. Now, we can store the entire
-// thing in RAM.
-jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
-{
-  coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf));
-
-  cb->block_num_x = block_num_x;
-  cb->block_num_y = block_num_y;
-  cb->block_len_x = block_len_x;
-  cb->block_len_y = block_len_y;
-  cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);
-  cb->pData = (uint8 *)alloc(cb->block_size * block_num_x * block_num_y, true);
-  return cb;
-}
-
-inline jpgd_block_t *jpeg_decoder::coeff_buf_getp(coeff_buf *cb, int block_x, int block_y)
-{
-  JPGD_ASSERT((block_x < cb->block_num_x) && (block_y < cb->block_num_y));
-  return (jpgd_block_t *)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x));
-}
-
-// The following methods decode the various types of m_blocks encountered
-// in progressively encoded images.
-void jpeg_decoder::decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int s, r;
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
-
-  if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0)
-  {
-    r = pD->get_bits_no_markers(s);
-    s = JPGD_HUFF_EXTEND(r, s);
-  }
-
-  pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
-
-  p[0] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
-}
-
-void jpeg_decoder::decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  if (pD->get_bits_no_markers(1))
-  {
-    jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
-
-    p[0] |= (1 << pD->m_successive_low);
-  }
-}
-
-void jpeg_decoder::decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int k, s, r;
-
-  if (pD->m_eob_run)
-  {
-    pD->m_eob_run--;
-    return;
-  }
-
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
-
-  for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
-  {
-    s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
-
-    r = s >> 4;
-    s &= 15;
-
-    if (s)
-    {
-      if ((k += r) > 63)
-        pD->stop_decoding(JPGD_DECODE_ERROR);
-
-      r = pD->get_bits_no_markers(s);
-      s = JPGD_HUFF_EXTEND(r, s);
-
-      p[g_ZAG[k]] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
-    }
-    else
-    {
-      if (r == 15)
-      {
-        if ((k += 15) > 63)
-          pD->stop_decoding(JPGD_DECODE_ERROR);
-      }
-      else
-      {
-        pD->m_eob_run = 1 << r;
-
-        if (r)
-          pD->m_eob_run += pD->get_bits_no_markers(r);
-
-        pD->m_eob_run--;
-
-        break;
-      }
-    }
-  }
-}
-
-void jpeg_decoder::decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
-{
-  int s, k, r;
-  int p1 = 1 << pD->m_successive_low;
-  int m1 = (-1) << pD->m_successive_low;
-  jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
-  
-  JPGD_ASSERT(pD->m_spectral_end <= 63);
-  
-  k = pD->m_spectral_start;
-  
-  if (pD->m_eob_run == 0)
-  {
-    for ( ; k <= pD->m_spectral_end; k++)
-    {
-      s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
-
-      r = s >> 4;
-      s &= 15;
-
-      if (s)
-      {
-        if (s != 1)
-          pD->stop_decoding(JPGD_DECODE_ERROR);
-
-        if (pD->get_bits_no_markers(1))
-          s = p1;
-        else
-          s = m1;
-      }
-      else
-      {
-        if (r != 15)
-        {
-          pD->m_eob_run = 1 << r;
-
-          if (r)
-            pD->m_eob_run += pD->get_bits_no_markers(r);
-
-          break;
-        }
-      }
-
-      do
-      {
-        jpgd_block_t *this_coef = p + g_ZAG[k & 63];
-
-        if (*this_coef != 0)
-        {
-          if (pD->get_bits_no_markers(1))
-          {
-            if ((*this_coef & p1) == 0)
-            {
-              if (*this_coef >= 0)
-                *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
-              else
-                *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
-            }
-          }
-        }
-        else
-        {
-          if (--r < 0)
-            break;
-        }
-
-        k++;
-
-      } while (k <= pD->m_spectral_end);
-
-      if ((s) && (k < 64))
-      {
-        p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
-      }
-    }
-  }
-
-  if (pD->m_eob_run > 0)
-  {
-    for ( ; k <= pD->m_spectral_end; k++)
-    {
-      jpgd_block_t *this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
-
-      if (*this_coef != 0)
-      {
-        if (pD->get_bits_no_markers(1))
-        {
-          if ((*this_coef & p1) == 0)
-          {
-            if (*this_coef >= 0)
-              *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
-            else
-              *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
-          }
-        }
-      }
-    }
-
-    pD->m_eob_run--;
-  }
-}
-
-// Decode a scan in a progressively encoded image.
-void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
-{
-  int mcu_row, mcu_col, mcu_block;
-  int block_x_mcu[JPGD_MAX_COMPONENTS], m_block_y_mcu[JPGD_MAX_COMPONENTS];
-
-  memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
-
-  for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++)
-  {
-    int component_num, component_id;
-
-    memset(block_x_mcu, 0, sizeof(block_x_mcu));
-
-    for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
-    {
-      int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
-
-      if ((m_restart_interval) && (m_restarts_left == 0))
-        process_restart();
-
-      for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
-      {
-        component_id = m_mcu_org[mcu_block];
-
-        decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
-
-        if (m_comps_in_scan == 1)
-          block_x_mcu[component_id]++;
-        else
-        {
-          if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
-          {
-            block_x_mcu_ofs = 0;
-
-            if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
-            {
-              block_y_mcu_ofs = 0;
-              block_x_mcu[component_id] += m_comp_h_samp[component_id];
-            }
-          }
-        }
-      }
-
-      m_restarts_left--;
-    }
-
-    if (m_comps_in_scan == 1)
-      m_block_y_mcu[m_comp_list[0]]++;
-    else
-    {
-      for (component_num = 0; component_num < m_comps_in_scan; component_num++)
-      {
-        component_id = m_comp_list[component_num];
-        m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
-      }
-    }
-  }
-}
-
-// Decode a progressively encoded image.
-void jpeg_decoder::init_progressive()
-{
-  int i;
-
-  if (m_comps_in_frame == 4)
-    stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
-
-  // Allocate the coefficient buffers.
-  for (i = 0; i < m_comps_in_frame; i++)
-  {
-    m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1);
-    m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8);
-  }
-
-  for ( ; ; )
-  {
-    int dc_only_scan, refinement_scan;
-    pDecode_block_func decode_block_func;
-
-    if (!init_scan())
-      break;
-
-    dc_only_scan = (m_spectral_start == 0);
-    refinement_scan = (m_successive_high != 0);
-
-    if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
-      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-
-    if (dc_only_scan)
-    {
-      if (m_spectral_end)
-        stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-    }
-    else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
-      stop_decoding(JPGD_BAD_SOS_SPECTRAL);
-
-    if ((refinement_scan) && (m_successive_low != m_successive_high - 1))
-      stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
-
-    if (dc_only_scan)
-    {
-      if (refinement_scan)
-        decode_block_func = decode_block_dc_refine;
-      else
-        decode_block_func = decode_block_dc_first;
-    }
-    else
-    {
-      if (refinement_scan)
-        decode_block_func = decode_block_ac_refine;
-      else
-        decode_block_func = decode_block_ac_first;
-    }
-
-    decode_scan(decode_block_func);
-
-    m_bits_left = 16;
-    get_bits(16);
-    get_bits(16);
-  }
-
-  m_comps_in_scan = m_comps_in_frame;
-
-  for (i = 0; i < m_comps_in_frame; i++)
-    m_comp_list[i] = i;
-
-  calc_mcu_block_order();
-}
-
-void jpeg_decoder::init_sequential()
-{
-  if (!init_scan())
-    stop_decoding(JPGD_UNEXPECTED_MARKER);
-}
-
-void jpeg_decoder::decode_start()
-{
-  init_frame();
-
-  if (m_progressive_flag)
-    init_progressive();
-  else
-    init_sequential();
-}
-
-void jpeg_decoder::decode_init(jpeg_decoder_stream *pStream)
-{
-  init(pStream);
-  locate_sof_marker();
-}
-
-jpeg_decoder::jpeg_decoder(jpeg_decoder_stream *pStream)
-{
-  if (setjmp(m_jmp_state))
-    return;
-  decode_init(pStream);
-}
-
-int jpeg_decoder::begin_decoding()
-{
-  if (m_ready_flag)
-    return JPGD_SUCCESS;
-
-  if (m_error_code)
-    return JPGD_FAILED;
-
-  if (setjmp(m_jmp_state))
-    return JPGD_FAILED;
-
-  decode_start();
-
-  m_ready_flag = true;
-
-  return JPGD_SUCCESS;
-}
-
-jpeg_decoder::~jpeg_decoder()
-{
-  free_all_blocks();
-}
-
-jpeg_decoder_file_stream::jpeg_decoder_file_stream()
-{
-  m_pFile = NULL;
-  m_eof_flag = false;
-  m_error_flag = false;
-}
-
-void jpeg_decoder_file_stream::close()
-{
-  if (m_pFile)
-  {
-    fclose(m_pFile);
-    m_pFile = NULL;
-  }
-
-  m_eof_flag = false;
-  m_error_flag = false;
-}
-
-jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
-{
-  close();
-}
-
-bool jpeg_decoder_file_stream::open(const char *Pfilename)
-{
-  close();
-
-  m_eof_flag = false;
-  m_error_flag = false;
-
-#if defined(_MSC_VER)
-  m_pFile = NULL;
-  fopen_s(&m_pFile, Pfilename, "rb");
-#else
-  m_pFile = fopen(Pfilename, "rb");
-#endif
-  return m_pFile != NULL;
-}
-
-int jpeg_decoder_file_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
-{
-  if (!m_pFile)
-    return -1;
-
-  if (m_eof_flag)
-  {
-    *pEOF_flag = true;
-    return 0;
-  }
-
-  if (m_error_flag)
-    return -1;
-
-  int bytes_read = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
-  if (bytes_read < max_bytes_to_read)
-  {
-    if (ferror(m_pFile))
-    {
-      m_error_flag = true;
-      return -1;
-    }
-
-    m_eof_flag = true;
-    *pEOF_flag = true;
-  }
-
-  return bytes_read;
-}
-
-bool jpeg_decoder_mem_stream::open(const uint8 *pSrc_data, uint size)
-{
-  close();
-  m_pSrc_data = pSrc_data;
-  m_ofs = 0;
-  m_size = size;
-  return true;
-}
-
-int jpeg_decoder_mem_stream::read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag)
-{
-  *pEOF_flag = false;
-
-  if (!m_pSrc_data)
-    return -1;
-
-  uint bytes_remaining = m_size - m_ofs;
-  if ((uint)max_bytes_to_read > bytes_remaining)
-  {
-    max_bytes_to_read = bytes_remaining;
-    *pEOF_flag = true;
-  }
-
-  memcpy(pBuf, m_pSrc_data + m_ofs, max_bytes_to_read);
-  m_ofs += max_bytes_to_read;
-
-  return max_bytes_to_read;
-}
-
-unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps)
-{
-  if (!actual_comps)
-    return NULL;
-  *actual_comps = 0;
-
-  if ((!pStream) || (!width) || (!height) || (!req_comps))
-    return NULL;
-
-  if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4))
-    return NULL;
-
-  jpeg_decoder decoder(pStream);
-  if (decoder.get_error_code() != JPGD_SUCCESS)
-    return NULL;
-
-  const int image_width = decoder.get_width(), image_height = decoder.get_height();
-  *width = image_width;
-  *height = image_height;
-  *actual_comps = decoder.get_num_components();
-
-  if (decoder.begin_decoding() != JPGD_SUCCESS)
-    return NULL;
-
-  const int dst_bpl = image_width * req_comps;
-
-  uint8 *pImage_data = (uint8*)jpgd_malloc(dst_bpl * image_height);
-  if (!pImage_data)
-    return NULL;
-
-  for (int y = 0; y < image_height; y++)
-  {
-    const uint8* pScan_line;
-    uint scan_line_len;
-    if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
-    {
-      jpgd_free(pImage_data);
-      return NULL;
-    }
-
-    uint8 *pDst = pImage_data + y * dst_bpl;
-
-#if 1
-    //Return as BGRA
-    if ((req_comps == 4) && (decoder.get_num_components() == 3)) {
-      for (int x = 0; x < image_width; x++)
-      {
-        pDst[0] = pScan_line[x*4+2];
-        pDst[1] = pScan_line[x*4+1];
-        pDst[2] = pScan_line[x*4+0];
-        pDst[3] = 255;
-        pDst += 4;
-      }
-    } else
-#endif
-    if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3)))
-      memcpy(pDst, pScan_line, dst_bpl);
-    else if (decoder.get_num_components() == 1)
-    {
-      if (req_comps == 3)
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          uint8 luma = pScan_line[x];
-          pDst[0] = luma;
-          pDst[1] = luma;
-          pDst[2] = luma;
-          pDst += 3;
-        }
-      }
-      else
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          uint8 luma = pScan_line[x];
-          pDst[0] = luma;
-          pDst[1] = luma;
-          pDst[2] = luma;
-          pDst[3] = 255;
-          pDst += 4;
-        }
-      }
-    }
-    else if (decoder.get_num_components() == 3)
-    {
-      if (req_comps == 1)
-      {
-        const int YR = 19595, YG = 38470, YB = 7471;
-        for (int x = 0; x < image_width; x++)
-        {
-          int r = pScan_line[x*4+0];
-          int g = pScan_line[x*4+1];
-          int b = pScan_line[x*4+2];
-          *pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
-        }
-      }
-      else
-      {
-        for (int x = 0; x < image_width; x++)
-        {
-          pDst[0] = pScan_line[x*4+0];
-          pDst[1] = pScan_line[x*4+1];
-          pDst[2] = pScan_line[x*4+2];
-          pDst += 3;
-        }
-      }
-    }
-  }
-
-  return pImage_data;
-}
-
-unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps)
-{
-  jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size);
-  return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps);
-}
-
-unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps)
-{
-  jpgd::jpeg_decoder_file_stream file_stream;
-  if (!file_stream.open(pSrc_filename))
-    return NULL;
-  return decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps);
-}
-
-} // namespace jpgd
diff --git a/src/loaders/jpg/jpgd.h b/src/loaders/jpg/jpgd.h
deleted file mode 100644
index 46069a1..0000000
--- a/src/loaders/jpg/jpgd.h
+++ /dev/null
@@ -1,319 +0,0 @@
-// jpgd.h - C++ class for JPEG decompression.
-// Public domain, Rich Geldreich <richgel99@gmail.com>
-#ifndef JPEG_DECODER_H
-#define JPEG_DECODER_H
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <setjmp.h>
-
-#ifdef _MSC_VER
-  #define JPGD_NORETURN __declspec(noreturn) 
-#elif defined(__GNUC__)
-  #define JPGD_NORETURN __attribute__ ((noreturn))
-#else
-  #define JPGD_NORETURN
-#endif
-
-namespace jpgd
-{
-  typedef unsigned char  uint8;
-  typedef   signed short int16;
-  typedef unsigned short uint16;
-  typedef unsigned int   uint;
-  typedef   signed int   int32;
-
-  // Loads a JPEG image from a memory buffer or a file.
-  // req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
-  // On return, width/height will be set to the image's dimensions, and actual_comps will be set to the either 1 (grayscale) or 3 (RGB).
-  // Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
-  // Requesting a 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
-  unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps);
-  unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps);
-
-  // Success/failure error codes.
-  enum jpgd_status
-  {
-    JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
-    JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE, 
-    JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS, 
-    JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
-    JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
-    JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
-    JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
-    JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
-    JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM
-  };
-    
-  // Input stream interface.
-  // Derive from this class to read input data from sources other than files or memory. Set m_eof_flag to true when no more data is available.
-  // The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
-  // It the input stream contains data after the JPEG stream's EOI (end of image) marker it will probably be pulled into the internal buffer.
-  // Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
-  class jpeg_decoder_stream
-  {
-  public:
-    jpeg_decoder_stream() { }
-    virtual ~jpeg_decoder_stream() { }
-
-    // The read() method is called when the internal input buffer is empty.
-    // Parameters:
-    // pBuf - input buffer
-    // max_bytes_to_read - maximum bytes that can be written to pBuf
-    // pEOF_flag - set this to true if at end of stream (no more bytes remaining)
-    // Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
-    // Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag) = 0;
-  };
-
-  // stdio FILE stream class.
-  class jpeg_decoder_file_stream : public jpeg_decoder_stream
-  {
-    jpeg_decoder_file_stream(const jpeg_decoder_file_stream &);
-    jpeg_decoder_file_stream &operator =(const jpeg_decoder_file_stream &);
-
-    FILE *m_pFile;
-    bool m_eof_flag, m_error_flag;
-
-  public:
-    jpeg_decoder_file_stream();
-    virtual ~jpeg_decoder_file_stream();
-    
-    bool open(const char *Pfilename);
-    void close();
-
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
-  };
-
-  // Memory stream class.
-  class jpeg_decoder_mem_stream : public jpeg_decoder_stream
-  {
-    const uint8 *m_pSrc_data;
-    uint m_ofs, m_size;
-
-  public:
-    jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
-    jpeg_decoder_mem_stream(const uint8 *pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
-
-    virtual ~jpeg_decoder_mem_stream() { }
-
-    bool open(const uint8 *pSrc_data, uint size);
-    void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
-    
-    virtual int read(uint8 *pBuf, int max_bytes_to_read, bool *pEOF_flag);
-  };
-
-  // Loads JPEG file from a jpeg_decoder_stream.
-  unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps);
-
-  enum 
-  { 
-    JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4, 
-    JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384 
-  };
-          
-  typedef int16 jpgd_quant_t;
-  typedef int16 jpgd_block_t;
-
-  class jpeg_decoder
-  {
-  public:
-    // Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
-    // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
-    jpeg_decoder(jpeg_decoder_stream *pStream);
-
-    ~jpeg_decoder();
-
-    // Call this method after constructing the object to begin decompression.
-    // If JPGD_SUCCESS is returned you may then call decode() on each scanline.
-    int begin_decoding();
-
-    // Returns the next scan line.
-    // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). 
-    // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
-    // Returns JPGD_SUCCESS if a scan line has been returned.
-    // Returns JPGD_DONE if all scan lines have been returned.
-    // Returns JPGD_FAILED if an error occurred. Call get_error_code() for a more info.
-    int decode(const void** pScan_line, uint* pScan_line_len);
-    
-    inline jpgd_status get_error_code() const { return m_error_code; }
-
-    inline int get_width() const { return m_image_x_size; }
-    inline int get_height() const { return m_image_y_size; }
-
-    inline int get_num_components() const { return m_comps_in_frame; }
-
-    inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
-    inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
-
-    // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
-    inline int get_total_bytes_read() const { return m_total_bytes_read; }
-    
-  private:
-    jpeg_decoder(const jpeg_decoder &);
-    jpeg_decoder &operator =(const jpeg_decoder &);
-
-    typedef void (*pDecode_block_func)(jpeg_decoder *, int, int, int);
-
-    struct huff_tables
-    {
-      bool ac_table;
-      uint  look_up[256];
-      uint  look_up2[256];
-      uint8 code_size[256];
-      uint  tree[512];
-    };
-
-    struct coeff_buf
-    {
-      uint8 *pData;
-      int block_num_x, block_num_y;
-      int block_len_x, block_len_y;
-      int block_size;
-    };
-
-    struct mem_block
-    {
-      mem_block *m_pNext;
-      size_t m_used_count;
-      size_t m_size;
-      char m_data[1];
-    };
-
-    jmp_buf m_jmp_state;
-    mem_block *m_pMem_blocks;
-    int m_image_x_size;
-    int m_image_y_size;
-    jpeg_decoder_stream *m_pStream;
-    int m_progressive_flag;
-    uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
-    uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
-    uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
-    jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
-    int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
-    int m_comps_in_frame;                         // # of components in frame
-    int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
-    int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
-    int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
-    int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
-    int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
-    int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
-    int m_comps_in_scan;                          // # of components in scan
-    int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
-    int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
-    int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
-    int m_spectral_start;                         // spectral selection start
-    int m_spectral_end;                           // spectral selection end
-    int m_successive_low;                         // successive approximation low
-    int m_successive_high;                        // successive approximation high
-    int m_max_mcu_x_size;                         // MCU's max. X size in pixels
-    int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
-    int m_blocks_per_mcu;
-    int m_max_blocks_per_row;
-    int m_mcus_per_row, m_mcus_per_col;
-    int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
-    int m_total_lines_left;                       // total # lines left in image
-    int m_mcu_lines_left;                         // total # lines left in this MCU
-    int m_real_dest_bytes_per_scan_line;
-    int m_dest_bytes_per_scan_line;               // rounded up
-    int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
-    huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
-    coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
-    coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
-    int m_eob_run;
-    int m_block_y_mcu[JPGD_MAX_COMPONENTS];
-    uint8* m_pIn_buf_ofs;
-    int m_in_buf_left;
-    int m_tem_flag;
-    bool m_eof_flag;
-    uint8 m_in_buf_pad_start[128];
-    uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
-    uint8 m_in_buf_pad_end[128];
-    int m_bits_left;
-    uint m_bit_buf;
-    int m_restart_interval;
-    int m_restarts_left;
-    int m_next_restart_num;
-    int m_max_mcus_per_row;
-    int m_max_blocks_per_mcu;
-    int m_expanded_blocks_per_mcu;
-    int m_expanded_blocks_per_row;
-    int m_expanded_blocks_per_component;
-    bool  m_freq_domain_chroma_upsample;
-    int m_max_mcus_per_col;
-    uint m_last_dc_val[JPGD_MAX_COMPONENTS];
-    jpgd_block_t* m_pMCU_coefficients;
-    int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
-    uint8* m_pSample_buf;
-    int m_crr[256];
-    int m_cbb[256];
-    int m_crg[256];
-    int m_cbg[256];
-    uint8* m_pScan_line_0;
-    uint8* m_pScan_line_1;
-    jpgd_status m_error_code;
-    bool m_ready_flag;
-    int m_total_bytes_read;
-
-    void free_all_blocks();
-    JPGD_NORETURN void stop_decoding(jpgd_status status);
-    void *alloc(size_t n, bool zero = false);
-    void word_clear(void *p, uint16 c, uint n);
-    void prep_in_buffer();
-    void read_dht_marker();
-    void read_dqt_marker();
-    void read_sof_marker();
-    void skip_variable_marker();
-    void read_dri_marker();
-    void read_sos_marker();
-    int next_marker();
-    int process_markers();
-    void locate_soi_marker();
-    void locate_sof_marker();
-    int locate_sos_marker();
-    void init(jpeg_decoder_stream * pStream);
-    void create_look_ups();
-    void fix_in_buffer();
-    void transform_mcu(int mcu_row);
-    void transform_mcu_expand(int mcu_row);
-    coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
-    inline jpgd_block_t *coeff_buf_getp(coeff_buf *cb, int block_x, int block_y);
-    void load_next_row();
-    void decode_next_row();
-    void make_huff_table(int index, huff_tables *pH);
-    void check_quant_tables();
-    void check_huff_tables();
-    void calc_mcu_block_order();
-    int init_scan();
-    void init_frame();
-    void process_restart();
-    void decode_scan(pDecode_block_func decode_block_func);
-    void init_progressive();
-    void init_sequential();
-    void decode_start();
-    void decode_init(jpeg_decoder_stream * pStream);
-    void H2V2Convert();
-    void H2V1Convert();
-    void H1V2Convert();
-    void H1V1Convert();
-    void gray_convert();
-    void expanded_convert();
-    void find_eoi();
-    inline uint get_char();
-    inline uint get_char(bool *pPadding_flag);
-    inline void stuff_char(uint8 q);
-    inline uint8 get_octet();
-    inline uint get_bits(int num_bits);
-    inline uint get_bits_no_markers(int numbits);
-    inline int huff_decode(huff_tables *pH);
-    inline int huff_decode(huff_tables *pH, int& extrabits);
-    static inline uint8 clamp(int i);
-    static void decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-    static void decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
-  };
-  
-} // namespace jpgd
-
-#endif // JPEG_DECODER_H
diff --git a/src/loaders/jpg/meson.build b/src/loaders/jpg/meson.build
index 087e36b..ff7e992 100644
--- a/src/loaders/jpg/meson.build
+++ b/src/loaders/jpg/meson.build
@@ -1,7 +1,7 @@
 source_file = [
-   'jpgd.h',
+   'tvgJpgd.h',
    'tvgJpgLoader.h',
-   'jpgd.cpp',
+   'tvgJpgd.cpp',
    'tvgJpgLoader.cpp',
 ]
 
diff --git a/src/loaders/jpg/tvgJpgLoader.cpp b/src/loaders/jpg/tvgJpgLoader.cpp
index 511a888..7e6c375 100644
--- a/src/loaders/jpg/tvgJpgLoader.cpp
+++ b/src/loaders/jpg/tvgJpgLoader.cpp
@@ -20,7 +20,7 @@
  * SOFTWARE.
  */
 
-#include "jpgd.h"
+#include "tvgJpgd.h"
 #include "tvgLoader.h"
 #include "tvgJpgLoader.h"
 
@@ -43,11 +43,11 @@ JpgLoader::~JpgLoader()
 bool JpgLoader::open(const string& path)
 {
     int width, height, actual_comps;
-    image = jpgd::decompress_jpeg_image_from_file(path.c_str(), &width, &height, &actual_comps, 4);
+    image = decompress_jpeg_image_from_file(path.c_str(), &width, &height, &actual_comps, 4);
     if (!image) return false;
 
-    vw = w = width;
-    vh = h = height;
+    vw = w = static_cast<float>(width);
+    vh = h = static_cast<float>(height);
 
     return true;
 }
@@ -55,11 +55,11 @@ bool JpgLoader::open(const string& path)
 bool JpgLoader::open(const char* data, uint32_t size, bool copy)
 {
     int width, height, actual_comps;
-    image = jpgd::decompress_jpeg_image_from_memory((const unsigned char *)data, size, &width, &height, &actual_comps, 4);
+    image = decompress_jpeg_image_from_memory((const unsigned char *)data, size, &width, &height, &actual_comps, 4);
     if (!image) return false;
 
-    vw = w = width;
-    vh = h = height;
+    vw = w = static_cast<float>(width);
+    vh = h = static_cast<float>(height);
 
     return true;
 }
diff --git a/src/loaders/jpg/tvgJpgd.cpp b/src/loaders/jpg/tvgJpgd.cpp
new file mode 100644
index 0000000..7a59829
--- /dev/null
+++ b/src/loaders/jpg/tvgJpgd.cpp
@@ -0,0 +1,3014 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// jpgd.cpp - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+// Alex Evans: Linear memory allocator (taken from jpge.h).
+// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings (all looked harmless)
+//
+// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
+//
+// Chroma upsampling quality: H2V2 is upsampled in the frequency domain, H2V1 and H1V2 are upsampled using point sampling.
+// Chroma upsampling reference: "Fast Scheme for Image Size Change in the Compressed Domain"
+// http://vision.ai.uiuc.edu/~dugad/research/dct/index.html
+
+#include <memory.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <setjmp.h>
+#include <stdint.h>
+#include "tvgJpgd.h"
+
+#ifdef _MSC_VER
+  #pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
+  #define JPGD_NORETURN __declspec(noreturn)
+#elif defined(__GNUC__)
+  #define JPGD_NORETURN __attribute__ ((noreturn))
+#else
+  #define JPGD_NORETURN
+#endif
+
+/************************************************************************/
+/* Internal Class Implementation                                        */
+/************************************************************************/
+
+
+// Set to 1 to enable freq. domain chroma upsampling on images using H2V2 subsampling (0=faster nearest neighbor sampling).
+// This is slower, but results in higher quality on images with highly saturated colors.
+#define JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING 1
+
+#define JPGD_ASSERT(x)
+#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b))
+#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b))
+
+typedef int16_t jpgd_quant_t;
+typedef int16_t jpgd_block_t;
+
+// Success/failure error codes.
+enum jpgd_status
+{
+    JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
+    JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
+    JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
+    JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
+    JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
+    JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
+    JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
+    JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER, JPGD_ASSERTION_ERROR,
+    JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM
+};
+
+enum
+{
+    JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
+    JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 8192, JPGD_MAX_HEIGHT = 16384, JPGD_MAX_WIDTH = 16384 
+};
+
+// Input stream interface.
+// Derive from this class to read input data from sources other than files or memory. Set m_eof_flag to true when no more data is available.
+// The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
+// If the input stream contains data after the JPEG stream's EOI (end of image) marker, it will probably be pulled into the internal buffer.
+// Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
+struct jpeg_decoder_stream
+{
+    jpeg_decoder_stream() { }
+    virtual ~jpeg_decoder_stream() { }
+
+    // The read() method is called when the internal input buffer is empty.
+    // Parameters:
+    // pBuf - input buffer
+    // max_bytes_to_read - maximum bytes that can be written to pBuf
+    // pEOF_flag - set this to true if at end of stream (no more bytes remaining)
+    // Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
+    // Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
+    virtual int read(uint8_t *pBuf, int max_bytes_to_read, bool *pEOF_flag) = 0;
+};
+
+
+// stdio FILE stream class.
+class jpeg_decoder_file_stream : public jpeg_decoder_stream
+{
+    jpeg_decoder_file_stream(const jpeg_decoder_file_stream &);
+    jpeg_decoder_file_stream &operator =(const jpeg_decoder_file_stream &);
+
+    FILE *m_pFile;
+    bool m_eof_flag, m_error_flag;
+
+public:
+    jpeg_decoder_file_stream();
+    virtual ~jpeg_decoder_file_stream();
+    bool open(const char *Pfilename);
+    void close();
+    virtual int read(uint8_t *pBuf, int max_bytes_to_read, bool *pEOF_flag);
+  };
+
+
+// Memory stream class.
+class jpeg_decoder_mem_stream : public jpeg_decoder_stream
+{
+    const uint8_t *m_pSrc_data;
+    uint32_t m_ofs, m_size;
+
+public:
+    jpeg_decoder_mem_stream() : m_pSrc_data(nullptr), m_ofs(0), m_size(0) {}
+    jpeg_decoder_mem_stream(const uint8_t *pSrc_data, uint32_t size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) {}
+    virtual ~jpeg_decoder_mem_stream() {}
+    bool open(const uint8_t *pSrc_data, uint32_t size);
+    void close() { m_pSrc_data = nullptr; m_ofs = 0; m_size = 0; }
+    virtual int read(uint8_t *pBuf, int max_bytes_to_read, bool *pEOF_flag);
+};
+
+
+class jpeg_decoder
+{
+public:
+    // Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
+    // methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
+    jpeg_decoder(jpeg_decoder_stream *pStream);
+    ~jpeg_decoder();
+
+    // Call this method after constructing the object to begin decompression.
+    // If JPGD_SUCCESS is returned you may then call decode() on each scanline.
+    int begin_decoding();
+    // Returns the next scan line.
+    // For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1). 
+    // Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
+    // Returns JPGD_SUCCESS if a scan line has been returned.
+    // Returns JPGD_DONE if all scan lines have been returned.
+    // Returns JPGD_FAILED if an error occurred. Call get_error_code() for more info.
+    int decode(const void** pScan_line, uint32_t* pScan_line_len);
+    inline jpgd_status get_error_code() const { return m_error_code; }
+    inline int get_width() const { return m_image_x_size; }
+    inline int get_height() const { return m_image_y_size; }
+    inline int get_num_components() const { return m_comps_in_frame; }
+    inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
+    inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
+    // Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
+    inline int get_total_bytes_read() const { return m_total_bytes_read; }
+
+private:
+    jpeg_decoder(const jpeg_decoder &);
+    jpeg_decoder &operator =(const jpeg_decoder &);
+
+    typedef void (*pDecode_block_func)(jpeg_decoder *, int, int, int);
+
+    struct huff_tables
+    {
+      bool ac_table;
+      uint32_t  look_up[256];
+      uint32_t  look_up2[256];
+      uint8_t code_size[256];
+      uint32_t  tree[512];
+    };
+
+    struct coeff_buf
+    {
+      uint8_t *pData;
+      int block_num_x, block_num_y;
+      int block_len_x, block_len_y;
+      int block_size;
+    };
+
+    struct mem_block
+    {
+      mem_block *m_pNext;
+      size_t m_used_count;
+      size_t m_size;
+      char m_data[1];
+    };
+
+    jmp_buf m_jmp_state;
+    mem_block *m_pMem_blocks;
+    int m_image_x_size;
+    int m_image_y_size;
+    jpeg_decoder_stream *m_pStream;
+    int m_progressive_flag;
+    uint8_t m_huff_ac[JPGD_MAX_HUFF_TABLES];
+    uint8_t* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
+    uint8_t* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
+    jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
+    int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
+    int m_comps_in_frame;                         // # of components in frame
+    int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
+    int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
+    int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
+    int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
+    int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
+    int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
+    int m_comps_in_scan;                          // # of components in scan
+    int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
+    int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
+    int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
+    int m_spectral_start;                         // spectral selection start
+    int m_spectral_end;                           // spectral selection end
+    int m_successive_low;                         // successive approximation low
+    int m_successive_high;                        // successive approximation high
+    int m_max_mcu_x_size;                         // MCU's max. X size in pixels
+    int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
+    int m_blocks_per_mcu;
+    int m_max_blocks_per_row;
+    int m_mcus_per_row, m_mcus_per_col;
+    int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
+    int m_total_lines_left;                       // total # lines left in image
+    int m_mcu_lines_left;                         // total # lines left in this MCU
+    int m_real_dest_bytes_per_scan_line;
+    int m_dest_bytes_per_scan_line;               // rounded up
+    int m_dest_bytes_per_pixel;                   // 4 (RGBA) or 1 (Y)
+    huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
+    coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
+    coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
+    int m_eob_run;
+    int m_block_y_mcu[JPGD_MAX_COMPONENTS];
+    uint8_t* m_pIn_buf_ofs;
+    int m_in_buf_left;
+    int m_tem_flag;
+    bool m_eof_flag;
+    uint8_t m_in_buf_pad_start[128];
+    uint8_t m_in_buf[JPGD_IN_BUF_SIZE + 128];
+    uint8_t m_in_buf_pad_end[128];
+    int m_bits_left;
+    uint32_t m_bit_buf;
+    int m_restart_interval;
+    int m_restarts_left;
+    int m_next_restart_num;
+    int m_max_mcus_per_row;
+    int m_max_blocks_per_mcu;
+    int m_expanded_blocks_per_mcu;
+    int m_expanded_blocks_per_row;
+    int m_expanded_blocks_per_component;
+    bool  m_freq_domain_chroma_upsample;
+    int m_max_mcus_per_col;
+    uint32_t m_last_dc_val[JPGD_MAX_COMPONENTS];
+    jpgd_block_t* m_pMCU_coefficients;
+    int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
+    uint8_t* m_pSample_buf;
+    int m_crr[256];
+    int m_cbb[256];
+    int m_crg[256];
+    int m_cbg[256];
+    uint8_t* m_pScan_line_0;
+    uint8_t* m_pScan_line_1;
+    jpgd_status m_error_code;
+    bool m_ready_flag;
+    int m_total_bytes_read;
+
+    void free_all_blocks();
+    JPGD_NORETURN void stop_decoding(jpgd_status status);
+    void *alloc(size_t n, bool zero = false);
+    void word_clear(void *p, uint16_t c, uint32_t n);
+    void prep_in_buffer();
+    void read_dht_marker();
+    void read_dqt_marker();
+    void read_sof_marker();
+    void skip_variable_marker();
+    void read_dri_marker();
+    void read_sos_marker();
+    int next_marker();
+    int process_markers();
+    void locate_soi_marker();
+    void locate_sof_marker();
+    int locate_sos_marker();
+    void init(jpeg_decoder_stream * pStream);
+    void create_look_ups();
+    void fix_in_buffer();
+    void transform_mcu(int mcu_row);
+    void transform_mcu_expand(int mcu_row);
+    coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
+    inline jpgd_block_t *coeff_buf_getp(coeff_buf *cb, int block_x, int block_y);
+    void load_next_row();
+    void decode_next_row();
+    void make_huff_table(int index, huff_tables *pH);
+    void check_quant_tables();
+    void check_huff_tables();
+    void calc_mcu_block_order();
+    int init_scan();
+    void init_frame();
+    void process_restart();
+    void decode_scan(pDecode_block_func decode_block_func);
+    void init_progressive();
+    void init_sequential();
+    void decode_start();
+    void decode_init(jpeg_decoder_stream * pStream);
+    void H2V2Convert();
+    void H2V1Convert();
+    void H1V2Convert();
+    void H1V1Convert();
+    void gray_convert();
+    void expanded_convert();
+    void find_eoi();
+    inline uint32_t get_char();
+    inline uint32_t get_char(bool *pPadding_flag);
+    inline void stuff_char(uint8_t q);
+    inline uint8_t get_octet();
+    inline uint32_t get_bits(int num_bits);
+    inline uint32_t get_bits_no_markers(int numbits);
+    inline int huff_decode(huff_tables *pH);
+    inline int huff_decode(huff_tables *pH, int& extrabits);
+    static inline uint8_t clamp(int i);
+    static void decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+    static void decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y);
+};
+
+
+// DCT coefficients are stored in this sequence.
+static int g_ZAG[64] = {  0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+
+enum JPEG_MARKER // second byte of a two-byte JPEG marker (the first byte is 0xFF); names follow ITU-T T.81 Table B.1
+{
+  M_SOF0  = 0xC0, M_SOF1  = 0xC1, M_SOF2  = 0xC2, M_SOF3  = 0xC3, M_SOF5  = 0xC5, M_SOF6  = 0xC6, M_SOF7  = 0xC7, M_JPG   = 0xC8, // start-of-frame (SOFn) family
+  M_SOF9  = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT   = 0xC4, M_DAC   = 0xCC, // remaining SOFn codes, then the two table markers that sit inside the 0xC0..0xCF range
+  M_RST0  = 0xD0, M_RST1  = 0xD1, M_RST2  = 0xD2, M_RST3  = 0xD3, M_RST4  = 0xD4, M_RST5  = 0xD5, M_RST6  = 0xD6, M_RST7  = 0xD7, // restart markers RST0..RST7
+  M_SOI   = 0xD8, M_EOI   = 0xD9, M_SOS   = 0xDA, M_DQT   = 0xDB, M_DNL   = 0xDC, M_DRI   = 0xDD, M_DHP   = 0xDE, M_EXP   = 0xDF, // image start/end, scan start and other segment markers
+  M_APP0  = 0xE0, M_APP15 = 0xEF, M_JPG0  = 0xF0, M_JPG13 = 0xFD, M_COM   = 0xFE, M_TEM   = 0x01, M_ERROR = 0x100, RST0   = 0xD0 // M_ERROR (0x100) is outside the one-byte range; RST0 repeats the value of M_RST0
+};
+
+enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 }; // values 0..4; NOTE(review): HxVy presumably names the luma horizontal/vertical sampling factors -- confirm against init_frame()
+
+#define CONST_BITS  13   /* number of fractional bits in the FIX_* constants below */
+#define PASS1_BITS  2    /* extra fractional bits kept in the output of the row pass */
+#define SCALEDONE ((int32_t)1)
+#define DESCALE(x,n)  (((x) + (SCALEDONE << ((n)-1))) >> (n))   /* x / 2^n with rounding (adds half before shifting) */
+#define DESCALE_ZEROSHIFT(x,n)  (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n))   /* as DESCALE, plus 128 added to the result */
+#define MULTIPLY(var, cnst)  ((var) * (cnst))
+#define CLAMP(i) ((static_cast<uint32_t>(i) > 255) ? (((~(i)) >> 31) & 0xFF) : (i))   /* saturate to 0..255; (i) is parenthesized so expression arguments work; relies on arithmetic >> of negative ints */
+/* FIX(x) = x * 2^CONST_BITS, rounded to the nearest integer. */
+#define FIX_0_298631336  ((int32_t)2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((int32_t)3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((int32_t)4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((int32_t)6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((int32_t)7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((int32_t)9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((int32_t)12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((int32_t)15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((int32_t)16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((int32_t)16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((int32_t)20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((int32_t)25172)       /* FIX(3.072711026) */
+
+
+// Row pass of the 2D IDCT. NONZERO_COLS = number of leading coefficients of the row that may be non-zero; the compiler creates a fast path for each value.
+template <int NONZERO_COLS>
+struct Row
+{
+    static void idct(int* pTemp, const jpgd_block_t* pSrc)
+    {
+        // ACCESS_COL() will be optimized at compile time to either an array access, or 0.
+        #define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
+        // Even-indexed coefficients (0, 2, 4, 6).
+        const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
+        const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+        const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+        const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+        // Scale via unsigned: the sums can be negative, and left-shifting a negative signed value is undefined before C++20.
+        const int tmp0 = static_cast<int>(static_cast<unsigned int>(ACCESS_COL(0) + ACCESS_COL(4)) << CONST_BITS);
+        const int tmp1 = static_cast<int>(static_cast<unsigned int>(ACCESS_COL(0) - ACCESS_COL(4)) << CONST_BITS);
+
+        const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+        // Odd-indexed coefficients (7, 5, 3, 1).
+        const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
+
+        const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+        const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+        const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+        const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+        const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+        const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+        const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+        const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+        const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+        const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+        // Combine both halves and drop CONST_BITS-PASS1_BITS fractional bits with rounding, so PASS1_BITS extra bits remain.
+        pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS-PASS1_BITS);
+        pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS-PASS1_BITS);
+        pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS-PASS1_BITS);
+        pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS-PASS1_BITS);
+        pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS-PASS1_BITS);
+        pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS-PASS1_BITS);
+        pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS-PASS1_BITS);
+        pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS-PASS1_BITS);
+    }
+};
+
+
+// Row with no non-zero coefficients: the row pass leaves the temp row untouched.
+template <> struct Row<0>
+{
+    static void idct(int* pTemp, const jpgd_block_t* pSrc)
+    {
+        // Intentionally empty; the casts only mark both parameters as used.
+        (void)pTemp;
+        (void)pSrc;
+    }
+};
+
+
+// Row whose only possibly non-zero coefficient is the first one: all eight outputs equal it, scaled up by PASS1_BITS.
+template <> struct Row<1>
+{
+    static void idct(int* pTemp, const jpgd_block_t* pSrc)
+    {
+        // Shift as unsigned: the coefficient may be negative, and left-shifting a negative signed value is undefined before C++20.
+        const int dcval = static_cast<int>(static_cast<unsigned int>(pSrc[0]) << PASS1_BITS);
+
+        pTemp[0] = dcval;
+        pTemp[1] = dcval;
+        pTemp[2] = dcval;
+        pTemp[3] = dcval;
+        pTemp[4] = dcval;
+        pTemp[5] = dcval;
+        pTemp[6] = dcval;
+        pTemp[7] = dcval;
+    }
+};
+
+
+// Column pass of the 2D IDCT. NONZERO_ROWS = number of leading rows of pTemp that may be non-zero; the compiler creates a fast path for each value.
+template <int NONZERO_ROWS>
+struct Col
+{
+    static void idct(uint8_t* pDst_ptr, const int* pTemp)
+    {
+        // ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
+        #define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[(x) * 8] : 0)
+        // Even-indexed rows (0, 2, 4, 6); consecutive rows are 8 ints apart in pTemp.
+        const int z2 = ACCESS_ROW(2);
+        const int z3 = ACCESS_ROW(6);
+
+        const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+        const int tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065);
+        const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+        // Scale via unsigned: the sums can be negative, and left-shifting a negative signed value is undefined before C++20.
+        const int tmp0 = static_cast<int>(static_cast<unsigned int>(ACCESS_ROW(0) + ACCESS_ROW(4)) << CONST_BITS);
+        const int tmp1 = static_cast<int>(static_cast<unsigned int>(ACCESS_ROW(0) - ACCESS_ROW(4)) << CONST_BITS);
+
+        const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+        // Odd-indexed rows (7, 5, 3, 1).
+        const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
+
+        const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+        const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+        const int az1 = MULTIPLY(bz1, - FIX_0_899976223);
+        const int az2 = MULTIPLY(bz2, - FIX_2_562915447);
+        const int az3 = MULTIPLY(bz3, - FIX_1_961570560) + bz5;
+        const int az4 = MULTIPLY(bz4, - FIX_0_390180644) + bz5;
+
+        const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+        const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+        const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+        const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+        // Descale with rounding, add 128, saturate to 0..255 and store one sample per output row (rows are 8 bytes apart).
+        int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*0] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*7] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*1] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*6] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*2] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*5] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*3] = (uint8_t)CLAMP(i);
+
+        i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS+PASS1_BITS+3);
+        pDst_ptr[8*4] = (uint8_t)CLAMP(i);
+    }
+};
+
+
+// Column whose only possibly non-zero input is row 0: all eight output samples share one value.
+template <> struct Col<1>
+{
+    static void idct(uint8_t* pDst_ptr, const int* pTemp)
+    {
+        // Descale by PASS1_BITS+3 with rounding and add 128.
+        const int shifted = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS+3);
+
+        // Saturate to 0..255.
+        const uint8_t sample = (uint8_t)CLAMP(shifted);
+
+        // Same sample for every row of this column; rows are 8 bytes apart.
+        for (int row = 0; row < 8; ++row) {
+            pDst_ptr[row * 8] = sample;
+        }
+    }
+};
+
+
+static const uint8_t s_idct_row_table[] = { // 64 groups of 8 entries: idct() uses group (block_max_zag - 1); entry r selects the Row<N> variant run on row r
+    1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
+    4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
+    6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
+    6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
+    8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
+    8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
+    8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
+    8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
+};
+
+
+static const uint8_t s_idct_col_table[] = { 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; // indexed by block_max_zag - 1: the Col<N> variant idct() runs on every column
+
+
+// 8x8 inverse DCT: writes 64 saturated samples to pDst_ptr (rows 8 bytes apart). block_max_zag (asserted 1..64) indexes the tables that pick the row/column fast paths.
+void idct(const jpgd_block_t* pSrc_ptr, uint8_t* pDst_ptr, int block_max_zag)
+{
+    JPGD_ASSERT(block_max_zag >= 1);
+    JPGD_ASSERT(block_max_zag <= 64);
+
+    if (block_max_zag <= 1) {
+        // Only the first coefficient is used: all 64 outputs are the same sample.
+        int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
+        k = CLAMP(k);
+        // Store bytewise instead of through an int* cast, which type-punned the uint8_t buffer and assumed 4-byte ints and alignment.
+        const uint8_t dc = static_cast<uint8_t>(k);
+        for (int i = 0; i < 64; i++) {
+            pDst_ptr[i] = dc;
+        }
+        return;
+    }
+    // Row pass into temp: pRow_tab supplies, for each of the 8 rows, which Row<N> variant to run.
+    int temp[64];
+    const jpgd_block_t* pSrc = pSrc_ptr;
+    int* pTemp = temp;
+    const uint8_t* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
+    int i;
+    for (i = 8; i > 0; i--, pRow_tab++) {
+        switch (*pRow_tab) {
+            case 0: Row<0>::idct(pTemp, pSrc); break;
+            case 1: Row<1>::idct(pTemp, pSrc); break;
+            case 2: Row<2>::idct(pTemp, pSrc); break;
+            case 3: Row<3>::idct(pTemp, pSrc); break;
+            case 4: Row<4>::idct(pTemp, pSrc); break;
+            case 5: Row<5>::idct(pTemp, pSrc); break;
+            case 6: Row<6>::idct(pTemp, pSrc); break;
+            case 7: Row<7>::idct(pTemp, pSrc); break;
+            case 8: Row<8>::idct(pTemp, pSrc); break;
+        }
+        pSrc += 8;
+        pTemp += 8;
+    }
+
+    pTemp = temp;
+    // Column pass: all 8 columns use the same Col<N> variant, which reads only the first N temp rows.
+    const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
+    for (i = 8; i > 0; i--) {
+        switch (nonzero_rows) {
+            case 1: Col<1>::idct(pDst_ptr, pTemp); break;
+            case 2: Col<2>::idct(pDst_ptr, pTemp); break;
+            case 3: Col<3>::idct(pDst_ptr, pTemp); break;
+            case 4: Col<4>::idct(pDst_ptr, pTemp); break;
+            case 5: Col<5>::idct(pDst_ptr, pTemp); break;
+            case 6: Col<6>::idct(pDst_ptr, pTemp); break;
+            case 7: Col<7>::idct(pDst_ptr, pTemp); break;
+            case 8: Col<8>::idct(pDst_ptr, pTemp); break;
+        }
+        pTemp++;
+        pDst_ptr++;
+    }
+}
+
+
+// Reduced inverse DCT that uses only the top-left 4x4 coefficients of the 8x8 block at pSrc_ptr.
+// It still writes a full 8x8 block of samples to pDst_ptr (64 bytes, rows 8 bytes apart).
+// The source coefficients are not modified.
+void idct_4x4(const jpgd_block_t* pSrc_ptr, uint8_t* pDst_ptr)
+{
+    // Row-pass results. Only rows 0..3 are written below, and Col<4> reads only those rows.
+    int temp[64];
+
+    // Row pass: Row<4> reads the first four coefficients of a row and fills
+    // all eight entries of the matching temp row.
+    for (int row = 0; row < 4; ++row) {
+        Row<4>::idct(&temp[row * 8], &pSrc_ptr[row * 8]);
+    }
+
+    // Column pass: Col<4> reads four temp rows of a column and writes
+    // the eight samples of that output column.
+    for (int col = 0; col < 8; ++col) {
+        Col<4>::idct(&pDst_ptr[col], &temp[col]);
+    }
+}
+
+
+// Returns the next byte of the input stream, refilling the input buffer when it is empty.
+// Once no more data can be obtained, 0xFF and 0xD9 (an EOI marker) are returned alternately.
+inline uint32_t jpeg_decoder::get_char()
+{
+    // Ask for more data only when the buffer has run dry.
+    if (m_in_buf_left == 0) prep_in_buffer();
+
+    if (m_in_buf_left == 0) {
+        // Nothing could be fetched: synthesize padding. m_tem_flag toggles on every
+        // padded byte and selects which half of the 0xFF 0xD9 pair comes next.
+        const int pad_phase = m_tem_flag;
+        m_tem_flag ^= 1;
+        return pad_phase ? 0xD9 : 0xFF;
+    }
+
+    // Normal case: consume one byte from the buffer.
+    const uint32_t next_byte = *m_pIn_buf_ofs++;
+    m_in_buf_left--;
+    return next_byte;
+}
+
+
+// Same as get_char(), but also reports through *pPadding_flag whether the returned byte is synthesized padding.
+inline uint32_t jpeg_decoder::get_char(bool *pPadding_flag)
+{
+    if (m_in_buf_left == 0) prep_in_buffer();
+
+    // The byte is padding exactly when the buffer is still empty after the refill attempt.
+    const bool exhausted = (m_in_buf_left == 0);
+    *pPadding_flag = exhausted;
+
+    if (exhausted) {
+        // Alternate between 0xFF and 0xD9, driven by m_tem_flag.
+        const int pad_phase = m_tem_flag;
+        m_tem_flag ^= 1;
+        return pad_phase ? 0xD9 : 0xFF;
+    }
+
+    m_in_buf_left--;
+    return *m_pIn_buf_ofs++;
+}
+
+
+// Inserts a previously retrieved character back into the input buffer, so it is the next byte read. No bounds check is made on m_pIn_buf_ofs.
+inline void jpeg_decoder::stuff_char(uint8_t q)
+{
+    *(--m_pIn_buf_ofs) = q; // step the read pointer back one byte and overwrite that slot with q
+    m_in_buf_left++; // one more byte is now counted as available
+}
+
+
+// Retrieves one character from the input stream, but does not read past markers. Will continue to return 0xFF when a marker is encountered.
+inline uint8_t jpeg_decoder::get_octet()
+{
+    bool is_padding;
+    const int first = get_char(&is_padding);
+
+    // Anything other than 0xFF is plain data.
+    if (first != 0xFF) return static_cast<uint8_t>(first);
+
+    // A padded 0xFF (input exhausted) is returned without looking further.
+    if (is_padding) return 0xFF;
+
+    const int second = get_char(&is_padding);
+
+    // 0xFF 0x00 stands for a single 0xFF data byte; both bytes stay consumed.
+    if (!is_padding && second == 0x00) return 0xFF;
+
+    // Otherwise push back what was read (the second byte only if it was real input,
+    // then the 0xFF) so the same 0xFF is met again on the next call.
+    if (!is_padding) stuff_char(static_cast<uint8_t>(second));
+    stuff_char(0xFF);
+    return 0xFF;
+}
+
+
+// Retrieves a variable number of bits from the input stream. Does not recognize markers: refill bytes come straight from get_char().
+inline uint32_t jpeg_decoder::get_bits(int num_bits)
+{
+    if (!num_bits) return 0; // also keeps the shift below from being a shift by 32
+
+    uint32_t i = m_bit_buf >> (32 - num_bits); // result: the top num_bits bits of the bit buffer
+
+    if ((m_bits_left -= num_bits) <= 0) { // m_bits_left is now minus the shortfall: refill with two more bytes
+        m_bit_buf <<= (num_bits += m_bits_left); // first shift only by the part that was counted as available
+        uint32_t c1 = get_char();
+        uint32_t c2 = get_char();
+        m_bit_buf = (m_bit_buf & 0xFFFF0000) | (c1 << 8) | c2; // the new bytes replace the low 16 bits, c1 above c2
+        m_bit_buf <<= -m_bits_left; // then shift by the shortfall, so the total shift equals the requested count
+        m_bits_left += 16;
+        JPGD_ASSERT(m_bits_left >= 0);
+    }
+    else m_bit_buf <<= num_bits;
+
+    return i;
+}
+
+
+// Retrieves a variable number of bits from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
+inline uint32_t jpeg_decoder::get_bits_no_markers(int num_bits)
+{
+    if (!num_bits)return 0; // also keeps the shift below from being a shift by 32
+
+    uint32_t i = m_bit_buf >> (32 - num_bits); // result: the top num_bits bits of the bit buffer
+
+    if ((m_bits_left -= num_bits) <= 0) { // m_bits_left is now minus the shortfall: refill with two more bytes
+        m_bit_buf <<= (num_bits += m_bits_left); // first shift only by the part that was counted as available
+        if ((m_in_buf_left < 2) || (m_pIn_buf_ofs[0] == 0xFF) || (m_pIn_buf_ofs[1] == 0xFF)) { // slow path: buffer nearly empty or a 0xFF is next
+            uint32_t c1 = get_octet();
+            uint32_t c2 = get_octet();
+            m_bit_buf |= (c1 << 8) | c2; // NOTE(review): unlike get_bits(), the low 16 bits are not cleared first; presumably already zero -- confirm
+        } else {
+            m_bit_buf |= ((uint32_t)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1]; // fast path: take two plain bytes directly from the buffer
+            m_in_buf_left -= 2;
+            m_pIn_buf_ofs += 2;
+        }
+        m_bit_buf <<= -m_bits_left; // then shift by the shortfall, so the total shift equals the requested count
+        m_bits_left += 16;
+        JPGD_ASSERT(m_bits_left >= 0);
+    } else m_bit_buf <<= num_bits;
+
+    return i;
+}
+
+
+// Decodes a Huffman encoded symbol using the tables in pH and consumes its code bits from the bit buffer.
+inline int jpeg_decoder::huff_decode(huff_tables *pH)
+{
+    int symbol;
+
+    // Check first 8-bits: do we have a complete symbol?
+    if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0) { // a negative entry means the top 8 bits did not resolve to a symbol
+        // Decode more bits, use a tree traversal to find symbol.
+        int ofs = 23; // bit position in m_bit_buf just below the 8 bits already used
+        do {
+            symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))]; // the next bit picks one of two adjacent tree slots
+            ofs--;
+        } while (symbol < 0);
+        get_bits_no_markers(8 + (23 - ofs)); // consume the 8 lookup bits plus one bit per tree step
+    } else get_bits_no_markers(pH->code_size[symbol]); // direct hit: consume exactly the code length
+
+  return symbol;
+}
+
+
+// Decodes a Huffman encoded symbol and also fetches the extra bits that follow it (count = low 4 bits of the symbol), returned through extra_bits.
+inline int jpeg_decoder::huff_decode(huff_tables *pH, int& extra_bits)
+{
+    int symbol;
+
+    // Check first 8-bits: do we have a complete symbol?
+    if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0) {
+        // Use a tree traversal to find symbol.
+        int ofs = 23; // bit position in m_bit_buf just below the 8 bits already used
+        do {
+            symbol = pH->tree[-(int)(symbol + ((m_bit_buf >> ofs) & 1))]; // the next bit picks one of two adjacent tree slots
+            ofs--;
+        } while (symbol < 0);
+
+        get_bits_no_markers(8 + (23 - ofs)); // consume the 8 lookup bits plus one bit per tree step
+        extra_bits = get_bits_no_markers(symbol & 0xF);
+    } else {
+        JPGD_ASSERT(((symbol >> 8) & 31) == pH->code_size[symbol & 255] + ((symbol & 0x8000) ? (symbol & 15) : 0));
+
+        if (symbol & 0x8000) { // flag set: the entry already carries the extra-bits value in its upper 16 bits
+            get_bits_no_markers((symbol >> 8) & 31); // bits 8..12 hold the number of bits to consume (code plus extra bits, per the assert)
+            extra_bits = symbol >> 16;
+        } else  {
+            int code_size = (symbol >> 8) & 31; // flag clear: bits 8..12 hold the code length only
+            int num_extra_bits = symbol & 0xF;
+            int bits = code_size + num_extra_bits;
+            if (bits <= (m_bits_left + 16)) extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1); // single read, then mask off the code bits
+            else {
+                get_bits_no_markers(code_size); // otherwise read the code and the extra bits separately
+                extra_bits = get_bits_no_markers(num_extra_bits);
+            }
+        }
+        symbol &= 0xFF; // strip the packed fields, leaving the symbol value
+    }
+    return symbol;
+}
+
+
+// Tables and macro used to fully decode the DPCM differences: an s-bit value x below 2^(s-1) stands for the negative number x - 2^s + 1.
+static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+static const int s_extend_offset[16] = { 0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767 }; // -(2^s) + 1, written out because left-shifting -1 is undefined before C++20
+static const int s_extend_mask[] = { 0, (1<<0), (1<<1), (1<<2), (1<<3), (1<<4), (1<<5), (1<<6), (1<<7), (1<<8), (1<<9), (1<<10), (1<<11), (1<<12), (1<<13), (1<<14), (1<<15), (1<<16) };
+// The logical AND's in this macro are to shut up static code analysis (aren't really necessary - couldn't find another way to do this)
+#define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[(s) & 15]) ? ((x) + s_extend_offset[(s) & 15]) : (x))
+
+
+// Saturates i to the range 0..255: negative inputs give 0, inputs above 255 give 255.
+inline uint8_t jpeg_decoder::clamp(int i)
+{
+    if (i < 0) return 0;
+    return static_cast<uint8_t>(i > 255 ? 255 : i);
+}
+
+
+namespace DCT_Upsample
+{
+    // 4x4 matrix of ints with element-wise arithmetic, used by the P_Q / R_S calculations.
+    struct Matrix44
+    {
+        typedef int Element_Type;
+        enum { NUM_ROWS = 4, NUM_COLS = 4 };
+
+        // Elements, indexed v[row][col].
+        Element_Type v[NUM_ROWS][NUM_COLS];
+
+        // Dimensions (always 4 and 4).
+        inline int rows() const { return NUM_ROWS; }
+        inline int cols() const { return NUM_COLS; }
+
+        // Unchecked element access.
+        inline const Element_Type & at(int r, int c) const { return v[r][c]; }
+        inline       Element_Type & at(int r, int c)       { return v[r][c]; }
+
+        // Leaves the elements uninitialized.
+        inline Matrix44() {}
+
+        // Element-wise in-place addition.
+        inline Matrix44& operator += (const Matrix44& a)
+        {
+            for (int r = 0; r < NUM_ROWS; r++) {
+                for (int c = 0; c < NUM_COLS; c++) v[r][c] += a.v[r][c];
+            }
+            return *this;
+        }
+
+        // Element-wise in-place subtraction.
+        inline Matrix44& operator -= (const Matrix44& a)
+        {
+            for (int r = 0; r < NUM_ROWS; r++) {
+                for (int c = 0; c < NUM_COLS; c++) v[r][c] -= a.v[r][c];
+            }
+            return *this;
+        }
+
+        // Element-wise sum of two matrices.
+        friend inline Matrix44 operator + (const Matrix44& a, const Matrix44& b)
+        {
+            Matrix44 sum;
+            for (int r = 0; r < NUM_ROWS; r++) {
+                for (int c = 0; c < NUM_COLS; c++) sum.v[r][c] = a.v[r][c] + b.v[r][c];
+            }
+            return sum;
+        }
+
+        // Element-wise difference of two matrices.
+        friend inline Matrix44 operator - (const Matrix44& a, const Matrix44& b)
+        {
+            Matrix44 diff;
+            for (int r = 0; r < NUM_ROWS; r++) {
+                for (int c = 0; c < NUM_COLS; c++) diff.v[r][c] = a.v[r][c] - b.v[r][c];
+            }
+            return diff;
+        }
+
+        // Stores a + b into an 8-wide block at pDst, transposed:
+        // element (r, c) is written to pDst[c*8 + r].
+        static inline void add_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
+        {
+            for (int r = 0; r < 4; r++) {
+                for (int c = 0; c < 4; c++) {
+                    pDst[c*8 + r] = static_cast<jpgd_block_t>(a.at(r, c) + b.at(r, c));
+                }
+            }
+        }
+
+        // Stores a - b into an 8-wide block at pDst, transposed:
+        // element (r, c) is written to pDst[c*8 + r].
+        static inline void sub_and_store(jpgd_block_t* pDst, const Matrix44& a, const Matrix44& b)
+        {
+            for (int r = 0; r < 4; r++) {
+                for (int c = 0; c < 4; c++) {
+                    pDst[c*8 + r] = static_cast<jpgd_block_t>(a.at(r, c) - b.at(r, c));
+                }
+            }
+        }
+    };
+
+    const int FRACT_BITS = 10; // number of fractional bits in the F() fixed-point constants
+    const int SCALE = 1 << FRACT_BITS; // fixed-point representation of 1.0
+
+    typedef int Temp_Type;
+    #define D(i) (((i) + (SCALE >> 1)) >> FRACT_BITS) // descale: divide by SCALE, adding half first for rounding
+    #define F(i) ((int)((i) * SCALE + .5f)) // float constant to fixed point; adding .5f and truncating is not round-to-nearest for negative constants
+
+    // Any decent C++ compiler will optimize this at compile time to a 0, or an array access.
+    #define AT(c, r) ((((c)>=NUM_COLS)||((r)>=NUM_ROWS)) ? 0 : pSrc[(c)+(r)*8])
+
+    // Computes the 4x4 matrices P and Q from the 8x8 block pSrc. NUM_ROWS/NUM_COLS = # of non-zero rows/cols in input matrix; AT() reads anything beyond them as 0.
+    template<int NUM_ROWS, int NUM_COLS>
+    struct P_Q
+    {
+        static void calc(Matrix44& P, Matrix44& Q, const jpgd_block_t* pSrc)
+        {
+            // 4x8 = 4x8 times 8x8, matrix 0 is constant
+            const Temp_Type X000 = AT(0, 0);
+            const Temp_Type X001 = AT(0, 1);
+            const Temp_Type X002 = AT(0, 2);
+            const Temp_Type X003 = AT(0, 3);
+            const Temp_Type X004 = AT(0, 4);
+            const Temp_Type X005 = AT(0, 5);
+            const Temp_Type X006 = AT(0, 6);
+            const Temp_Type X007 = AT(0, 7);
+            const Temp_Type X010 = D(F(0.415735f) * AT(1, 0) + F(0.791065f) * AT(3, 0) + F(-0.352443f) * AT(5, 0) + F(0.277785f) * AT(7, 0));
+            const Temp_Type X011 = D(F(0.415735f) * AT(1, 1) + F(0.791065f) * AT(3, 1) + F(-0.352443f) * AT(5, 1) + F(0.277785f) * AT(7, 1));
+            const Temp_Type X012 = D(F(0.415735f) * AT(1, 2) + F(0.791065f) * AT(3, 2) + F(-0.352443f) * AT(5, 2) + F(0.277785f) * AT(7, 2));
+            const Temp_Type X013 = D(F(0.415735f) * AT(1, 3) + F(0.791065f) * AT(3, 3) + F(-0.352443f) * AT(5, 3) + F(0.277785f) * AT(7, 3));
+            const Temp_Type X014 = D(F(0.415735f) * AT(1, 4) + F(0.791065f) * AT(3, 4) + F(-0.352443f) * AT(5, 4) + F(0.277785f) * AT(7, 4));
+            const Temp_Type X015 = D(F(0.415735f) * AT(1, 5) + F(0.791065f) * AT(3, 5) + F(-0.352443f) * AT(5, 5) + F(0.277785f) * AT(7, 5));
+            const Temp_Type X016 = D(F(0.415735f) * AT(1, 6) + F(0.791065f) * AT(3, 6) + F(-0.352443f) * AT(5, 6) + F(0.277785f) * AT(7, 6));
+            const Temp_Type X017 = D(F(0.415735f) * AT(1, 7) + F(0.791065f) * AT(3, 7) + F(-0.352443f) * AT(5, 7) + F(0.277785f) * AT(7, 7));
+            const Temp_Type X020 = AT(4, 0);
+            const Temp_Type X021 = AT(4, 1);
+            const Temp_Type X022 = AT(4, 2);
+            const Temp_Type X023 = AT(4, 3);
+            const Temp_Type X024 = AT(4, 4);
+            const Temp_Type X025 = AT(4, 5);
+            const Temp_Type X026 = AT(4, 6);
+            const Temp_Type X027 = AT(4, 7);
+            const Temp_Type X030 = D(F(0.022887f) * AT(1, 0) + F(-0.097545f) * AT(3, 0) + F(0.490393f) * AT(5, 0) + F(0.865723f) * AT(7, 0));
+            const Temp_Type X031 = D(F(0.022887f) * AT(1, 1) + F(-0.097545f) * AT(3, 1) + F(0.490393f) * AT(5, 1) + F(0.865723f) * AT(7, 1));
+            const Temp_Type X032 = D(F(0.022887f) * AT(1, 2) + F(-0.097545f) * AT(3, 2) + F(0.490393f) * AT(5, 2) + F(0.865723f) * AT(7, 2));
+            const Temp_Type X033 = D(F(0.022887f) * AT(1, 3) + F(-0.097545f) * AT(3, 3) + F(0.490393f) * AT(5, 3) + F(0.865723f) * AT(7, 3));
+            const Temp_Type X034 = D(F(0.022887f) * AT(1, 4) + F(-0.097545f) * AT(3, 4) + F(0.490393f) * AT(5, 4) + F(0.865723f) * AT(7, 4));
+            const Temp_Type X035 = D(F(0.022887f) * AT(1, 5) + F(-0.097545f) * AT(3, 5) + F(0.490393f) * AT(5, 5) + F(0.865723f) * AT(7, 5));
+            const Temp_Type X036 = D(F(0.022887f) * AT(1, 6) + F(-0.097545f) * AT(3, 6) + F(0.490393f) * AT(5, 6) + F(0.865723f) * AT(7, 6));
+            const Temp_Type X037 = D(F(0.022887f) * AT(1, 7) + F(-0.097545f) * AT(3, 7) + F(0.490393f) * AT(5, 7) + F(0.865723f) * AT(7, 7));
+
+            // 4x4 = 4x8 times 8x4, matrix 1 is constant
+            P.at(0, 0) = X000;
+            P.at(0, 1) = D(X001 * F(0.415735f) + X003 * F(0.791065f) + X005 * F(-0.352443f) + X007 * F(0.277785f));
+            P.at(0, 2) = X004;
+            P.at(0, 3) = D(X001 * F(0.022887f) + X003 * F(-0.097545f) + X005 * F(0.490393f) + X007 * F(0.865723f));
+            P.at(1, 0) = X010;
+            P.at(1, 1) = D(X011 * F(0.415735f) + X013 * F(0.791065f) + X015 * F(-0.352443f) + X017 * F(0.277785f));
+            P.at(1, 2) = X014;
+            P.at(1, 3) = D(X011 * F(0.022887f) + X013 * F(-0.097545f) + X015 * F(0.490393f) + X017 * F(0.865723f));
+            P.at(2, 0) = X020;
+            P.at(2, 1) = D(X021 * F(0.415735f) + X023 * F(0.791065f) + X025 * F(-0.352443f) + X027 * F(0.277785f));
+            P.at(2, 2) = X024;
+            P.at(2, 3) = D(X021 * F(0.022887f) + X023 * F(-0.097545f) + X025 * F(0.490393f) + X027 * F(0.865723f));
+            P.at(3, 0) = X030;
+            P.at(3, 1) = D(X031 * F(0.415735f) + X033 * F(0.791065f) + X035 * F(-0.352443f) + X037 * F(0.277785f));
+            P.at(3, 2) = X034;
+            P.at(3, 3) = D(X031 * F(0.022887f) + X033 * F(-0.097545f) + X035 * F(0.490393f) + X037 * F(0.865723f));
+            // 40 muls 24 adds
+
+            // 4x4 = 4x8 times 8x4, matrix 1 is constant
+            Q.at(0, 0) = D(X001 * F(0.906127f) + X003 * F(-0.318190f) + X005 * F(0.212608f) + X007 * F(-0.180240f));
+            Q.at(0, 1) = X002;
+            Q.at(0, 2) = D(X001 * F(-0.074658f) + X003 * F(0.513280f) + X005 * F(0.768178f) + X007 * F(-0.375330f));
+            Q.at(0, 3) = X006;
+            Q.at(1, 0) = D(X011 * F(0.906127f) + X013 * F(-0.318190f) + X015 * F(0.212608f) + X017 * F(-0.180240f));
+            Q.at(1, 1) = X012;
+            Q.at(1, 2) = D(X011 * F(-0.074658f) + X013 * F(0.513280f) + X015 * F(0.768178f) + X017 * F(-0.375330f));
+            Q.at(1, 3) = X016;
+            Q.at(2, 0) = D(X021 * F(0.906127f) + X023 * F(-0.318190f) + X025 * F(0.212608f) + X027 * F(-0.180240f));
+            Q.at(2, 1) = X022;
+            Q.at(2, 2) = D(X021 * F(-0.074658f) + X023 * F(0.513280f) + X025 * F(0.768178f) + X027 * F(-0.375330f));
+            Q.at(2, 3) = X026;
+            Q.at(3, 0) = D(X031 * F(0.906127f) + X033 * F(-0.318190f) + X035 * F(0.212608f) + X037 * F(-0.180240f));
+            Q.at(3, 1) = X032;
+            Q.at(3, 2) = D(X031 * F(-0.074658f) + X033 * F(0.513280f) + X035 * F(0.768178f) + X037 * F(-0.375330f));
+            Q.at(3, 3) = X036;
+            // 40 muls 24 adds
+        }
+    };
+
+
+    template<int NUM_ROWS, int NUM_COLS>
+    struct R_S
+    {
+        static void calc(Matrix44& R, Matrix44& S, const jpgd_block_t* pSrc)
+        {
+            // Computes the 4x4 matrices R and S from the 8x8 block pSrc; AT() reads anything beyond NUM_ROWS/NUM_COLS as 0. 4x8 = 4x8 times 8x8, matrix 0 is constant
+            const Temp_Type X100 = D(F(0.906127f) * AT(1, 0) + F(-0.318190f) * AT(3, 0) + F(0.212608f) * AT(5, 0) + F(-0.180240f) * AT(7, 0));
+            const Temp_Type X101 = D(F(0.906127f) * AT(1, 1) + F(-0.318190f) * AT(3, 1) + F(0.212608f) * AT(5, 1) + F(-0.180240f) * AT(7, 1));
+            const Temp_Type X102 = D(F(0.906127f) * AT(1, 2) + F(-0.318190f) * AT(3, 2) + F(0.212608f) * AT(5, 2) + F(-0.180240f) * AT(7, 2));
+            const Temp_Type X103 = D(F(0.906127f) * AT(1, 3) + F(-0.318190f) * AT(3, 3) + F(0.212608f) * AT(5, 3) + F(-0.180240f) * AT(7, 3));
+            const Temp_Type X104 = D(F(0.906127f) * AT(1, 4) + F(-0.318190f) * AT(3, 4) + F(0.212608f) * AT(5, 4) + F(-0.180240f) * AT(7, 4));
+            const Temp_Type X105 = D(F(0.906127f) * AT(1, 5) + F(-0.318190f) * AT(3, 5) + F(0.212608f) * AT(5, 5) + F(-0.180240f) * AT(7, 5));
+            const Temp_Type X106 = D(F(0.906127f) * AT(1, 6) + F(-0.318190f) * AT(3, 6) + F(0.212608f) * AT(5, 6) + F(-0.180240f) * AT(7, 6));
+            const Temp_Type X107 = D(F(0.906127f) * AT(1, 7) + F(-0.318190f) * AT(3, 7) + F(0.212608f) * AT(5, 7) + F(-0.180240f) * AT(7, 7));
+            const Temp_Type X110 = AT(2, 0);
+            const Temp_Type X111 = AT(2, 1);
+            const Temp_Type X112 = AT(2, 2);
+            const Temp_Type X113 = AT(2, 3);
+            const Temp_Type X114 = AT(2, 4);
+            const Temp_Type X115 = AT(2, 5);
+            const Temp_Type X116 = AT(2, 6);
+            const Temp_Type X117 = AT(2, 7);
+            const Temp_Type X120 = D(F(-0.074658f) * AT(1, 0) + F(0.513280f) * AT(3, 0) + F(0.768178f) * AT(5, 0) + F(-0.375330f) * AT(7, 0));
+            const Temp_Type X121 = D(F(-0.074658f) * AT(1, 1) + F(0.513280f) * AT(3, 1) + F(0.768178f) * AT(5, 1) + F(-0.375330f) * AT(7, 1));
+            const Temp_Type X122 = D(F(-0.074658f) * AT(1, 2) + F(0.513280f) * AT(3, 2) + F(0.768178f) * AT(5, 2) + F(-0.375330f) * AT(7, 2));
+            const Temp_Type X123 = D(F(-0.074658f) * AT(1, 3) + F(0.513280f) * AT(3, 3) + F(0.768178f) * AT(5, 3) + F(-0.375330f) * AT(7, 3));
+            const Temp_Type X124 = D(F(-0.074658f) * AT(1, 4) + F(0.513280f) * AT(3, 4) + F(0.768178f) * AT(5, 4) + F(-0.375330f) * AT(7, 4));
+            const Temp_Type X125 = D(F(-0.074658f) * AT(1, 5) + F(0.513280f) * AT(3, 5) + F(0.768178f) * AT(5, 5) + F(-0.375330f) * AT(7, 5));
+            const Temp_Type X126 = D(F(-0.074658f) * AT(1, 6) + F(0.513280f) * AT(3, 6) + F(0.768178f) * AT(5, 6) + F(-0.375330f) * AT(7, 6));
+            const Temp_Type X127 = D(F(-0.074658f) * AT(1, 7) + F(0.513280f) * AT(3, 7) + F(0.768178f) * AT(5, 7) + F(-0.375330f) * AT(7, 7));
+            const Temp_Type X130 = AT(6, 0);
+            const Temp_Type X131 = AT(6, 1);
+            const Temp_Type X132 = AT(6, 2);
+            const Temp_Type X133 = AT(6, 3);
+            const Temp_Type X134 = AT(6, 4);
+            const Temp_Type X135 = AT(6, 5);
+            const Temp_Type X136 = AT(6, 6);
+            const Temp_Type X137 = AT(6, 7);
+            // 80 muls 48 adds
+
+            // 4x4 = 4x8 times 8x4, matrix 1 is constant
+            R.at(0, 0) = X100;
+            R.at(0, 1) = D(X101 * F(0.415735f) + X103 * F(0.791065f) + X105 * F(-0.352443f) + X107 * F(0.277785f));
+            R.at(0, 2) = X104;
+            R.at(0, 3) = D(X101 * F(0.022887f) + X103 * F(-0.097545f) + X105 * F(0.490393f) + X107 * F(0.865723f));
+            R.at(1, 0) = X110;
+            R.at(1, 1) = D(X111 * F(0.415735f) + X113 * F(0.791065f) + X115 * F(-0.352443f) + X117 * F(0.277785f));
+            R.at(1, 2) = X114;
+            R.at(1, 3) = D(X111 * F(0.022887f) + X113 * F(-0.097545f) + X115 * F(0.490393f) + X117 * F(0.865723f));
+            R.at(2, 0) = X120;
+            R.at(2, 1) = D(X121 * F(0.415735f) + X123 * F(0.791065f) + X125 * F(-0.352443f) + X127 * F(0.277785f));
+            R.at(2, 2) = X124;
+            R.at(2, 3) = D(X121 * F(0.022887f) + X123 * F(-0.097545f) + X125 * F(0.490393f) + X127 * F(0.865723f));
+            R.at(3, 0) = X130;
+            R.at(3, 1) = D(X131 * F(0.415735f) + X133 * F(0.791065f) + X135 * F(-0.352443f) + X137 * F(0.277785f));
+            R.at(3, 2) = X134;
+            R.at(3, 3) = D(X131 * F(0.022887f) + X133 * F(-0.097545f) + X135 * F(0.490393f) + X137 * F(0.865723f));
+            // 40 muls 24 adds
+            // 4x4 = 4x8 times 8x4, matrix 1 is constant
+            S.at(0, 0) = D(X101 * F(0.906127f) + X103 * F(-0.318190f) + X105 * F(0.212608f) + X107 * F(-0.180240f));
+            S.at(0, 1) = X102;
+            S.at(0, 2) = D(X101 * F(-0.074658f) + X103 * F(0.513280f) + X105 * F(0.768178f) + X107 * F(-0.375330f));
+            S.at(0, 3) = X106;
+            S.at(1, 0) = D(X111 * F(0.906127f) + X113 * F(-0.318190f) + X115 * F(0.212608f) + X117 * F(-0.180240f));
+            S.at(1, 1) = X112;
+            S.at(1, 2) = D(X111 * F(-0.074658f) + X113 * F(0.513280f) + X115 * F(0.768178f) + X117 * F(-0.375330f));
+            S.at(1, 3) = X116;
+            S.at(2, 0) = D(X121 * F(0.906127f) + X123 * F(-0.318190f) + X125 * F(0.212608f) + X127 * F(-0.180240f));
+            S.at(2, 1) = X122;
+            S.at(2, 2) = D(X121 * F(-0.074658f) + X123 * F(0.513280f) + X125 * F(0.768178f) + X127 * F(-0.375330f));
+            S.at(2, 3) = X126;
+            S.at(3, 0) = D(X131 * F(0.906127f) + X133 * F(-0.318190f) + X135 * F(0.212608f) + X137 * F(-0.180240f));
+            S.at(3, 1) = X132;
+            S.at(3, 2) = D(X131 * F(-0.074658f) + X133 * F(0.513280f) + X135 * F(0.768178f) + X137 * F(-0.375330f));
+            S.at(3, 3) = X136;
+            // 40 muls 24 adds
+        }
+    };
+} // end namespace DCT_Upsample
+
+
+// Unconditionally frees every block on the m_pMem_blocks list and clears the stream pointer.
+void jpeg_decoder::free_all_blocks()
+{
+    m_pStream = nullptr;
+    for (mem_block *b = m_pMem_blocks; b; ) {
+        mem_block *n = b->m_pNext;   // save the link before the node is released
+        free(b);
+        b = n;
+    }
+    m_pMem_blocks = nullptr;   // leave an empty list so a repeated call frees nothing
+}
+
+
+// Handles all errors: stores status in m_error_code, frees every decoder block, then longjmp()s
+// to m_jmp_state with status as the jump value. It never returns. It could easily be changed to use C++ exceptions.
+JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
+{
+    m_error_code = status;
+    free_all_blocks();
+    longjmp(m_jmp_state, status);
+}
+
+
+void *jpeg_decoder::alloc(size_t nSize, bool zero)
+{
+    // Bump-allocates nSize bytes (zero-filled when zero is true) from the decoder's block list; blocks are released together by free_all_blocks().
+    nSize = (JPGD_MAX(nSize, 1) + 3) & ~3;   // at least one byte, rounded up to a multiple of 4
+    char *rv = nullptr;
+    for (mem_block *b = m_pMem_blocks; b; b = b->m_pNext) {   // first block with enough free space wins
+        if ((b->m_used_count + nSize) <= b->m_size) {
+            rv = b->m_data + b->m_used_count;
+            b->m_used_count += nSize;
+            break;
+        }
+    }
+    if (!rv) {
+        size_t capacity = JPGD_MAX(32768 - 256, (nSize + 2047) & ~2047);   // size_t, not int: a large request must not be narrowed before malloc()
+        mem_block *b = (mem_block*)malloc(sizeof(mem_block) + capacity);
+        if (!b) stop_decoding(JPGD_NOTENOUGHMEM);   // does not return, so callers never see a null pointer
+        b->m_pNext = m_pMem_blocks; m_pMem_blocks = b;   // push the new block on the front of the list
+        b->m_used_count = nSize; b->m_size = capacity;
+        rv = b->m_data;
+    }
+    if (zero) memset(rv, 0, nSize);
+    return rv;
+}
+
+
+void jpeg_decoder::word_clear(void *p, uint16_t c, uint32_t n)
+{
+    // Stores the 16-bit pattern c at p, low byte first, n times in a row (2 * n bytes in total).
+    uint8_t *dst = static_cast<uint8_t*>(p);
+    const uint8_t lo = c & 0xFF, hi = (c >> 8) & 0xFF;
+    for ( ; n; n--, dst += 2) {
+        dst[0] = lo; dst[1] = hi;
+    }
+}
+
+
+// Refill the input buffer.
+// This method will sit in a loop until (A) the buffer is full or (B)
+// the stream's read() method reports an end of file condition.
+void jpeg_decoder::prep_in_buffer()
+{
+    m_in_buf_left = 0;
+    m_pIn_buf_ofs = m_in_buf;
+
+    if (m_eof_flag) return;   // stream already exhausted: the buffer is left empty
+
+    do {
+        int bytes_read = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
+        if (bytes_read == -1) stop_decoding(JPGD_STREAM_READ);   // -1 is the stream's error return
+        m_in_buf_left += bytes_read;
+    } while ((m_in_buf_left < JPGD_IN_BUF_SIZE) && (!m_eof_flag));
+
+    m_total_bytes_read += m_in_buf_left;
+
+    // Pad the end of the block with M_EOI (prevents the decompressor from going off the rails if the stream is invalid).
+    // (This dates way back to when this decompressor was written in C/asm, and the all-asm Huffman decoder did some fancy things to increase perf.)
+    word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);   // 64 x {0xFF, 0xD9} = 128 bytes written right after the valid data
+}
+
+
+// Read a Huffman code table (DHT) segment; one segment may define several tables.
+void jpeg_decoder::read_dht_marker()
+{
+    int i, index, count;
+    uint8_t huff_num[17];    // huff_num[1..16]: number of codes of each bit length; [0] is unused and set to 0
+    uint8_t huff_val[256];   // symbol values in code order; only the first count entries are read from the stream
+    uint32_t num_left = get_bits(16);
+
+    if (num_left < 2) stop_decoding(JPGD_BAD_DHT_MARKER);
+    num_left -= 2;   // the length field counts its own two bytes
+
+    while (num_left) {
+        index = get_bits(8);   // bit 4 is the table class, the low nibble the table id
+        huff_num[0] = 0;
+        count = 0;
+
+        for (i = 1; i <= 16; i++) {
+            huff_num[i] = static_cast<uint8_t>(get_bits(8));
+            count += huff_num[i];
+        }
+
+        if (count > 255) stop_decoding(JPGD_BAD_DHT_COUNTS);   // keeps the read below inside huff_val[256]
+
+        for (i = 0; i < count; i++)
+            huff_val[i] = static_cast<uint8_t>(get_bits(8));
+
+        i = 1 + 16 + count;   // bytes consumed by this table: class/id byte, 16 counts, count symbols
+
+        if (num_left < (uint32_t)i) stop_decoding(JPGD_BAD_DHT_MARKER);
+        num_left -= i;
+
+        if ((index & 0x10) > 0x10) stop_decoding(JPGD_BAD_DHT_INDEX);   // NOTE(review): (index & 0x10) is 0 or 0x10, so this test can never fire
+        index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);   // class 1 tables go in the upper half of the table arrays
+        if (index >= JPGD_MAX_HUFF_TABLES) stop_decoding(JPGD_BAD_DHT_INDEX);
+
+        if (!m_huff_num[index]) m_huff_num[index] = (uint8_t *)alloc(17);    // allocated once, reused if the table is redefined
+        if (!m_huff_val[index]) m_huff_val[index] = (uint8_t *)alloc(256);
+
+        m_huff_ac[index] = (index & 0x10) != 0;   // NOTE(review): index was remapped above; if JPGD_MAX_HUFF_TABLES <= 16 this is always false - confirm against users of m_huff_ac
+        memcpy(m_huff_num[index], huff_num, 17);
+        memcpy(m_huff_val[index], huff_val, 256);   // copies the whole array, including entries past count that were not read above
+    }
+}
+
+
+// Read a quantization table (DQT) segment; one segment may define several tables.
+void jpeg_decoder::read_dqt_marker()
+{
+    int n, i, prec;
+    uint32_t temp;
+    uint32_t num_left = get_bits(16);
+    if (num_left < 2) stop_decoding(JPGD_BAD_DQT_MARKER);
+    num_left -= 2;   // the length field counts its own two bytes
+
+    while (num_left) {
+        n = get_bits(8);
+        prec = n >> 4;   // high nibble: non-zero means each entry is two bytes wide
+        n &= 0x0F;       // low nibble: destination table number
+
+        if (n >= JPGD_MAX_QUANT_TABLES) stop_decoding(JPGD_BAD_DQT_TABLE);
+
+        if (!m_quant[n]) m_quant[n] = (jpgd_quant_t *)alloc(64 * sizeof(jpgd_quant_t));   // allocated once, reused if the table is redefined
+
+        // read quantization entries, in zag order
+        for (i = 0; i < 64; i++) {
+            temp = get_bits(8);
+            if (prec) temp = (temp << 8) + get_bits(8);   // the first byte read is the high byte
+            m_quant[n][i] = static_cast<jpgd_quant_t>(temp);
+        }
+        i = 64 + 1;          // bytes consumed: precision/id byte plus 64 one-byte entries...
+        if (prec) i += 64;   // ...or 64 two-byte entries
+        if (num_left < (uint32_t)i) stop_decoding(JPGD_BAD_DQT_LENGTH);
+        num_left -= i;
+    }
+}
+
+
+// Read the start of frame (SOF) marker: sample precision, image size and the per-component parameters.
+void jpeg_decoder::read_sof_marker()
+{
+    int i;
+    uint32_t num_left = get_bits(16);   // segment length, validated once the component count is known
+    if (get_bits(8) != 8) stop_decoding(JPGD_BAD_PRECISION);   /* precision: sorry, only 8-bit precision is supported right now */
+
+    m_image_y_size = get_bits(16);
+    if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT)) stop_decoding(JPGD_BAD_HEIGHT);
+
+    m_image_x_size = get_bits(16);
+    if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH)) stop_decoding(JPGD_BAD_WIDTH);
+
+    m_comps_in_frame = get_bits(8);
+    if (m_comps_in_frame > JPGD_MAX_COMPONENTS) stop_decoding(JPGD_TOO_MANY_COMPONENTS);   // bounds the m_comp_* writes below
+
+    if (num_left != (uint32_t)(m_comps_in_frame * 3 + 8)) stop_decoding(JPGD_BAD_SOF_LENGTH);   // 8 header bytes plus 3 per component
+
+    for (i = 0; i < m_comps_in_frame; i++) {
+        m_comp_ident[i]  = get_bits(8);
+        m_comp_h_samp[i] = get_bits(4);
+        m_comp_v_samp[i] = get_bits(4);
+        m_comp_quant[i]  = get_bits(8);
+        if (m_comp_quant[i] >= JPGD_MAX_QUANT_TABLES) stop_decoding(JPGD_DECODE_ERROR);   // the selector later indexes m_quant[]; reject values the DQT reader could never have filled
+    }
+}
+
+
+// Skips over a variable-length marker segment that the decoder does not interpret.
+void jpeg_decoder::skip_variable_marker()
+{
+    // The 16-bit length counts its own two bytes, so anything below 2 is malformed.
+    const uint32_t length = get_bits(16);
+    if (length < 2) stop_decoding(JPGD_BAD_VARIABLE_MARKER);
+
+    // Consume the payload one byte at a time.
+    for (uint32_t remaining = length - 2; remaining != 0; remaining--) {
+        get_bits(8);
+    }
+}
+
+
+// Read a define restart interval (DRI) marker and store the interval in m_restart_interval.
+void jpeg_decoder::read_dri_marker()
+{
+    if (get_bits(16) != 4) stop_decoding(JPGD_BAD_DRI_LENGTH);   // length field must be 4: its own two bytes plus the 16-bit interval
+    m_restart_interval = get_bits(16);
+}
+
+
+// Read a start of scan (SOS) marker: the scan's component list, Huffman table selectors and spectral/successive parameters.
+void jpeg_decoder::read_sos_marker()
+{
+    int i, ci, c, cc;
+    uint32_t num_left = get_bits(16);
+    int n = get_bits(8);   // number of components in this scan
+
+    m_comps_in_scan = n;
+    num_left -= 3;   // length field (2 bytes) plus the component count byte
+
+    if ( (num_left != (uint32_t)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN) ) stop_decoding(JPGD_BAD_SOS_LENGTH);   // 2 bytes per component plus 3 trailing bytes
+
+    for (i = 0; i < n; i++) {
+        cc = get_bits(8);   // component identifier
+        c = get_bits(8);    // high nibble: DC table selector, low nibble: AC table selector
+        num_left -= 2;
+
+        for (ci = 0; ci < m_comps_in_frame; ci++)   // map the identifier to its index in the frame header
+          if (cc == m_comp_ident[ci]) break;
+
+        if (ci >= m_comps_in_frame) stop_decoding(JPGD_BAD_SOS_COMP_ID);
+        m_comp_list[i]    = ci;
+        m_comp_dc_tab[ci] = (c >> 4) & 15;
+        m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);   // AC tables live in the upper half of the table arrays
+        if ((m_comp_dc_tab[ci] >= JPGD_MAX_HUFF_TABLES) || (m_comp_ac_tab[ci] >= JPGD_MAX_HUFF_TABLES)) stop_decoding(JPGD_DECODE_ERROR);   // selectors later index the Huffman table arrays; reject out-of-range values from the stream
+    }
+    m_spectral_start  = get_bits(8);
+    m_spectral_end    = get_bits(8);
+    m_successive_high = get_bits(4);
+    m_successive_low  = get_bits(4);
+
+    if (!m_progressive_flag) {   // non-progressive scans always cover all 64 coefficients
+        m_spectral_start = 0;
+        m_spectral_end = 63;
+    }
+    num_left -= 3;
+
+    while (num_left) {    /* read past whatever is num_left */    
+        get_bits(8);
+        num_left--;
+    }
+}
+
+
+// Finds the next marker and returns its code (the first byte after the 0xFF prefix that is neither 0xFF nor 0).
+int jpeg_decoder::next_marker()
+{
+    uint32_t c, bytes = 0;
+
+    do {
+        do {   // skip forward to the next 0xFF byte
+            bytes++;
+            c = get_bits(8);
+        } while (c != 0xFF);
+
+        do {   // skip any run of further 0xFF bytes
+            c = get_bits(8);
+        } while (c == 0xFF);
+    } while (c == 0);   // 0xFF followed by 0 is not a marker: keep scanning
+
+    // If bytes > 1 here, there were extra bytes before the marker (not good). The count is not otherwise used.
+    return c;
+}
+
+
+// Process markers. Returns the marker code when an SOFx, SOI, EOI, or SOS marker is
+// encountered; table and restart-interval segments found on the way are parsed, other segments are skipped.
+int jpeg_decoder::process_markers()
+{
+    int c;
+
+    for ( ; ; ) {
+        c = next_marker();
+        switch (c) {
+            case M_SOF0:
+            case M_SOF1:
+            case M_SOF2:
+            case M_SOF3:
+            case M_SOF5:
+            case M_SOF6:
+            case M_SOF7:
+      //      case M_JPG:
+            case M_SOF9:
+            case M_SOF10:
+            case M_SOF11:
+            case M_SOF13:
+            case M_SOF14:
+            case M_SOF15:
+            case M_SOI:
+            case M_EOI:
+            case M_SOS: return c;   // handed back to the caller to interpret
+            case M_DHT: {
+                read_dht_marker();
+                break;
+            }
+            // No arithmetic coding support - dumb patents!
+            case M_DAC: {
+                stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+                break;
+            }
+            case M_DQT: {
+                read_dqt_marker();
+                break;
+            }
+            case M_DRI: {
+                read_dri_marker();
+                break;
+            }
+            //case M_APP0:  /* no need to read the JFIF marker */
+            case M_JPG:
+            case M_RST0:    /* no parameters */
+            case M_RST1:
+            case M_RST2:
+            case M_RST3:
+            case M_RST4:
+            case M_RST5:
+            case M_RST6:
+            case M_RST7:
+            case M_TEM: {
+                stop_decoding(JPGD_UNEXPECTED_MARKER);   // these markers are treated as errors at this point
+                break;
+            }
+            default: {   /* must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn or APP0 */            
+                skip_variable_marker();
+                break;
+            }
+        }
+    }
+}
+
+
+// Finds the start of image (SOI) marker.
+// This code is rather defensive: it gives up after scanning about 4096 bytes (see bytesleft)
+// to avoid false positives.
+void jpeg_decoder::locate_soi_marker()
+{
+    uint32_t lastchar = get_bits(8);
+    uint32_t thischar = get_bits(8);
+
+    /* ok if it's a normal JPEG file without a special header */
+    if ((lastchar == 0xFF) && (thischar == M_SOI)) return;
+
+    uint32_t bytesleft = 4096; //512;
+
+    while (true) {
+        if (--bytesleft == 0) stop_decoding(JPGD_NOT_JPEG);   // scan budget exhausted without finding SOI
+
+        lastchar = thischar;   // slide a two-byte window over the stream
+        thischar = get_bits(8);
+
+        if (lastchar == 0xFF) {
+          if (thischar == M_SOI) break;
+          else if (thischar == M_EOI) stop_decoding(JPGD_NOT_JPEG); // get_bits will keep returning M_EOI if we read past the end    
+        }
+    }
+
+    // Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
+    thischar = (m_bit_buf >> 24) & 0xFF;   // peeks at the top byte of the bit buffer without consuming it
+    if (thischar != 0xFF) stop_decoding(JPGD_NOT_JPEG);
+}
+
+
+// Find a start of frame (SOF) marker and parse it; frame types other than SOF0/SOF1/SOF2 stop decoding.
+void jpeg_decoder::locate_sof_marker()
+{
+    locate_soi_marker();
+    int c = process_markers();
+
+    switch (c) {
+        case M_SOF2: m_progressive_flag = true;   // progressive DCT; falls through to the shared SOF parsing below
+        case M_SOF0:  /* baseline DCT */
+        case M_SOF1: { /* extended sequential DCT */        
+          read_sof_marker();
+          break;
+        }
+        case M_SOF9: {  /* Arithmetic coding */
+          stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+          break;
+        }
+        default: {   // any other marker returned by process_markers() is unsupported here
+          stop_decoding(JPGD_UNSUPPORTED_MARKER);
+          break;
+        }
+    }
+}
+
+
+// Advances to the next scan: returns true after parsing an SOS header, false when EOI is reached first.
+int jpeg_decoder::locate_sos_marker()
+{
+    const int marker = process_markers();
+    if (marker == M_EOI) return false;   // end of image: there is no further scan
+    if (marker != M_SOS) stop_decoding(JPGD_UNEXPECTED_MARKER);   // anything else at this point is an error
+    read_sos_marker();
+    return true;
+}
+
+
+// Reset everything to default/uninitialized state, attach pStream as the input source,
+// then perform the first buffer fill and prime the bit buffer so marker parsing can start.
+void jpeg_decoder::init(jpeg_decoder_stream *pStream)
+{
+    m_pMem_blocks = nullptr;
+    m_error_code = JPGD_SUCCESS;
+    m_ready_flag = false;
+    m_image_x_size = m_image_y_size = 0;
+    m_pStream = pStream;
+    m_progressive_flag = false;
+
+    memset(m_huff_ac, 0, sizeof(m_huff_ac));
+    memset(m_huff_num, 0, sizeof(m_huff_num));
+    memset(m_huff_val, 0, sizeof(m_huff_val));
+    memset(m_quant, 0, sizeof(m_quant));
+
+    m_scan_type = 0;
+    m_comps_in_frame = 0;
+
+    memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
+    memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
+    memset(m_comp_quant, 0, sizeof(m_comp_quant));
+    memset(m_comp_ident, 0, sizeof(m_comp_ident));
+    memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
+    memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
+
+    m_comps_in_scan = 0;
+    memset(m_comp_list, 0, sizeof(m_comp_list));
+    memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
+    memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
+
+    m_spectral_start = 0;
+    m_spectral_end = 0;
+    m_successive_low = 0;
+    m_successive_high = 0;
+    m_max_mcu_x_size = 0;
+    m_max_mcu_y_size = 0;
+    m_blocks_per_mcu = 0;
+    m_max_blocks_per_row = 0;
+    m_mcus_per_row = 0;
+    m_mcus_per_col = 0;
+    m_expanded_blocks_per_component = 0;
+    m_expanded_blocks_per_mcu = 0;
+    m_expanded_blocks_per_row = 0;
+    m_freq_domain_chroma_upsample = false;
+
+    memset(m_mcu_org, 0, sizeof(m_mcu_org));
+
+    m_total_lines_left = 0;
+    m_mcu_lines_left = 0;
+    m_real_dest_bytes_per_scan_line = 0;
+    m_dest_bytes_per_scan_line = 0;
+    m_dest_bytes_per_pixel = 0;
+
+    memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
+
+    memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
+    memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
+    memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));   // cleared once here; a second identical memset further down was redundant
+
+    m_eob_run = 0;
+
+    // Start with an empty input buffer; prep_in_buffer() below performs the first read.
+    m_pIn_buf_ofs = m_in_buf;
+    m_in_buf_left = 0;
+    m_eof_flag = false;
+    m_tem_flag = 0;
+
+    memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
+    memset(m_in_buf, 0, sizeof(m_in_buf));
+    memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
+
+    m_restart_interval = 0;
+    m_restarts_left    = 0;
+    m_next_restart_num = 0;
+
+    m_max_mcus_per_row = 0;
+    m_max_blocks_per_mcu = 0;
+    m_max_mcus_per_col = 0;
+
+    memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
+    m_pMCU_coefficients = nullptr;
+    m_pSample_buf = nullptr;
+
+    m_total_bytes_read = 0;
+
+    m_pScan_line_0 = nullptr;
+    m_pScan_line_1 = nullptr;
+
+    // Ready the input buffer.
+    prep_in_buffer();
+
+    // Prime the bit buffer.
+    m_bits_left = 16;
+    m_bit_buf = 0;
+
+    get_bits(16);
+    get_bits(16);
+
+    for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++) {
+        m_mcu_block_max_zag[i] = 64;   // 64 = treat every block as fully populated until a decode pass says otherwise
+    }
+}
+
+#define SCALEBITS 16   // number of fractional bits used by the fixed-point color conversion constants
+#define ONE_HALF  ((int) 1 << (SCALEBITS-1))   // 0.5 in that fixed-point scale, added before shifting to round
+#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f))   // converts a real constant to fixed point
+
+
+// Builds the four 256-entry tables (m_crr, m_cbb, m_crg, m_cbg) used to convert YCbCr to RGB quickly.
+void jpeg_decoder::create_look_ups()
+{
+  for (int sample = 0; sample < 256; ++sample) {
+      const int centered = sample - 128;   // re-center the 0..255 table index around zero
+      m_crr[sample] = (FIX(1.40200f) * centered + ONE_HALF) >> SCALEBITS;   // rounded and scaled back to integer
+      m_cbb[sample] = (FIX(1.77200f) * centered + ONE_HALF) >> SCALEBITS;   // rounded and scaled back to integer
+      m_crg[sample] = -FIX(0.71414f) * centered;               // kept in fixed point: the converters add m_cbg and shift by 16
+      m_cbg[sample] = -FIX(0.34414f) * centered + ONE_HALF;    // carries the rounding term for that sum
+  }
+}
+
+
+// This method throws back into the stream any bytes that were read
+// into the bit buffer during initial marker scanning.
+void jpeg_decoder::fix_in_buffer()
+{
+    // In case any 0xFF's were pulled into the buffer during marker scanning.
+    JPGD_ASSERT((m_bits_left & 7) == 0);   // the bit buffer must sit on a byte boundary
+
+    if (m_bits_left == 16) stuff_char( (uint8_t)(m_bit_buf & 0xFF));        // lowest byte is pushed back only when 16 bits are pending
+    if (m_bits_left >= 8) stuff_char( (uint8_t)((m_bit_buf >> 8) & 0xFF));   // next byte is pushed back when at least 8 bits are pending
+
+    stuff_char((uint8_t)((m_bit_buf >> 16) & 0xFF));   // the two upper bytes are always pushed back
+    stuff_char((uint8_t)((m_bit_buf >> 24) & 0xFF));
+
+    m_bits_left = 16;   // re-prime the bit buffer, this time with the no-markers reader
+    get_bits_no_markers(16);
+    get_bits_no_markers(16);
+}
+
+
+void jpeg_decoder::transform_mcu(int mcu_row)
+{
+    // Runs the inverse DCT on every coefficient block of one MCU, writing into that MCU's slot of the sample buffer.
+    jpgd_block_t* coeffs = m_pMCU_coefficients;
+    uint8_t* samples = m_pSample_buf + mcu_row * m_blocks_per_mcu * 64;
+
+    for (int block = 0; block < m_blocks_per_mcu; block++, coeffs += 64, samples += 64) {
+        // Both pointers advance by 64 entries per block.
+        idct(coeffs, samples, m_mcu_block_max_zag[block]);
+    }
+}
+
+
+static const uint8_t s_max_rc[64] =   // indexed by (block max zig-zag count - 1); each entry packs a pair as a*16 + b, matched by the switch in transform_mcu_expand (presumably row/column extents - TODO confirm)
+{
+    17, 18, 34, 50, 50, 51, 52, 52, 52, 68, 84, 84, 84, 84, 85, 86, 86, 86, 86, 86,
+    102, 118, 118, 118, 118, 118, 118, 119, 120, 120, 120, 120, 120, 120, 120, 136,
+    136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136,
+    136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136
+};
+
+
+void jpeg_decoder::transform_mcu_expand(int mcu_row)   // IDCT of one MCU in which each of the two trailing (chroma) blocks is expanded into four output blocks
+{
+    jpgd_block_t* pSrc_ptr = m_pMCU_coefficients;
+    uint8_t* pDst_ptr = m_pSample_buf + mcu_row * m_expanded_blocks_per_mcu * 64;
+
+    // Y IDCT
+    int mcu_block;
+    for (mcu_block = 0; mcu_block < m_expanded_blocks_per_component; mcu_block++) {
+        idct(pSrc_ptr, pDst_ptr, m_mcu_block_max_zag[mcu_block]);
+        pSrc_ptr += 64;
+        pDst_ptr += 64;
+    }
+
+    // Chroma IDCT, with upsampling
+    jpgd_block_t temp_block[64];
+
+    for (int i = 0; i < 2; i++) {   // the two blocks that follow the luma blocks
+        DCT_Upsample::Matrix44 P, Q, R, S;
+        JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] >= 1);
+        JPGD_ASSERT(m_mcu_block_max_zag[mcu_block] <= 64);
+
+        int max_zag = m_mcu_block_max_zag[mcu_block++] - 1;   // index into s_max_rc
+        if (max_zag <= 0) max_zag = 0; // should never happen, only here to shut up static analysis
+
+        switch (s_max_rc[max_zag]) {   // selects the P_Q / R_S specialization matching the packed a*16 + b table entry
+            case 1*16+1:
+                DCT_Upsample::P_Q<1, 1>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<1, 1>::calc(R, S, pSrc_ptr);
+                break;
+            case 1*16+2:
+                DCT_Upsample::P_Q<1, 2>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<1, 2>::calc(R, S, pSrc_ptr);
+                break;
+            case 2*16+2:
+                DCT_Upsample::P_Q<2, 2>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<2, 2>::calc(R, S, pSrc_ptr);
+                break;
+            case 3*16+2:
+                DCT_Upsample::P_Q<3, 2>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<3, 2>::calc(R, S, pSrc_ptr);
+                break;
+            case 3*16+3:
+                DCT_Upsample::P_Q<3, 3>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<3, 3>::calc(R, S, pSrc_ptr);
+                break;
+            case 3*16+4:
+                DCT_Upsample::P_Q<3, 4>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<3, 4>::calc(R, S, pSrc_ptr);
+                break;
+            case 4*16+4:
+                DCT_Upsample::P_Q<4, 4>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<4, 4>::calc(R, S, pSrc_ptr);
+                break;
+            case 5*16+4:
+                DCT_Upsample::P_Q<5, 4>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<5, 4>::calc(R, S, pSrc_ptr);
+                break;
+            case 5*16+5:
+                DCT_Upsample::P_Q<5, 5>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<5, 5>::calc(R, S, pSrc_ptr);
+                break;
+            case 5*16+6:
+                DCT_Upsample::P_Q<5, 6>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<5, 6>::calc(R, S, pSrc_ptr);
+                break;
+            case 6*16+6:
+                DCT_Upsample::P_Q<6, 6>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<6, 6>::calc(R, S, pSrc_ptr);
+                break;
+            case 7*16+6:
+                DCT_Upsample::P_Q<7, 6>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<7, 6>::calc(R, S, pSrc_ptr);
+                break;
+            case 7*16+7:
+                DCT_Upsample::P_Q<7, 7>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<7, 7>::calc(R, S, pSrc_ptr);
+                break;
+            case 7*16+8:
+                DCT_Upsample::P_Q<7, 8>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<7, 8>::calc(R, S, pSrc_ptr);
+                break;
+            case 8*16+8:
+                DCT_Upsample::P_Q<8, 8>::calc(P, Q, pSrc_ptr);
+                DCT_Upsample::R_S<8, 8>::calc(R, S, pSrc_ptr);
+                break;
+            default:
+                JPGD_ASSERT(false);   // every value stored in s_max_rc has a case above
+        }
+        DCT_Upsample::Matrix44 a(P + Q); P -= Q;   // a = P + Q; P now holds P - Q
+        DCT_Upsample::Matrix44& b = P;
+        DCT_Upsample::Matrix44 c(R + S); R -= S;   // c = R + S; R now holds R - S
+        DCT_Upsample::Matrix44& d = R;
+
+        DCT_Upsample::Matrix44::add_and_store(temp_block, a, c);   // first of four output blocks: a + c
+        idct_4x4(temp_block, pDst_ptr);
+        pDst_ptr += 64;
+
+        DCT_Upsample::Matrix44::sub_and_store(temp_block, a, c);   // second: a - c
+        idct_4x4(temp_block, pDst_ptr);
+        pDst_ptr += 64;
+
+        DCT_Upsample::Matrix44::add_and_store(temp_block, b, d);   // third: b + d
+        idct_4x4(temp_block, pDst_ptr);
+        pDst_ptr += 64;
+
+        DCT_Upsample::Matrix44::sub_and_store(temp_block, b, d);   // fourth: b - d
+        idct_4x4(temp_block, pDst_ptr);
+        pDst_ptr += 64;
+        pSrc_ptr += 64;   // next source chroma block
+    }
+}
+
+
+// Loads and dequantizes the next row of (already decoded) coefficients, then runs the IDCT on each MCU.
+// Progressive images only.
+void jpeg_decoder::load_next_row()
+{
+    int i;
+    jpgd_block_t *p;
+    jpgd_quant_t *q;
+    int mcu_row, mcu_block, row_block = 0;   // row_block is only incremented, never read
+    int component_num, component_id;
+    int block_x_mcu[JPGD_MAX_COMPONENTS];   // per-component horizontal block position within this row
+
+    memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
+
+    for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) {   // despite its name, mcu_row counts MCUs across the row
+        int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
+
+        for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) {
+            component_id = m_mcu_org[mcu_block];
+            q = m_quant[m_comp_quant[component_id]];
+            p = m_pMCU_coefficients + 64 * mcu_block;
+
+            jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+            jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+            p[0] = pDC[0];   // DC term comes from the DC buffer, the other 63 entries from the AC buffer
+            memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t));
+
+            for (i = 63; i > 0; i--) {   // find the last non-zero coefficient in g_ZAG order
+                if (p[g_ZAG[i]]) break;
+            }
+
+            m_mcu_block_max_zag[mcu_block] = i + 1;   // count of leading g_ZAG positions that may be non-zero
+
+            for ( ; i >= 0; i--) {   // dequantize only the non-zero coefficients
+                if (p[g_ZAG[i]]) {
+                    p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]);
+                }
+            }
+
+            row_block++;
+
+            if (m_comps_in_scan == 1) block_x_mcu[component_id]++;
+            else {   // walk the h_samp x v_samp blocks of this component inside the MCU
+                if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) block_x_mcu_ofs = 0;
+                if (++block_y_mcu_ofs == m_comp_v_samp[component_id]) {
+                    block_y_mcu_ofs = 0;
+                    block_x_mcu[component_id] += m_comp_h_samp[component_id];
+                }            
+            }
+        }
+        if (m_freq_domain_chroma_upsample) transform_mcu_expand(mcu_row);
+        else transform_mcu(mcu_row);
+    }
+    if (m_comps_in_scan == 1) m_block_y_mcu[m_comp_list[0]]++;   // advance each component's vertical block position for the next call
+    else {
+        for (component_num = 0; component_num < m_comps_in_scan; component_num++) {
+            component_id = m_comp_list[component_num];
+            m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+        }
+    }
+}
+
+
+// Restart interval processing: locates the expected RSTn marker, then resets the DC predictors and the bit buffer.
+void jpeg_decoder::process_restart()
+{
+    int i;
+    int c = 0;
+
+    // Align to a byte boundary
+    // FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
+    //get_bits_no_markers(m_bits_left & 7);
+
+    // Let's scan a little bit to find the marker, but not _too_ far.
+    // 1536 is a "fudge factor" that determines how much to scan.
+    for (i = 1536; i > 0; i--) {
+        if (get_char() == 0xFF) break;
+    }
+    if (i == 0) stop_decoding(JPGD_BAD_RESTART_MARKER);   // no 0xFF found within the scan budget
+
+    for ( ; i > 0; i--) {   // skip any further 0xFF bytes; c ends up holding the marker code
+        if ((c = get_char()) != 0xFF) break;
+    }
+    if (i == 0) stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+    // Is it the expected marker? If not, something bad happened.
+    if (c != (m_next_restart_num + M_RST0)) stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+    // Reset each component's DC prediction values.
+    memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint32_t));
+
+    m_eob_run = 0;
+    m_restarts_left = m_restart_interval;
+    m_next_restart_num = (m_next_restart_num + 1) & 7;   // restart numbers cycle through 0..7
+
+    // Get the bit buffer going again...
+    m_bits_left = 16;
+    get_bits_no_markers(16);
+    get_bits_no_markers(16);
+}
+
+
+static inline int dequantize_ac(int c, int q)
+{
+    // Scales a decoded AC coefficient by its quantization table entry.
+    return c * q;
+}
+
+// Decodes and dequantizes the next row of coefficients, running the IDCT on each MCU as it is completed.
+void jpeg_decoder::decode_next_row()
+{
+    int row_block = 0;   // only incremented, never read
+
+    for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) {   // despite its name, mcu_row counts MCUs across the row
+        if ((m_restart_interval) && (m_restarts_left == 0)) process_restart();
+
+        jpgd_block_t* p = m_pMCU_coefficients;
+
+        for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64) {
+            int component_id = m_mcu_org[mcu_block];
+            jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
+
+            int r, s;
+            s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r);   // s: decoded DC symbol, r: its extra bits
+            s = JPGD_HUFF_EXTEND(r, s);
+
+            m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]);   // DC is coded as a difference from the previous block of the same component
+
+            p[0] = static_cast<jpgd_block_t>(s * q[0]);
+
+            int prev_num_set = m_mcu_block_max_zag[mcu_block];   // positions possibly left non-zero by the previous MCU in this slot
+            huff_tables *pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
+            int k;
+            for (k = 1; k < 64; k++) {
+                int extra_bits;
+                s = huff_decode(pH, extra_bits);
+                r = s >> 4;   // high nibble: run of zero coefficients
+                s &= 15;      // low nibble: size of the coefficient that follows
+
+                if (s) {
+                    if (r) {
+                        if ((k + r) > 63) stop_decoding(JPGD_DECODE_ERROR);
+                        if (k < prev_num_set) {   // clear stale values covered by this zero run
+                            int n = JPGD_MIN(r, prev_num_set - k);
+                            int kt = k;
+                            while (n--) p[g_ZAG[kt++]] = 0;
+                        }
+                        k += r;
+                    }                
+                    s = JPGD_HUFF_EXTEND(extra_bits, s);
+                    JPGD_ASSERT(k < 64);
+                    p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
+                } else {
+                    if (r == 15) {   // run of 16 zeros with no coefficient
+                        if ((k + 16) > 64) stop_decoding(JPGD_DECODE_ERROR);
+                        if (k < prev_num_set) {
+                            int n = JPGD_MIN(16, prev_num_set - k);
+                            int kt = k;
+                            while (n--) {
+                                JPGD_ASSERT(kt <= 63);
+                                p[g_ZAG[kt++]] = 0;
+                            }
+                        }
+                        k += 16 - 1; // - 1 because the loop counter is k
+                        JPGD_ASSERT(p[g_ZAG[k]] == 0);
+                    } else  break;   // any other zero-size symbol ends the block
+                }
+            }
+
+            if (k < prev_num_set) {   // clear whatever the previous MCU left beyond the last decoded position
+                int kt = k;
+                while (kt < prev_num_set) p[g_ZAG[kt++]] = 0;
+            }
+
+            m_mcu_block_max_zag[mcu_block] = k;
+            row_block++;
+        }
+        if (m_freq_domain_chroma_upsample) transform_mcu_expand(mcu_row);
+        else transform_mcu(mcu_row);
+        m_restarts_left--;
+    }
+}
+
+
+// YCbCr H1V1 (1x1:1:1, 3 blocks per MCU) to RGB; writes one scan line of 4-byte pixels to m_pScan_line_0
+void jpeg_decoder::H1V1Convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;   // row index inside the current MCU row
+    uint8_t *d = m_pScan_line_0;
+    uint8_t *s = m_pSample_buf + row * 8;
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        for (int j = 0; j < 8; j++) {
+            int y = s[j];          // three consecutive 64-sample blocks per MCU: Y, Cb, Cr
+            int cb = s[64+j];
+            int cr = s[128+j];
+
+            d[0] = clamp(y + m_crr[cr]);
+            d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
+            d[2] = clamp(y + m_cbb[cb]);
+            d[3] = 255;            // fourth byte of every pixel is fixed at 255
+            d += 4;
+        }
+        s += 64*3;   // next MCU
+    }
+}
+
+
+// YCbCr H2V1 (2x1:1:1, 4 blocks per MCU) to RGB; each chroma sample is shared by two horizontally adjacent pixels
+void jpeg_decoder::H2V1Convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;   // row index inside the current MCU row
+    uint8_t *d0 = m_pScan_line_0;
+    uint8_t *y = m_pSample_buf + row * 8;
+    uint8_t *c = m_pSample_buf + 2*64 + row * 8;   // chroma starts after the two Y blocks
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        for (int l = 0; l < 2; l++) {   // the two Y blocks of the MCU
+            for (int j = 0; j < 4; j++) {
+                int cb = c[0];
+                int cr = c[64];   // Cr block follows the Cb block
+
+                int rc = m_crr[cr];
+                int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+                int bc = m_cbb[cb];
+
+                int yy = y[j<<1];
+                d0[0] = clamp(yy+rc);
+                d0[1] = clamp(yy+gc);
+                d0[2] = clamp(yy+bc);
+                d0[3] = 255;
+
+                yy = y[(j<<1)+1];
+                d0[4] = clamp(yy+rc);
+                d0[5] = clamp(yy+gc);
+                d0[6] = clamp(yy+bc);
+                d0[7] = 255;
+                d0 += 8;
+                c++;
+            }
+            y += 64;
+        }
+        y += 64*4 - 64*2;   // skip the rest of this 4-block MCU
+        c += 64*4 - 8;      // 8 chroma samples were consumed above
+    }
+}
+
+
+// YCbCr H1V2 (1x2:1:1, 4 blocks per MCU) to RGB; fills two scan lines (m_pScan_line_0 and m_pScan_line_1) per call
+void jpeg_decoder::H1V2Convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;   // row index inside the current MCU row
+    uint8_t *d0 = m_pScan_line_0;
+    uint8_t *d1 = m_pScan_line_1;
+    uint8_t *y;
+    uint8_t *c;
+
+    if (row < 8) y = m_pSample_buf + row * 8;        // rows 0-7 come from the first Y block
+    else y = m_pSample_buf + 64*1 + (row & 7) * 8;   // rows 8 and up come from the second Y block
+
+    c = m_pSample_buf + 64*2 + (row >> 1) * 8;   // one chroma row serves two luma rows
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        for (int j = 0; j < 8; j++) {
+            int cb = c[0+j];
+            int cr = c[64+j];
+
+            int rc = m_crr[cr];
+            int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+            int bc = m_cbb[cb];
+
+            int yy = y[j];
+            d0[0] = clamp(yy+rc);
+            d0[1] = clamp(yy+gc);
+            d0[2] = clamp(yy+bc);
+            d0[3] = 255;
+
+            yy = y[8+j];   // same column, next luma row
+            d1[0] = clamp(yy+rc);
+            d1[1] = clamp(yy+gc);
+            d1[2] = clamp(yy+bc);
+            d1[3] = 255;
+
+            d0 += 4;
+            d1 += 4;
+        }
+        y += 64*4;   // next MCU
+        c += 64*4;
+    }
+}
+
+
+// YCbCr H2V2 (2x2:1:1, 6 blocks per MCU) to RGB; each chroma sample covers a 2x2 pixel group, two scan lines are filled per call
+void jpeg_decoder::H2V2Convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;   // row index inside the current MCU row
+    uint8_t *d0 = m_pScan_line_0;
+    uint8_t *d1 = m_pScan_line_1;
+    uint8_t *y;
+    uint8_t *c;
+
+    if (row < 8) y = m_pSample_buf + row * 8;        // rows 0-7 come from the first pair of Y blocks
+    else y = m_pSample_buf + 64*2 + (row & 7) * 8;   // rows 8 and up come from the second pair
+
+    c = m_pSample_buf + 64*4 + (row >> 1) * 8;   // chroma starts after the four Y blocks; one chroma row serves two luma rows
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        for (int l = 0; l < 2; l++) {   // left and right Y block of the pair
+            for (int j = 0; j < 8; j += 2) {
+                int cb = c[0];
+                int cr = c[64];
+
+                int rc = m_crr[cr];
+                int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+                int bc = m_cbb[cb];
+
+                int yy = y[j];
+                d0[0] = clamp(yy+rc);
+                d0[1] = clamp(yy+gc);
+                d0[2] = clamp(yy+bc);
+                d0[3] = 255;
+
+                yy = y[j+1];
+                d0[4] = clamp(yy+rc);
+                d0[5] = clamp(yy+gc);
+                d0[6] = clamp(yy+bc);
+                d0[7] = 255;
+
+                yy = y[j+8];   // same columns, next luma row
+                d1[0] = clamp(yy+rc);
+                d1[1] = clamp(yy+gc);
+                d1[2] = clamp(yy+bc);
+                d1[3] = 255;
+
+                yy = y[j+8+1];
+                d1[4] = clamp(yy+rc);
+                d1[5] = clamp(yy+gc);
+                d1[6] = clamp(yy+bc);
+                d1[7] = 255;
+
+                d0 += 8;
+                d1 += 8;
+
+                c++;
+            }
+            y += 64;
+        }
+        y += 64*6 - 64*2;   // skip the rest of this 6-block MCU
+        c += 64*6 - 8;      // 8 chroma samples were consumed above
+    }
+}
+
+
+// Y (1 block per MCU) to 8-bit grayscale: copies one 8-sample row of every MCU block into m_pScan_line_0.
+void jpeg_decoder::gray_convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;
+    uint8_t *d = m_pScan_line_0;
+    uint8_t *s = m_pSample_buf + row * 8;
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        // memcpy instead of uint32_t type-punning: no strict-aliasing or alignment assumptions.
+        memcpy(d, s, 8);
+        s += 64;
+        d += 8;
+    }
+}
+
+// YCbCr to RGBA for the m_freq_domain_chroma_upsample path: every luma sample has its own Cb and Cr sample.
+void jpeg_decoder::expanded_convert()
+{
+    int row = m_max_mcu_y_size - m_mcu_lines_left;
+    uint8_t* Py = m_pSample_buf + (row / 8) * 64 * m_comp_h_samp[0] + (row & 7) * 8;
+    uint8_t* d = m_pScan_line_0;
+
+    for (int i = m_max_mcus_per_row; i > 0; i--) {
+        for (int k = 0; k < m_max_mcu_x_size; k += 8) {
+            const int Y_ofs = k * 8;  // k/8 blocks of 64 samples into the MCU
+            const int Cb_ofs = Y_ofs + 64 * m_expanded_blocks_per_component;  // Cb blocks follow all Y blocks
+            const int Cr_ofs = Y_ofs + 64 * m_expanded_blocks_per_component * 2;  // Cr blocks follow all Cb blocks
+            for (int j = 0; j < 8; j++) {
+                int y = Py[Y_ofs + j];
+                int cb = Py[Cb_ofs + j];
+                int cr = Py[Cr_ofs + j];
+
+                d[0] = clamp(y + m_crr[cr]);
+                d[1] = clamp(y + ((m_crg[cr] + m_cbg[cb]) >> 16));
+                d[2] = clamp(y + m_cbb[cb]);
+                d[3] = 255;
+
+                d += 4;
+            }
+        }
+        Py += 64 * m_expanded_blocks_per_mcu;  // step to the next MCU
+    }
+}
+
+
+// Consumes the trailing EOI marker (non-progressive streams only) and removes still-buffered bytes from the byte count.
+void jpeg_decoder::find_eoi()
+{
+    const bool sequential = !m_progressive_flag;
+    if (sequential) {
+        // Prime the bit buffer: declare 16 bits present, then pull two
+        // 16-bit words through it.
+        m_bits_left = 16;
+        for (int n = 0; n < 2; n++) get_bits(16);
+
+        // EOI is expected to be the next marker in the stream.
+        process_markers();
+    }
+
+    // Whatever is left unread in the input buffer was not consumed by the image.
+    m_total_bytes_read -= m_in_buf_left;
+}
+
+// Produces the next scan line. Returns JPGD_SUCCESS (outputs set), JPGD_DONE (no lines left) or JPGD_FAILED.
+int jpeg_decoder::decode(const void** pScan_line, uint32_t* pScan_line_len)
+{
+    if ((m_error_code) || (!m_ready_flag)) return JPGD_FAILED;
+    if (m_total_lines_left == 0) return JPGD_DONE;
+    if (m_mcu_lines_left == 0) {  // current MCU row is used up: fetch the next one
+        if (setjmp(m_jmp_state)) return JPGD_FAILED;  // non-zero means the calls below jumped back here
+        if (m_progressive_flag) load_next_row();
+        else decode_next_row();
+        // Find the EOI marker if that was the last row.
+        if (m_total_lines_left <= m_max_mcu_y_size) find_eoi();
+        m_mcu_lines_left = m_max_mcu_y_size;
+    }
+
+    if (m_freq_domain_chroma_upsample) {
+        expanded_convert();
+        *pScan_line = m_pScan_line_0;
+    } else {
+        switch (m_scan_type) {
+            case JPGD_YH2V2: {
+                if ((m_mcu_lines_left & 1) == 0) {  // H2V2Convert fills both line buffers at once
+                    H2V2Convert();
+                    *pScan_line = m_pScan_line_0;
+                }
+              else *pScan_line = m_pScan_line_1;  // second line was filled by the previous call
+              break;
+            } 
+            case JPGD_YH2V1: {
+                H2V1Convert();
+                *pScan_line = m_pScan_line_0;
+                break;
+            }
+            case JPGD_YH1V2: {
+                if ((m_mcu_lines_left & 1) == 0) {  // H1V2Convert fills both line buffers at once
+                    H1V2Convert();
+                    *pScan_line = m_pScan_line_0;
+                } else *pScan_line = m_pScan_line_1;
+                break;
+            }
+            case JPGD_YH1V1: {
+                H1V1Convert();
+                *pScan_line = m_pScan_line_0;
+                break;
+            }
+            case JPGD_GRAYSCALE: {
+                gray_convert();
+                *pScan_line = m_pScan_line_0;
+                break;
+            }
+        }
+    }
+
+    *pScan_line_len = m_real_dest_bytes_per_scan_line;  // unpadded length: image width * bytes per pixel
+    m_mcu_lines_left--;
+    m_total_lines_left--;
+
+    return JPGD_SUCCESS;
+}
+
+
+// Builds pH for Huffman table `index`: 8-bit look_up/look_up2 tables for short codes, tree[] for codes over 8 bits.
+void jpeg_decoder::make_huff_table(int index, huff_tables *pH)
+{
+    int p, i, l, si;
+    uint8_t huffsize[257];
+    uint32_t huffcode[257];
+    uint32_t code;
+    uint32_t subtree;
+    int code_size;
+    int lastp;
+    int nextfreeentry;
+    int currententry;
+
+    pH->ac_table = m_huff_ac[index] != 0;
+    p = 0;
+
+    for (l = 1; l <= 16; l++)  {  // expand the per-length counts into one code length per symbol
+        for (i = 1; i <= m_huff_num[index][l]; i++) {
+            huffsize[p++] = static_cast<uint8_t>(l);  // NOTE(review): p is not checked against 257 here; presumably bounded where the table is read - confirm
+        }
+    }
+
+    huffsize[p] = 0;  // zero length terminates the list
+    lastp = p;
+    code = 0;
+    si = huffsize[0];
+    p = 0;
+
+    while (huffsize[p]) {  // assign consecutive codes, shortest lengths first
+        while (huffsize[p] == si) {
+            huffcode[p++] = code;
+            code++;
+        }
+        code <<= 1;
+        si++;
+    }
+
+    memset(pH->look_up, 0, sizeof(pH->look_up));
+    memset(pH->look_up2, 0, sizeof(pH->look_up2));
+    memset(pH->tree, 0, sizeof(pH->tree));
+    memset(pH->code_size, 0, sizeof(pH->code_size));
+
+    nextfreeentry = -1;  // tree nodes are identified by negative numbers, handed out two at a time
+    p = 0;
+
+    while (p < lastp) {
+        i = m_huff_val[index][p];
+        code = huffcode[p];
+        code_size = huffsize[p];
+        pH->code_size[i] = static_cast<uint8_t>(code_size);
+
+        if (code_size <= 8) {  // short code: fill every 8-bit pattern that starts with it
+            code <<= (8 - code_size);
+            for (l = 1 << (8 - code_size); l > 0; l--) {
+                JPGD_ASSERT(i < 256);
+                pH->look_up[code] = i;
+                bool has_extrabits = false;
+                int extra_bits = 0;
+                int num_extra_bits = i & 15;
+                int bits_to_fetch = code_size;
+
+                if (num_extra_bits) {
+                    int total_codesize = code_size + num_extra_bits;
+                    if (total_codesize <= 8) {  // code plus its extra bits fit in the same 8-bit pattern
+                        has_extrabits = true;
+                        extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
+                        JPGD_ASSERT(extra_bits <= 0x7FFF);
+                        bits_to_fetch += num_extra_bits;
+                    }
+                }
+                if (!has_extrabits) pH->look_up2[code] = i | (bits_to_fetch << 8);  // symbol in bits 0-7, bit count from bit 8
+                else pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);  // 0x8000 marks pre-extracted extra bits (stored from bit 16)
+                code++;
+            }
+        } else {  // long code: the first 8 bits pick a subtree, the remaining bits walk tree[]
+            subtree = (code >> (code_size - 8)) & 0xFF;
+            currententry = pH->look_up[subtree];
+
+            if (currententry == 0) {  // no subtree for this prefix yet: allocate a node pair
+                pH->look_up[subtree] = currententry = nextfreeentry;
+                pH->look_up2[subtree] = currententry = nextfreeentry;  // NOTE(review): currententry is assigned twice; redundant but harmless
+                nextfreeentry -= 2;
+            }
+
+            code <<= (16 - (code_size - 8));  // move the bits after the 8-bit prefix up to bit 15
+
+            for (l = code_size; l > 9; l--) {
+                if ((code & 0x8000) == 0) currententry--;  // a 0 bit selects the second slot of the pair
+                if (pH->tree[-currententry - 1] == 0) {
+                    pH->tree[-currententry - 1] = nextfreeentry;
+                    currententry = nextfreeentry;
+                    nextfreeentry -= 2;
+                } else currententry = pH->tree[-currententry - 1];
+                code <<= 1;
+            }
+            if ((code & 0x8000) == 0) currententry--;
+            pH->tree[-currententry - 1] = i;  // leaf holds the symbol; NOTE(review): tree[] index is not range-checked in this function
+        }
+        p++;
+    }
+}
+
+
+// Stops decoding with JPGD_UNDEFINED_QUANT_TABLE if a component of this scan refers to a missing quantization table.
+void jpeg_decoder::check_quant_tables()
+{
+    for (int n = 0; n < m_comps_in_scan; n++) {
+        if (!m_quant[m_comp_quant[m_comp_list[n]]]) stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
+    }
+}
+
+
+// Checks that every Huffman table this scan refers to is defined, then (re)builds decode tables for all defined ones.
+void jpeg_decoder::check_huff_tables()
+{
+    for (int n = 0; n < m_comps_in_scan; n++) {
+        const int comp = m_comp_list[n];
+        if ((m_spectral_start == 0) && !m_huff_num[m_comp_dc_tab[comp]]) stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+        if ((m_spectral_end > 0) && !m_huff_num[m_comp_ac_tab[comp]]) stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+    }
+    for (int t = 0; t < JPGD_MAX_HUFF_TABLES; t++) {
+        if (!m_huff_num[t]) continue;
+        // Allocate the decode table on first use; an existing one is rebuilt in place.
+        if (!m_pHuff_tabs[t]) m_pHuff_tabs[t] = (huff_tables *)alloc(sizeof(huff_tables));
+        make_huff_table(t, m_pHuff_tabs[t]);
+    }
+}
+
+
+// Determines the component order inside each MCU (m_mcu_org, m_blocks_per_mcu).
+// Also computes per-component block counts and how many MCUs are on each row and column.
+void jpeg_decoder::calc_mcu_block_order()
+{
+    int component_num, component_id;
+    int max_h_samp = 0, max_v_samp = 0;
+
+    for (component_id = 0; component_id < m_comps_in_frame; component_id++) {  // largest sampling factors in the frame
+        if (m_comp_h_samp[component_id] > max_h_samp) {
+          max_h_samp = m_comp_h_samp[component_id];
+        }
+        if (m_comp_v_samp[component_id] > max_v_samp) {
+          max_v_samp = m_comp_v_samp[component_id];
+        }
+    }
+
+    for (component_id = 0; component_id < m_comps_in_frame; component_id++) {  // scaled size, rounded up to whole 8x8 blocks
+        m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8;
+        m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8;
+    }
+
+    if (m_comps_in_scan == 1) {  // single-component scan: one block per MCU
+        m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]];
+        m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]];
+    } else {
+        m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp;
+        m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp;
+    }
+
+    if (m_comps_in_scan == 1) {
+        m_mcu_org[0] = m_comp_list[0];
+        m_blocks_per_mcu = 1;
+    } else {
+        m_blocks_per_mcu = 0;
+
+        for (component_num = 0; component_num < m_comps_in_scan; component_num++) {
+            int num_blocks;
+            component_id = m_comp_list[component_num];
+            num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id];
+            while (num_blocks--) m_mcu_org[m_blocks_per_mcu++] = component_id;  // one entry per block, grouped by component
+        }
+    }
+}
+
+
+// Starts a new scan. Returns false if locate_sos_marker() fails, true once the per-scan state has been reset.
+int jpeg_decoder::init_scan()
+{
+    if (!locate_sos_marker()) return false;
+
+    calc_mcu_block_order();
+    check_huff_tables();
+    check_quant_tables();
+
+    memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint32_t));  // DC predictors start at zero in every scan
+
+    m_eob_run = 0;
+
+    if (m_restart_interval) {
+        m_restarts_left = m_restart_interval;
+        m_next_restart_num = 0;
+    }
+    fix_in_buffer();
+    return true;
+}
+
+
+// Starts a frame: validates the component count and sampling factors (stopping decoding if unsupported),
+// derives MCU geometry and scan line sizes, and allocates the scan line, coefficient and sample buffers.
+void jpeg_decoder::init_frame()
+{
+    int i;
+
+    if (m_comps_in_frame == 1) {
+        if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1)) stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+        m_scan_type = JPGD_GRAYSCALE;
+        m_max_blocks_per_mcu = 1;
+        m_max_mcu_x_size = 8;
+        m_max_mcu_y_size = 8;
+    } else if (m_comps_in_frame == 3) {
+        if (((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) || ((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)))
+            stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);  // components 1 and 2 must both be sampled 1x1
+
+        if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1)) {
+            m_scan_type = JPGD_YH1V1;
+            m_max_blocks_per_mcu = 3;
+            m_max_mcu_x_size = 8;
+            m_max_mcu_y_size = 8;
+        } else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1)) {
+            m_scan_type = JPGD_YH2V1;
+            m_max_blocks_per_mcu = 4;
+            m_max_mcu_x_size = 16;
+            m_max_mcu_y_size = 8;
+        } else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2)) {
+            m_scan_type = JPGD_YH1V2;
+            m_max_blocks_per_mcu = 4;
+            m_max_mcu_x_size = 8;
+            m_max_mcu_y_size = 16;
+        } else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2)) {
+            m_scan_type = JPGD_YH2V2;
+            m_max_blocks_per_mcu = 6;
+            m_max_mcu_x_size = 16;
+            m_max_mcu_y_size = 16;
+        } else stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+    } else stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+    m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
+    m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
+
+    // These values are for the *destination* pixels: after conversion.
+    if (m_scan_type == JPGD_GRAYSCALE) m_dest_bytes_per_pixel = 1;
+    else m_dest_bytes_per_pixel = 4;
+
+    m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & 0xFFF0) * m_dest_bytes_per_pixel;  // width rounded up to a multiple of 16; NOTE(review): the 0xFFF0 mask assumes width < 65536 - confirm
+    m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
+
+    // Initialize two scan line buffers.
+    m_pScan_line_0 = (uint8_t *)alloc(m_dest_bytes_per_scan_line, true);
+    if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2)) {  // only these scan types emit two lines per conversion
+        m_pScan_line_1 = (uint8_t *)alloc(m_dest_bytes_per_scan_line, true);
+    }
+
+    m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
+
+    // Should never happen
+    if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW) stop_decoding(JPGD_ASSERTION_ERROR);
+
+    // Allocate the coefficient buffer, enough for one MCU
+    m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
+
+    for (i = 0; i < m_max_blocks_per_mcu; i++) {
+        m_mcu_block_max_zag[i] = 64;
+    }
+
+    m_expanded_blocks_per_component = m_comp_h_samp[0] * m_comp_v_samp[0];
+    m_expanded_blocks_per_mcu = m_expanded_blocks_per_component * m_comps_in_frame;
+    m_expanded_blocks_per_row = m_max_mcus_per_row * m_expanded_blocks_per_mcu;
+    // Freq. domain chroma upsampling is only supported for H2V2 subsampling factor (the most common one I've seen).
+    m_freq_domain_chroma_upsample = false;
+#if JPGD_SUPPORT_FREQ_DOMAIN_UPSAMPLING
+    m_freq_domain_chroma_upsample = (m_expanded_blocks_per_mcu == 4*3);
+#endif
+
+    if (m_freq_domain_chroma_upsample)
+        m_pSample_buf = (uint8_t *)alloc(m_expanded_blocks_per_row * 64);
+    else
+        m_pSample_buf = (uint8_t *)alloc(m_max_blocks_per_row * 64);
+
+    m_total_lines_left = m_image_y_size;
+    m_mcu_lines_left = 0;  // makes the first decode() call fetch an MCU row
+    create_look_ups();
+}
+
+
+// Allocates a coeff_buf describing block_num_x * block_num_y blocks of block_len_x * block_len_y
+// jpgd_block_t entries each, plus its data area, both through alloc().
+// Historical note: the coeff_buf methods originally stored the coefficients in a cached
+// "virtual" file (EMS, XMS, or a disk file). Now the entire thing is kept in RAM.
+jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
+{
+    coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf));
+    cb->block_num_x = block_num_x;
+    cb->block_num_y = block_num_y;
+    cb->block_len_x = block_len_x;
+    cb->block_len_y = block_len_y;
+    cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);  // bytes per block
+    cb->pData = (uint8_t *)alloc(cb->block_size * block_num_x * block_num_y, true);  // `true` presumably requests zeroed memory - confirm against alloc()
+    return cb;
+}
+
+// Returns a pointer to block (block_x, block_y) inside cb->pData; blocks are stored row by row.
+inline jpgd_block_t *jpeg_decoder::coeff_buf_getp(coeff_buf *cb, int block_x, int block_y)
+{
+    JPGD_ASSERT((block_x < cb->block_num_x) && (block_y < cb->block_num_y));
+    return (jpgd_block_t *)(cb->pData + block_x * cb->block_size + block_y * (cb->block_size * cb->block_num_x));
+}
+
+
+// The following methods decode the various types of blocks encountered in progressively encoded images.
+// DC first pass: decodes the DC difference, adds it to the component's running DC value, stores it shifted by m_successive_low.
+void jpeg_decoder::decode_block_dc_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+    int s, r;
+    jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+
+    if ((s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_dc_tab[component_id]])) != 0) {  // s = number of extra bits that follow
+        r = pD->get_bits_no_markers(s);
+        s = JPGD_HUFF_EXTEND(r, s);
+    }
+    pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
+    p[0] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
+}
+
+// DC refinement pass: reads one bit and, if it is set, ORs bit m_successive_low into the stored DC coefficient.
+void jpeg_decoder::decode_block_dc_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+    if (pD->get_bits_no_markers(1)) {
+        jpgd_block_t *p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+        p[0] |= (1 << pD->m_successive_low);
+    }
+}
+
+// AC first pass: decodes coefficients m_spectral_start..m_spectral_end of one block, honouring a pending m_eob_run.
+void jpeg_decoder::decode_block_ac_first(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+    int k, s, r;
+
+    if (pD->m_eob_run) {  // this block is covered by an earlier end-of-band run: nothing to read
+        pD->m_eob_run--;
+        return;
+    }
+    jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+    for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++) {
+        s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+        r = s >> 4;  // high nibble: number of coefficients to skip
+        s &= 15;  // low nibble: number of value bits that follow
+        if (s) {
+            if ((k += r) > 63) pD->stop_decoding(JPGD_DECODE_ERROR);
+            r = pD->get_bits_no_markers(s);
+            s = JPGD_HUFF_EXTEND(r, s);
+            p[g_ZAG[k]] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
+        } else {
+            if (r == 15) {  // skip 15 coefficients (the loop's k++ makes it 16)
+                if ((k += 15) > 63) pD->stop_decoding(JPGD_DECODE_ERROR);
+            } else {  // end-of-band run of (1 << r) blocks plus r extra bits
+                pD->m_eob_run = 1 << r;
+                if (r) pD->m_eob_run += pD->get_bits_no_markers(r);
+                pD->m_eob_run--;  // the current block counts as the first of the run
+                break;
+            }
+        }
+    }
+}
+
+// AC refinement pass: adds one more bit of precision (bit m_successive_low) to already non-zero coefficients and inserts new +/-1 values.
+void jpeg_decoder::decode_block_ac_refine(jpeg_decoder *pD, int component_id, int block_x, int block_y)
+{
+    int s, k, r;
+    int p1 = 1 << pD->m_successive_low;
+    int m1 = -p1;  // same value as (-1) << m_successive_low, without left-shifting a negative number (UB before C++20)
+    jpgd_block_t *p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+    JPGD_ASSERT(pD->m_spectral_end <= 63);
+
+    k = pD->m_spectral_start;
+
+    if (pD->m_eob_run == 0) {
+        for ( ; k <= pD->m_spectral_end; k++) {
+            s = pD->huff_decode(pD->m_pHuff_tabs[pD->m_comp_ac_tab[component_id]]);
+            r = s >> 4;
+            s &= 15;
+            if (s) {
+                if (s != 1) pD->stop_decoding(JPGD_DECODE_ERROR);
+                if (pD->get_bits_no_markers(1)) s = p1;
+                else s = m1;
+            } else {
+                if (r != 15) {
+                    pD->m_eob_run = 1 << r;
+                    if (r) pD->m_eob_run += pD->get_bits_no_markers(r);
+                    break;
+                }
+            }
+
+            do {
+                jpgd_block_t *this_coef = p + g_ZAG[k & 63];
+
+                if (*this_coef != 0) {
+                    if (pD->get_bits_no_markers(1)) {
+                        if ((*this_coef & p1) == 0) {
+                            if (*this_coef >= 0) *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+                            else *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+                        }
+                    }
+                } else {
+                    if (--r < 0) break;
+                }
+                k++;
+            } while (k <= pD->m_spectral_end);
+
+            if ((s) && (k < 64)) {
+              p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
+            }
+        }
+    }
+
+    if (pD->m_eob_run > 0) {
+        for ( ; k <= pD->m_spectral_end; k++) {
+            jpgd_block_t *this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
+
+            if (*this_coef != 0) {
+                if (pD->get_bits_no_markers(1)) {
+                    if ((*this_coef & p1) == 0) {
+                        if (*this_coef >= 0) *this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+                        else *this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+                    }
+                }
+            }
+        }
+        pD->m_eob_run--;
+    }
+}
+
+
+// Decodes one progressive scan: calls decode_block_func for every block of every MCU, tracking per-component block coordinates.
+void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
+{
+    int mcu_row, mcu_col, mcu_block;
+    int block_x_mcu[JPGD_MAX_COMPONENTS], m_block_y_mcu[JPGD_MAX_COMPONENTS];  // NOTE: m_block_y_mcu is a local despite its m_ prefix
+
+    memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+
+    for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++) {  // one iteration per row of MCUs
+        int component_num, component_id;
+        memset(block_x_mcu, 0, sizeof(block_x_mcu));
+
+        for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) {  // one iteration per MCU in that row
+            int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
+
+            if ((m_restart_interval) && (m_restarts_left == 0)) process_restart();
+
+            for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++) {
+                component_id = m_mcu_org[mcu_block];
+                decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+
+                if (m_comps_in_scan == 1) block_x_mcu[component_id]++;
+                else {
+                    if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) {  // finished one row of this component's blocks in the MCU
+                        block_x_mcu_ofs = 0;
+
+                        if (++block_y_mcu_ofs == m_comp_v_samp[component_id]) {  // finished all of this component's blocks in the MCU
+                            block_y_mcu_ofs = 0;
+                            block_x_mcu[component_id] += m_comp_h_samp[component_id];
+                        }
+                    }
+                }
+            }
+            m_restarts_left--;
+        }
+
+        if (m_comps_in_scan == 1) m_block_y_mcu[m_comp_list[0]]++;
+        else {
+            for (component_num = 0; component_num < m_comps_in_scan; component_num++) {
+                component_id = m_comp_list[component_num];
+                m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+            }
+        }
+    }
+}
+
+
+// Decodes every scan of a progressive image into the coefficient buffers, then restores a full-frame MCU layout.
+void jpeg_decoder::init_progressive()
+{
+    int i;
+
+    if (m_comps_in_frame == 4) stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+    // Allocate the coefficient buffers.
+    for (i = 0; i < m_comps_in_frame; i++) {
+        m_dc_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 1, 1);
+        m_ac_coeffs[i] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[i], m_max_mcus_per_col * m_comp_v_samp[i], 8, 8);
+    }
+
+    while (true) {  // one iteration per scan; ends when init_scan() finds no further scan
+        int dc_only_scan, refinement_scan;
+        pDecode_block_func decode_block_func;
+
+        if (!init_scan()) break;
+
+        dc_only_scan = (m_spectral_start == 0);
+        refinement_scan = (m_successive_high != 0);
+
+        if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63)) stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+        if (dc_only_scan) {
+            if (m_spectral_end) stop_decoding(JPGD_BAD_SOS_SPECTRAL);  // a scan starting at 0 must contain only the DC coefficient
+        } else if (m_comps_in_scan != 1) {  /* AC scans can only contain one component */
+            stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+        }
+
+        if ((refinement_scan) && (m_successive_low != m_successive_high - 1)) stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
+
+        if (dc_only_scan) {
+            if (refinement_scan) decode_block_func = decode_block_dc_refine;
+            else decode_block_func = decode_block_dc_first;
+        } else {
+            if (refinement_scan) decode_block_func = decode_block_ac_refine;
+            else decode_block_func = decode_block_ac_first;
+        }
+        decode_scan(decode_block_func);
+        m_bits_left = 16;  // prime the bit buffer before looking for the next scan
+        get_bits(16);
+        get_bits(16);
+    }
+
+    m_comps_in_scan = m_comps_in_frame;  // from here on treat the image as one scan holding every component
+
+    for (i = 0; i < m_comps_in_frame; i++) {
+        m_comp_list[i] = i;
+    }
+
+    calc_mcu_block_order();
+}
+
+// Sequential (non-progressive) setup: a first scan must be found, otherwise decoding stops with JPGD_UNEXPECTED_MARKER.
+void jpeg_decoder::init_sequential()
+{
+    if (init_scan() == 0) stop_decoding(JPGD_UNEXPECTED_MARKER);
+}
+
+// Runs init_frame(), then the progressive or the sequential initialisation depending on m_progressive_flag.
+void jpeg_decoder::decode_start()
+{
+    init_frame();
+    if (!m_progressive_flag) init_sequential();
+    else init_progressive();
+}
+
+// Runs init() with pStream, then locate_sof_marker(); called from the constructor.
+void jpeg_decoder::decode_init(jpeg_decoder_stream *pStream)
+{
+    init(pStream);
+    locate_sof_marker();
+}
+
+// Constructs a decoder for pStream and runs decode_init(); a non-zero setjmp() return ends construction early.
+jpeg_decoder::jpeg_decoder(jpeg_decoder_stream *pStream)
+{
+    if (setjmp(m_jmp_state)) return;  // presumably reached via stop_decoding(); callers check get_error_code() - confirm
+    decode_init(pStream);
+}
+
+// Prepares for decode(): runs decode_start() once. JPGD_SUCCESS if ready or already ready, JPGD_FAILED on an earlier or new error.
+int jpeg_decoder::begin_decoding()
+{
+    if (m_ready_flag) return JPGD_SUCCESS;
+    if (m_error_code) return JPGD_FAILED;
+    if (setjmp(m_jmp_state)) return JPGD_FAILED;  // non-zero means decode_start() jumped back here
+
+    decode_start();
+    m_ready_flag = true;
+
+    return JPGD_SUCCESS;
+}
+
+// Destructor: calls free_all_blocks().
+jpeg_decoder::~jpeg_decoder()
+{
+    free_all_blocks();
+}
+
+// Creates a stream with no file attached and both status flags cleared.
+jpeg_decoder_file_stream::jpeg_decoder_file_stream()
+{
+    m_error_flag = false;
+    m_eof_flag = false;
+    m_pFile = nullptr;
+}
+
+// Closes the underlying FILE, if one is open, and clears the EOF and error flags.
+void jpeg_decoder_file_stream::close()
+{
+    if (m_pFile != nullptr) {
+        fclose(m_pFile);
+        m_pFile = nullptr;
+    }
+    m_error_flag = false;
+    m_eof_flag = false;
+}
+
+// Destructor: close() releases the FILE if one is still open.
+jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
+{
+    close();
+}
+
+// Opens Pfilename for binary reading after closing any previously opened file. Returns true if the file was opened.
+bool jpeg_decoder_file_stream::open(const char *Pfilename)
+{
+    close();
+
+    m_error_flag = false;
+    m_eof_flag = false;
+
+#ifdef _MSC_VER
+    m_pFile = nullptr;
+    fopen_s(&m_pFile, Pfilename, "rb");
+#else
+    m_pFile = fopen(Pfilename, "rb");
+#endif
+    return (m_pFile ? true : false);
+}
+
+// Reads up to max_bytes_to_read bytes into pBuf. Returns the byte count, 0 once EOF was already seen, or -1 on error / no file.
+int jpeg_decoder_file_stream::read(uint8_t *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+{
+    if (!m_pFile) return -1;
+
+    if (m_eof_flag) {
+        *pEOF_flag = true;
+        return 0;
+    }
+
+    if (m_error_flag) return -1;
+    *pEOF_flag = false;  // always define the out-flag on the read path, as jpeg_decoder_mem_stream::read() does
+    int bytes_read = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
+    if (bytes_read < max_bytes_to_read) {
+        if (ferror(m_pFile)) {
+            m_error_flag = true;
+            return -1;
+        }
+        m_eof_flag = true;  // short read without an error: end of file
+        *pEOF_flag = true;
+    }
+    return bytes_read;
+}
+
+// Calls close(), then attaches the stream to pSrc_data (size bytes) with the read offset at 0. Always returns true.
+bool jpeg_decoder_mem_stream::open(const uint8_t *pSrc_data, uint32_t size)
+{
+    close();
+    m_size = size;
+    m_ofs = 0;
+    m_pSrc_data = pSrc_data;
+    return true;
+}
+
+// Copies up to max_bytes_to_read bytes from the memory buffer into pBuf; *pEOF_flag is set when the request was cut short.
+int jpeg_decoder_mem_stream::read(uint8_t *pBuf, int max_bytes_to_read, bool *pEOF_flag)
+{
+    *pEOF_flag = false;
+    if (m_pSrc_data == nullptr) return -1;
+
+    const uint32_t available = m_size - m_ofs;
+    const bool truncated = (uint32_t)max_bytes_to_read > available;
+    int count = max_bytes_to_read;
+    if (truncated) count = available;
+    *pEOF_flag = truncated;
+    memcpy(pBuf, m_pSrc_data + m_ofs, count);
+    m_ofs += count;
+
+    return count;
+}
+
+// Decodes a whole JPEG from pStream into a malloc()ed buffer with req_comps (1, 3 or 4) bytes per pixel; nullptr on failure.
+unsigned char *decompress_jpeg_image_from_stream(jpeg_decoder_stream *pStream, int *width, int *height, int *actual_comps, int req_comps)
+{
+    if (!actual_comps) return nullptr;
+    *actual_comps = 0;
+
+    if ((!pStream) || (!width) || (!height) || (!req_comps))  return nullptr;
+    if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4)) return nullptr;
+
+    jpeg_decoder decoder(pStream);
+    if (decoder.get_error_code() != JPGD_SUCCESS) return nullptr;
+
+    const int image_width = decoder.get_width(), image_height = decoder.get_height();
+    *width = image_width;
+    *height = image_height;
+    *actual_comps = decoder.get_num_components();
+    // width/height/actual_comps stay filled in even if decoding fails below.
+    if (decoder.begin_decoding() != JPGD_SUCCESS) return nullptr;
+
+    const int dst_bpl = image_width * req_comps;  // destination bytes per line
+    uint8_t *pImage_data = (uint8_t*)malloc(static_cast<size_t>(dst_bpl) * static_cast<size_t>(image_height));  // sized in size_t, not int
+    if (!pImage_data) return nullptr;
+
+    for (int y = 0; y < image_height; y++) {
+        const uint8_t* pScan_line;
+        uint32_t scan_line_len;
+        if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS) {
+            free(pImage_data);
+            return nullptr;
+        }
+
+        uint8_t *pDst = pImage_data + y * dst_bpl;
+        // The decoder emits 4 bytes per pixel (R,G,B,255) for color and 1 byte per pixel for grayscale.
+        // Color source, 4 bytes requested: swap R and B so the result is BGRA.
+        if ((req_comps == 4) && (decoder.get_num_components() == 3)) {
+            for (int x = 0; x < image_width; x++) {
+                pDst[0] = pScan_line[x*4+2];
+                pDst[1] = pScan_line[x*4+1];
+                pDst[2] = pScan_line[x*4+0];
+                pDst[3] = 255;
+                pDst += 4;
+            }
+        } else if ((req_comps == 1) && (decoder.get_num_components() == 1)) {  // gray to gray: straight copy (4-from-3 is handled above)
+            memcpy(pDst, pScan_line, dst_bpl);
+        } else if (decoder.get_num_components() == 1) {
+            if (req_comps == 3) {
+                for (int x = 0; x < image_width; x++) {
+                    uint8_t luma = pScan_line[x];
+                    pDst[0] = luma;
+                    pDst[1] = luma;
+                    pDst[2] = luma;
+                    pDst += 3;
+                }
+            } else {
+                for (int x = 0; x < image_width; x++) {
+                    uint8_t luma = pScan_line[x];
+                    pDst[0] = luma;
+                    pDst[1] = luma;
+                    pDst[2] = luma;
+                    pDst[3] = 255;
+                    pDst += 4;
+                }
+            }
+        } else if (decoder.get_num_components() == 3) {
+            if (req_comps == 1) {
+                const int YR = 19595, YG = 38470, YB = 7471;  // 16.16 fixed-point luma weights; they sum to 65536
+                for (int x = 0; x < image_width; x++) {
+                    int r = pScan_line[x*4+0];
+                    int g = pScan_line[x*4+1];
+                    int b = pScan_line[x*4+2];
+                    *pDst++ = static_cast<uint8_t>((r * YR + g * YG + b * YB + 32768) >> 16);
+                }
+            } else {
+                for (int x = 0; x < image_width; x++) {
+                    pDst[0] = pScan_line[x*4+0];
+                    pDst[1] = pScan_line[x*4+1];
+                    pDst[2] = pScan_line[x*4+2];
+                    pDst += 3;
+                }
+            }
+        }
+    }
+    return pImage_data;  // the caller releases it with free()
+}
+
+
+/************************************************************************/
+/* External Class Implementation                                        */
+/************************************************************************/
+// Decodes a JPEG held in memory: wraps pSrc_data in a jpeg_decoder_mem_stream and forwards to the stream variant.
+unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps)
+{
+    jpeg_decoder_mem_stream mem_stream(pSrc_data, src_data_size);
+    return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps);
+}
+
+// Decodes a JPEG file: returns nullptr if the file cannot be opened, otherwise forwards to the stream variant.
+unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps)
+{
+    jpeg_decoder_file_stream file_stream;
+    if (!file_stream.open(pSrc_filename)) return nullptr;
+    return decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps);
+}
\ No newline at end of file
diff --git a/src/loaders/jpg/tvgJpgd.h b/src/loaders/jpg/tvgJpgd.h
new file mode 100644
index 0000000..61c6969
--- /dev/null
+++ b/src/loaders/jpg/tvgJpgd.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All rights reserved.
+
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// jpgd.h - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+#ifndef _TVG_JPGD_H_
+#define _TVG_JPGD_H_
+
+// Loads a JPEG image from a memory buffer or a file.
+// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
+// On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
+// Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function or the jpeg_decoder class in the decoder's implementation file; neither is declared in this header.
+// Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
+unsigned char *decompress_jpeg_image_from_memory(const unsigned char *pSrc_data, int src_data_size, int *width, int *height, int *actual_comps, int req_comps);
+unsigned char *decompress_jpeg_image_from_file(const char *pSrc_filename, int *width, int *height, int *actual_comps, int req_comps);
+
+#endif //_TVG_JPGD_H_
-- 
2.7.4


From 2f789d0ca3b5c23b437e55896214b482d28b95e5 Mon Sep 17 00:00:00 2001
From: Mira Grudzinska <m.grudzinska@samsung.com>
Date: Sat, 23 Oct 2021 21:31:02 +0200
Subject: [PATCH 15/16] sw_engine: rasterization region edited in the case of
 fast tracking

Since no antialiasing is applied in the fast-track case, the shape's
rasterization region needs to be adjusted. To establish the rasterized
bbox, the corner coordinates are rounded before they are cast to
SwCoord.
---
 src/lib/sw_engine/tvgSwShape.cpp | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/lib/sw_engine/tvgSwShape.cpp b/src/lib/sw_engine/tvgSwShape.cpp
index 08cd694..0d93b55 100644
--- a/src/lib/sw_engine/tvgSwShape.cpp
+++ b/src/lib/sw_engine/tvgSwShape.cpp
@@ -22,6 +22,7 @@
 #include "tvgSwCommon.h"
 #include "tvgBezier.h"
 #include <float.h>
+#include <math.h>
 
 /************************************************************************/
 /* Internal Class Implementation                                        */
@@ -508,8 +509,33 @@ bool shapeGenRle(SwShape* shape, TVG_UNUSED const Shape* sdata, bool antiAlias,
     //if (shape.outline->opened) return true;
 
     //Case A: Fast Track Rectangle Drawing
-    if (!hasComposite && (shape->rect = _fastTrack(shape->outline))) return true;
-    //Case B: Normale Shape RLE Drawing
+    if (!hasComposite && (shape->rect = _fastTrack(shape->outline))) {
+        //Since no antialiasing is applied in the Fast Track case,
+        //the rasterization region has to be modified
+        auto corner1 = shape->outline->pts;
+        auto corner3 = shape->outline->pts + 2;
+
+        auto xMin = corner1->x;
+        auto xMax = corner3->x;
+        if (xMin > xMax) {
+            xMax = xMin;
+            xMin = corner3->x;
+        }
+        auto yMin = corner1->y;
+        auto yMax = corner3->y;
+        if (yMin > yMax) {
+            yMax = yMin;
+            yMin = corner3->y;
+        }
+
+        shape->bbox.min.x = static_cast<SwCoord>(round(xMin / 64.0f));
+        shape->bbox.max.x = static_cast<SwCoord>(round(xMax / 64.0f));
+        shape->bbox.min.y = static_cast<SwCoord>(round(yMin / 64.0f));
+        shape->bbox.max.y = static_cast<SwCoord>(round(yMax / 64.0f));
+
+        return true;
+    }
+    //Case B: Normal Shape RLE Drawing
     if ((shape->rle = rleRender(shape->rle, shape->outline, shape->bbox, antiAlias))) return true;
 
     return false;
-- 
2.7.4


From 47e3848ae8b42de4f46c3725c14d0da2d8149a60 Mon Sep 17 00:00:00 2001
From: Hermet Park <chuneon.park@samsung.com>
Date: Thu, 28 Oct 2021 15:15:16 +0900
Subject: [PATCH 16/16] tizen_only: Leave the redundant capis until efl remove
 them.

Change-Id: Ib0f76263cc233408022495056a6e9bd9e3df412b
---
 src/bindings/capi/thorvg_capi.h |  8 ++++++--
 src/bindings/capi/tvgCapi.cpp   | 16 +++++++++++++++-
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/src/bindings/capi/thorvg_capi.h b/src/bindings/capi/thorvg_capi.h
index b9f2667..0aa201f 100644
--- a/src/bindings/capi/thorvg_capi.h
+++ b/src/bindings/capi/thorvg_capi.h
@@ -855,8 +855,8 @@ TVG_EXPORT Tvg_Paint* tvg_paint_duplicate(Tvg_Paint* paint);
 *
 * \note The bounding box doesn't indicate the actual drawing region. It's the smallest rectangle that encloses the object.
 */
-TVG_EXPORT Tvg_Result tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h, bool transformed);
-
+//TIZEN_ONLY: RENAME TO tvg_paint_get_bounds() AFTER DUPLICATED API IS REMOVED
+TVG_EXPORT Tvg_Result _tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h, bool transformed);
 
 /*!
 * \brief Sets the composition target object and the composition method.
@@ -2022,6 +2022,10 @@ TVG_EXPORT Tvg_Result tvg_saver_del(Tvg_Saver* saver);
 /** \} */   // end defgroup ThorVGCapi_Saver
 
 
+//TIZEN_ONLY: MUST BE REMOVED AFTER RESOLVING EFL USAGE
+TVG_EXPORT Tvg_Result tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h);
+TVG_EXPORT Tvg_Result tvg_paint_transform(Tvg_Paint* paint, const Tvg_Matrix* m);
+
 /** \} */   // end defgroup ThorVG_CAPI
 
 
diff --git a/src/bindings/capi/tvgCapi.cpp b/src/bindings/capi/tvgCapi.cpp
index 7c2fd12..e707e43 100644
--- a/src/bindings/capi/tvgCapi.cpp
+++ b/src/bindings/capi/tvgCapi.cpp
@@ -169,6 +169,12 @@ TVG_EXPORT Tvg_Result tvg_paint_set_transform(Tvg_Paint* paint, const Tvg_Matrix
 }
 
 
+//TIZEN_ONLY: MUST BE REMOVED AFTER RESOLVING EFL USAGE
+TVG_EXPORT Tvg_Result tvg_paint_transform(Tvg_Paint* paint, const Tvg_Matrix* m)
+{
+    return tvg_paint_set_transform(paint, m);
+}
+
 TVG_EXPORT Tvg_Result tvg_paint_get_transform(Tvg_Paint* paint, Tvg_Matrix* m)
 {
     if (!paint || !m) return TVG_RESULT_INVALID_ARGUMENT;
@@ -199,13 +205,21 @@ TVG_EXPORT Tvg_Result tvg_paint_get_opacity(const Tvg_Paint* paint, uint8_t* opa
 }
 
 
-TVG_EXPORT Tvg_Result tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h, bool transformed)
+//TIZEN_ONLY: RENAME TO tvg_paint_get_bounds() AFTER DUPLICATED API IS REMOVED
+TVG_EXPORT Tvg_Result _tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h, bool transformed)
 {
    if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
    return (Tvg_Result) reinterpret_cast<const Paint*>(paint)->bounds(x, y, w, h, transformed);
 }
 
 
+//TIZEN_ONLY: MUST BE REMOVED AFTER RESOLVING EFL USAGE
+TVG_EXPORT Tvg_Result tvg_paint_get_bounds(const Tvg_Paint* paint, float* x, float* y, float* w, float* h)
+{
+   return _tvg_paint_get_bounds(paint, x, y, w, h, false);
+}
+
+
 TVG_EXPORT Tvg_Result tvg_paint_set_composite_method(Tvg_Paint* paint, Tvg_Paint* target, Tvg_Composite_Method method)
 {
    if (!paint) return TVG_RESULT_INVALID_ARGUMENT;
-- 
2.7.4