From 1bd2aeb95e97cded73f308ec47daaf845eb2ab07 Mon Sep 17 00:00:00 2001
From: Pawel Wasowski <p.wasowski2@partner.samsung.com>
Date: Thu, 18 May 2017 11:50:31 +0200
Subject: [PATCH] [Filesystem] Add validation of decoded UTF-8 characters

The _utf8_decode function used to throw a RangeError exception when an
invalid UTF-8 sequence was converted to a Unicode code point.

Byte sequences that are invalid in terms of UTF-8 are now substituted
with the Unicode replacement character (U+FFFD).

[Verification] TCT tct-filesystem-tizen-tests and tct-file-cordova-tests
               pass rate on a Z400 mobile device is 100%.
               Decoding was tested manually against numerous problematic
               byte sequences.

Change-Id: If8aefd3434a1b96ead11e36a1db1ddee4f2c3904
Signed-off-by: Pawel Wasowski <p.wasowski2@partner.samsung.com>
(cherry picked from commit 3c44b8fe425f22c41bea2c54ce04de60e29135e4)
---
 src/filesystem/js/base64.js | 76 +++++++++++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 11 deletions(-)

diff --git a/src/filesystem/js/base64.js b/src/filesystem/js/base64.js
index edd45133..4074d48e 100755
--- a/src/filesystem/js/base64.js
+++ b/src/filesystem/js/base64.js
@@ -126,35 +126,89 @@ var Base64 = {
 
     return utfarray;
   },
+
+    /*
+     * This function validates the decoded characters. Invalid UTF-8 byte sequences are substituted with
+     * the Unicode replacement character (U+FFFD).
+     *
+     * The validation cases used are described in http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+     * by Markus Kuhn, distributed under the CC-BY 4.0 license (https://creativecommons.org/licenses/by/4.0/legalcode).
+     */
+
   _utf8_decode: function(utfarray) {
     var str = '';
-    var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0;
+    var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, charCode = 0;
+    var INVALID_CHARACTER = String.fromCharCode(0xFFFD);
 
     while (i < utfarray.length) {
-
       c = utfarray[i];
 
       if (c < 128) {
         str += String.fromCharCode(c);
         i++;
-      }
-      else if ((c >= 192) && (c < 224)) {
+      } else if ((c >= 194) && (c < 224) && (utfarray[i + 1] & 0x80)) {
         c1 = utfarray[i + 1];
-        str += String.fromCharCode(((c & 31) << 6) | (c1 & 63));
+        charCode = ((c & 31) << 6) | (c1 & 63);
+          /*
+           * The condition below is true if the code point could be encoded in fewer than 2 bytes.
+           * Such a byte sequence is invalid in terms of UTF-8.
+           * This and similar, longer sequences will be referred to as "overlong sequences".
+           */
+        if (!(charCode & 0xFF80)) {
+            str += INVALID_CHARACTER;
+        } else {
+            str += String.fromCharCode(charCode);
+        }
+
         i += 2;
-      }
-      else if((c >= 224) && (c < 240)) {
+      } else if ((c >= 224) && (c < 240) && (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80)) {
         c1 = utfarray[i + 1];
         c2 = utfarray[i + 2];
-        str += String.fromCharCode(((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63));
+        charCode = ((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63);
+
+        if (!(charCode & 0xF800) //overlong sequence test
+          /*
+           * The test below checks whether the character is a UTF-16 surrogate half;
+           * UTF-16 surrogate halves are invalid Unicode code points.
+           */
+        || (0xD800 <= charCode && charCode <=0xDFFF)) {
+          str += INVALID_CHARACTER;
+        } else {
+          str += String.fromCharCode(charCode);
+        }
+
         i += 3;
-      }
-      else {//support 4 bytes characters e.g. Emojis
+      } else if ((c >= 240) && (c < 245) & (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80) && (utfarray[i + 3] & 0x80)) {
         c1 = utfarray[i + 1];
         c2 = utfarray[i + 2];
         c3 = utfarray[i + 3];
-        str += String.fromCodePoint(((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+        charCode = ((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63);
+
+        if (!(charCode & 0x1F0000)) { //overlong sequence test
+          str += INVALID_CHARACTER;
+        } else {
+          str += String.fromCharCode(charCode);
+        }
         i += 4;
+         /*
+          * Below condition is true if a continuation byte appeared without a proper leading byte
+          */
+      } else if ((c & 0x80) && (~c & 0x40)) {
+        str += INVALID_CHARACTER;
+        i++;
+      } else {
+        /*
+         * One or more continuation bytes are missing
+         * OR 'c' is a prohibited byte in terms of UTF-8 standard.
+         */
+        str += INVALID_CHARACTER;
+
+        /*
+         * All following continuation bytes are skipped.
+         */
+        do {
+          i++;
+        } while((utfarray[i] & 0x80) && (~utfarray[i] & 0x40));
       }
     }
 
-- 
2.34.1