[Filesystem] Add validation of decoded UTF-8 characters

author Pawel Wasowski <p.wasowski2@partner.samsung.com>

Thu, 18 May 2017 09:50:31 +0000 (11:50 +0200)

committer Pawel Wasowski <p.wasowski2@partner.samsung.com>

Thu, 18 May 2017 15:06:09 +0000 (17:06 +0200)
author Pawel Wasowski <p.wasowski2@partner.samsung.com>
Thu, 18 May 2017 09:50:31 +0000 (11:50 +0200)
committer Pawel Wasowski <p.wasowski2@partner.samsung.com>
Thu, 18 May 2017 15:06:09 +0000 (17:06 +0200)
diff --git a/src/filesystem/js/base64.js b/src/filesystem/js/base64.js

index edd4513316535ebf855006cf4eaf5bc24bec9e9f..4074d48eb9fb3d53f97fbbf08ed339b5762f5fd7 100755 (executable)
--- a/src/filesystem/js/base64.js
+++ b/src/filesystem/js/base64.js
@@ -126,35 +126,89 @@ var Base64 = {
  
      return utfarray;
    },
+
+    /*
+     * This function validates the characters read. Invalid UTF-8 sequences are substituted with
+     * the replacement character (U+FFFD).
+     *
+     * Used validation check cases are described in http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+     * by Markus Kuhn, distributed under CC-BY 4.0 license (https://creativecommons.org/licenses/by/4.0/legalcode).
+     */
+
    _utf8_decode: function(utfarray) {
      var str = '';
-    var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0;
+    var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, charCode = 0;
+    var INVALID_CHARACTER = String.fromCharCode(0xFFFD);
  
      while (i < utfarray.length) {
-
        c = utfarray[i];
  
        if (c < 128) {
          str += String.fromCharCode(c);
          i++;
-      }
-      else if ((c >= 192) && (c < 224)) {
+      } else if ((c >= 194) && (c < 224) && (utfarray[i + 1] & 0x80)) {
          c1 = utfarray[i + 1];
-        str += String.fromCharCode(((c & 31) << 6) | (c1 & 63));
+        charCode = ((c & 31) << 6) | (c1 & 63);
+          /*
+           * The condition below is true if the sequence could be encoded in fewer than 2 bytes.
+           * Such a byte series is invalid in terms of UTF-8.
+           * This and similar, longer, sequences will be referred to as "overlong sequences".
+           */
+        if (!(charCode & 0xFF80)) {
+            str += INVALID_CHARACTER;
+        } else {
+            str += String.fromCharCode(charCode);
+        }
+
          i += 2;
-      }
-      else if((c >= 224) && (c < 240)) {
+      } else if ((c >= 224) && (c < 240) && (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80)) {
          c1 = utfarray[i + 1];
          c2 = utfarray[i + 2];
-        str += String.fromCharCode(((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63));
+        charCode = ((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63);
+
+        if (!(charCode & 0xF800) //overlong sequence test
+          /*
+           * The test below checks whether the character is a UTF-16 surrogate half;
+           * UTF-16 surrogate halves are invalid Unicode code points.
+           */
+        || (0xD800 <= charCode && charCode <=0xDFFF)) {
+          str += INVALID_CHARACTER;
+        } else {
+          str += String.fromCharCode(charCode);
+        }
+
          i += 3;
-      }
-      else {//support 4 bytes characters e.g. Emojis
+      } else if ((c >= 240) && (c < 245) & (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80) && (utfarray[i + 3] & 0x80)) {
          c1 = utfarray[i + 1];
          c2 = utfarray[i + 2];
          c3 = utfarray[i + 3];
-        str += String.fromCodePoint(((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+        charCode = ((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63);
+
+        if (!(charCode & 0x1F0000)) { //overlong sequence test
+          str += INVALID_CHARACTER;
+        } else {
+          str += String.fromCharCode(charCode);
+        }
          i += 4;
+         /*
+          * The condition below is true if a continuation byte appeared without a proper leading byte
+          */
+      } else if ((c & 0x80) && (~c & 0x40)) {
+        str += INVALID_CHARACTER;
+        i++;
+      } else {
+        /*
+         * One or more continuation bytes are missing
+         * OR 'c' is a prohibited byte in terms of UTF-8 standard.
+         */
+        str += INVALID_CHARACTER;
+
+        /*
+         * All following continuation bytes are skipped.
+         */
+        do {
+          i++;
+        } while((utfarray[i] & 0x80) && (~utfarray[i] & 0x40));
        }
      }
author	Pawel Wasowski <p.wasowski2@partner.samsung.com>
	Thu, 18 May 2017 09:50:31 +0000 (11:50 +0200)
committer	Pawel Wasowski <p.wasowski2@partner.samsung.com>
	Thu, 18 May 2017 15:06:09 +0000 (17:06 +0200)