From 1bd2aeb95e97cded73f308ec47daaf845eb2ab07 Mon Sep 17 00:00:00 2001
From: Pawel Wasowski
Date: Thu, 18 May 2017 11:50:31 +0200
Subject: [PATCH] [Filesystem] Add validation of decoded UTF-8 characters
The _utf8_decode function used to throw a RangeError exception when an
invalid UTF-8 sequence was converted to a Unicode code point.
Byte sequences that are invalid in terms of UTF-8 are now substituted
with a Unicode replacement character.
[Verification] TCT tct-filesystem-tizen-tests and tct-file-cordova-tests
pass rate on a Z400 mobile device is 100%.
Decoding was tested manually against numerous problematic
byte sequences.
Change-Id: If8aefd3434a1b96ead11e36a1db1ddee4f2c3904
Signed-off-by: Pawel Wasowski
(cherry picked from commit 3c44b8fe425f22c41bea2c54ce04de60e29135e4)
---
src/filesystem/js/base64.js | 76 +++++++++++++++++++++++++++++++------
1 file changed, 65 insertions(+), 11 deletions(-)
diff --git a/src/filesystem/js/base64.js b/src/filesystem/js/base64.js
index edd45133..4074d48e 100755
--- a/src/filesystem/js/base64.js
+++ b/src/filesystem/js/base64.js
@@ -126,35 +126,89 @@ var Base64 = {
return utfarray;
},
+
+ /*
+ * This function validates the decoded characters. Byte sequences that are not valid UTF-8 are
+ * substituted with the Unicode replacement character (U+FFFD).
+ *
+ * The validation cases used are described in http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+ * by Markus Kuhn, distributed under CC-BY 4.0 license (https://creativecommons.org/licenses/by/4.0/legalcode).
+ */
+
_utf8_decode: function(utfarray) {
var str = '';
- var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0;
+ var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, charCode = 0;
+ var INVALID_CHARACTER = String.fromCharCode(0xFFFD);
while (i < utfarray.length) {
-
c = utfarray[i];
if (c < 128) {
str += String.fromCharCode(c);
i++;
- }
- else if ((c >= 192) && (c < 224)) {
+ } else if ((c >= 194) && (c < 224) && (utfarray[i + 1] & 0x80)) {
c1 = utfarray[i + 1];
- str += String.fromCharCode(((c & 31) << 6) | (c1 & 63));
+ charCode = ((c & 31) << 6) | (c1 & 63);
+ /*
+ * The condition below is true if the code point could be encoded in fewer than 2 bytes.
+ * Such a byte series is invalid in terms of UTF-8.
+ * This and similar, longer sequences will be referred to as "overlong sequences".
+ */
+ if (!(charCode & 0xFF80)) {
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
+
i += 2;
- }
- else if((c >= 224) && (c < 240)) {
+ } else if ((c >= 224) && (c < 240) && (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80)) {
c1 = utfarray[i + 1];
c2 = utfarray[i + 2];
- str += String.fromCharCode(((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63));
+ charCode = ((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63);
+
+ if (!(charCode & 0xF800) //overlong sequence test
+ /*
+ * The test below checks whether the character is a UTF-16 surrogate half;
+ * UTF-16 surrogate halves are not valid Unicode scalar values.
+ */
+ || (0xD800 <= charCode && charCode <=0xDFFF)) {
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
+
i += 3;
- }
- else {//support 4 bytes characters e.g. Emojis
+ } else if ((c >= 240) && (c < 245) & (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80) && (utfarray[i + 3] & 0x80)) {
c1 = utfarray[i + 1];
c2 = utfarray[i + 2];
c3 = utfarray[i + 3];
- str += String.fromCodePoint(((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ charCode = ((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63);
+
+ if (!(charCode & 0x1F0000)) { //overlong sequence test
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
i += 4;
+ /*
+ * The condition below is true if a continuation byte appears without a proper leading byte.
+ */
+ } else if ((c & 0x80) && (~c & 0x40)) {
+ str += INVALID_CHARACTER;
+ i++;
+ } else {
+ /*
+ * One or more continuation bytes are missing
+ * OR 'c' is a prohibited byte in terms of UTF-8 standard.
+ */
+ str += INVALID_CHARACTER;
+
+ /*
+ * All following continuation bytes are skipped.
+ */
+ do {
+ i++;
+ } while((utfarray[i] & 0x80) && (~utfarray[i] & 0x40));
}
}
--
2.34.1