From: Pawel Wasowski Date: Thu, 18 May 2017 09:50:31 +0000 (+0200) Subject: [Filesystem] Add validation of decoded UTF-8 characters X-Git-Tag: submit/tizen/20170522.103041~1 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=3c44b8fe425f22c41bea2c54ce04de60e29135e4;p=platform%2Fcore%2Fapi%2Fwebapi-plugins.git [Filesystem] Add validation of decoded UTF-8 characters _utf8_decode function used to throw a RangeError exception when an invalid UTF-8 sequence was converted to a Unicode code point. Byte sequences that are invalid in terms of UTF-8 are now substituted with the Unicode replacement character (U+FFFD). [Verification] TCT tct-filesystem-tizen-tests and tct-file-cordova-tests pass rate on a Z400 mobile device is 100%. Decoding was tested manually against numerous problematic byte sequences. Change-Id: If8aefd3434a1b96ead11e36a1db1ddee4f2c3904 Signed-off-by: Pawel Wasowski --- diff --git a/src/filesystem/js/base64.js b/src/filesystem/js/base64.js index edd45133..4074d48e 100755 --- a/src/filesystem/js/base64.js +++ b/src/filesystem/js/base64.js @@ -126,35 +126,89 @@ var Base64 = { return utfarray; }, + + /* + * This function validates the decoded characters. Invalid UTF-8 sequences are substituted with + * a replacement symbol. + * + * The validation test cases used are described in http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt, + * by Markus Kuhn, distributed under the CC-BY 4.0 license (https://creativecommons.org/licenses/by/4.0/legalcode). 
+ */ + _utf8_decode: function(utfarray) { var str = ''; - var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0; + var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, charCode = 0; + var INVALID_CHARACTER = String.fromCharCode(0xFFFD); while (i < utfarray.length) { - c = utfarray[i]; if (c < 128) { str += String.fromCharCode(c); i++; - } - else if ((c >= 192) && (c < 224)) { + } else if ((c >= 194) && (c < 224) && (utfarray[i + 1] & 0x80)) { c1 = utfarray[i + 1]; - str += String.fromCharCode(((c & 31) << 6) | (c1 & 63)); + charCode = ((c & 31) << 6) | (c1 & 63); + /* + * Below condition is true, if the sequence could be encoded in less than 2 bytes. + * Such a byte series is invalid in terms of UTF-8. + * This and similar, longer, sequences will be refered to as "overlong sequence". + */ + if (!(charCode & 0xFF80)) { + str += INVALID_CHARACTER; + } else { + str += String.fromCharCode(charCode); + } + i += 2; - } - else if((c >= 224) && (c < 240)) { + } else if ((c >= 224) && (c < 240) && (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80)) { c1 = utfarray[i + 1]; c2 = utfarray[i + 2]; - str += String.fromCharCode(((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63)); + charCode = ((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63); + + if (!(charCode & 0xF800) //overlong sequence test + /* + * Below test checks, if the character is an UTF-16 surrogate halve, + * UTF-16 surrogate halves are invalid Unicode codepoints. + */ + || (0xD800 <= charCode && charCode <=0xDFFF)) { + str += INVALID_CHARACTER; + } else { + str += String.fromCharCode(charCode); + } + i += 3; - } - else {//support 4 bytes characters e.g. 
Emojis + } else if ((c >= 240) && (c < 245) & (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80) && (utfarray[i + 3] & 0x80)) { c1 = utfarray[i + 1]; c2 = utfarray[i + 2]; c3 = utfarray[i + 3]; - str += String.fromCodePoint(((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63)); + charCode = ((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63); + + if (!(charCode & 0x1F0000)) { //overlong sequence test + str += INVALID_CHARACTER; + } else { + str += String.fromCharCode(charCode); + } i += 4; + /* + * Below condition is true if a continuation byte appeared without a proper leading byte + */ + } else if ((c & 0x80) && (~c & 0x40)) { + str += INVALID_CHARACTER; + i++; + } else { + /* + * One or more continuation bytes are missing + * OR 'c' is a prohibited byte in terms of UTF-8 standard. + */ + str += INVALID_CHARACTER; + + /* + * All following continuation bytes are skipped. + */ + do { + i++; + } while((utfarray[i] & 0x80) && (~utfarray[i] & 0x40)); } }