return utfarray;
},
+
+ /*
+ * This function validates the characters it reads. Non-standard UTF-8 sequences are replaced with
+ * the Unicode replacement character (U+FFFD).
+ *
+ * The validation cases used are described in http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+ * by Markus Kuhn, distributed under the CC-BY 4.0 license (https://creativecommons.org/licenses/by/4.0/legalcode).
+ */
+
_utf8_decode: function(utfarray) {
var str = '';
- var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0;
+ var i = 0, c = 0, c1 = 0, c2 = 0, c3 = 0, charCode = 0;
+ var INVALID_CHARACTER = String.fromCharCode(0xFFFD);
while (i < utfarray.length) {
-
c = utfarray[i];
if (c < 128) {
str += String.fromCharCode(c);
i++;
- }
- else if ((c >= 192) && (c < 224)) {
+ } else if ((c >= 194) && (c < 224) && (utfarray[i + 1] & 0x80)) {
c1 = utfarray[i + 1];
- str += String.fromCharCode(((c & 31) << 6) | (c1 & 63));
+ charCode = ((c & 31) << 6) | (c1 & 63);
+ /*
+ * The condition below is true if the sequence could be encoded in fewer than 2 bytes.
+ * Such a byte series is invalid in terms of UTF-8.
+ * This and similar, longer sequences will be referred to as "overlong sequences".
+ */
+ if (!(charCode & 0xFF80)) {
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
+
i += 2;
- }
- else if((c >= 224) && (c < 240)) {
+ } else if ((c >= 224) && (c < 240) && (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80)) {
c1 = utfarray[i + 1];
c2 = utfarray[i + 2];
- str += String.fromCharCode(((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63));
+ charCode = ((c & 15) << 12) | ((c1 & 63) << 6) | (c2 & 63);
+
+ if (!(charCode & 0xF800) //overlong sequence test
+ /*
+ * The test below checks whether the character is a UTF-16 surrogate half;
+ * UTF-16 surrogate halves are invalid Unicode code points.
+ */
+ || (0xD800 <= charCode && charCode <=0xDFFF)) {
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
+
i += 3;
- }
- else {//support 4 bytes characters e.g. Emojis
+ } else if ((c >= 240) && (c < 245) & (utfarray[i + 1] & 0x80) && (utfarray[i + 2] & 0x80) && (utfarray[i + 3] & 0x80)) {
c1 = utfarray[i + 1];
c2 = utfarray[i + 2];
c3 = utfarray[i + 3];
- str += String.fromCodePoint(((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63));
+ charCode = ((c & 7) << 18) | ((c1 & 63) << 12) | ((c2 & 63) << 6) | (c3 & 63);
+
+ if (!(charCode & 0x1F0000)) { //overlong sequence test
+ str += INVALID_CHARACTER;
+ } else {
+ str += String.fromCharCode(charCode);
+ }
i += 4;
+ /*
+ * The condition below is true if a continuation byte appears without a proper leading byte.
+ */
+ } else if ((c & 0x80) && (~c & 0x40)) {
+ str += INVALID_CHARACTER;
+ i++;
+ } else {
+ /*
+ * One or more continuation bytes are missing
+ * OR 'c' is a prohibited byte in terms of UTF-8 standard.
+ */
+ str += INVALID_CHARACTER;
+
+ /*
+ * All following continuation bytes are skipped.
+ */
+ do {
+ i++;
+ } while((utfarray[i] & 0x80) && (~utfarray[i] & 0x40));
}
}