efi/libstub: Add UTF-8 decoding to efi_puts
author Arvind Sankar <nivedita@alum.mit.edu>
Mon, 18 May 2020 19:07:13 +0000 (15:07 -0400)
committer Ard Biesheuvel <ardb@kernel.org>
Wed, 20 May 2020 17:09:20 +0000 (19:09 +0200)
In order to be able to use the UTF-16 support added to vsprintf in the
previous commit, enhance efi_puts to decode UTF-8 into UTF-16. Invalid
UTF-8 encodings are passed through unchanged.

Signed-off-by: Arvind Sankar <nivedita@alum.mit.edu>
Link: https://lore.kernel.org/r/20200518190716.751506-22-nivedita@alum.mit.edu
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
drivers/firmware/efi/libstub/efi-stub-helper.c

index f338d14..0d00073 100644 (file)
@@ -36,17 +36,74 @@ void efi_char16_puts(efi_char16_t *str)
                       output_string, str);
 }
 
+static
+u32 utf8_to_utf32(const u8 **s8)
+{
+       u32 c32;
+       u8 c0, cx;
+       size_t clen, i;
+
+       c0 = cx = *(*s8)++;
+       /*
+        * Decode the UTF-8 sequence whose first octet was just consumed.
+        * On success the code point is returned and *s8 is moved past the
+        * trailing octets as well; on any failure below only the first
+        * octet stays consumed and it is returned as-is.
+        *
+        * The position of the most-significant 0 bit gives us the length of
+        * a multi-octet encoding. cx is a u8, so bits shifted out are
+        * dropped and the loop ends after at most 8 steps (for 0xff).
+        */
+       for (clen = 0; cx & 0x80; ++clen)
+               cx <<= 1;
+       /*
+        * If the 0 bit is in position 8, this is a valid single-octet
+        * encoding. If the 0 bit is in position 7 or positions 1-3, the
+        * encoding is invalid: position 7 gives clen == 1, the 10xxxxxx
+        * form reserved for trailing octets, and positions 1-3 give
+        * clen of 5-7. An octet with no 0 bit at all (0xff) gives
+        * clen == 8 and is rejected by the same test.
+        * In either case, we just return the first octet.
+        */
+       if (clen < 2 || clen > 4)
+               return c0;
+       /*
+        * Get the bits from the first octet. The left shifts above moved
+        * the payload up by clen bits, so shift it back down. The
+        * post-decrement leaves clen as the number of trailing octets.
+        */
+       c32 = cx >> clen--;
+       for (i = 0; i < clen; ++i) {
+               /*
+                * Trailing octets must have 10 in most significant bits.
+                * Flipping the top bit turns a valid 10xxxxxx into
+                * 00xxxxxx, so any bit left in the top two marks a bad
+                * octet. A NUL octet fails this test too, so the scan
+                * stops at a string terminator. *s8 is only indexed
+                * here, not advanced.
+                */
+               cx = (*s8)[i] ^ 0x80;
+               if (cx & 0xc0)
+                       return c0;
+               c32 = (c32 << 6) | cx;
+       }
+       /*
+        * Check for validity:
+        * - The character must be in the Unicode range.
+        * - It must not be a surrogate.
+        * - It must be encoded using the correct number of octets.
+        *   The sum of the three comparisons is the number of trailing
+        *   octets in the shortest encoding of c32, so overlong forms
+        *   are rejected.
+        * Only once every check has passed are the trailing octets
+        * consumed.
+        */
+       if (c32 > 0x10ffff ||
+           (c32 & 0xf800) == 0xd800 ||
+           clen != (c32 >= 0x80) + (c32 >= 0x800) + (c32 >= 0x10000))
+               return c0;
+       *s8 += clen;
+       return c32;
+}
+
 void efi_puts(const char *str)
 {
        efi_char16_t buf[128];
        size_t pos = 0, lim = ARRAY_SIZE(buf);
+       const u8 *s8 = (const u8 *)str;
+       u32 c32;
 
-       while (*str) {
-               if (*str == '\n')
+       while (*s8) {
+               if (*s8 == '\n')
                        buf[pos++] = L'\r';
-               /* Cast to unsigned char to avoid sign-extension */
-               buf[pos++] = (unsigned char)(*str++);
-               if (*str == '\0' || pos >= lim - 2) {
+               c32 = utf8_to_utf32(&s8);
+               if (c32 < 0x10000) {
+                       /* Characters in plane 0 use a single word. */
+                       buf[pos++] = c32;
+               } else {
+                       /*
+                        * Characters in other planes encode into a surrogate
+                        * pair.
+                        */
+                       buf[pos++] = (0xd800 - (0x10000 >> 10)) + (c32 >> 10);
+                       buf[pos++] = 0xdc00 + (c32 & 0x3ff);
+               }
+               if (*s8 == '\0' || pos >= lim - 2) {
                        buf[pos] = L'\0';
                        efi_char16_puts(buf);
                        pos = 0;