Use utf conversions from minipal (#89036)
author Lakshan Fernando <lakshanf@hotmail.com>
Fri, 21 Jul 2023 18:46:14 +0000 (11:46 -0700)
committer GitHub <noreply@github.com>
Fri, 21 Jul 2023 18:46:14 +0000 (11:46 -0700)
* Support for utf conversion

* cast fix

* FB

* FB

* Update src/coreclr/nativeaot/Runtime/eventpipe/ep-rt-aot.h

Co-authored-by: Jan Kotas <jkotas@microsoft.com>
* Apply suggestions from code review

Co-authored-by: Jan Kotas <jkotas@microsoft.com>
* FB

* FB

* FB

---------

Co-authored-by: Jan Kotas <jkotas@microsoft.com>
src/coreclr/nativeaot/Runtime/eventpipe/CMakeLists.txt
src/coreclr/nativeaot/Runtime/eventpipe/ep-rt-aot.h
src/native/minipal/utf8.c

index a6bedd8..3c61f34 100644 (file)
@@ -9,15 +9,21 @@ set(AOT_EVENTPIPE_SHIM_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 
 set (CONTAINER_SOURCES "")
 set (CONTAINER_HEADERS "")
+set (MINIPAL_SOURCES "")
 set (EVENTPIPE_SOURCES "")
 set (EVENTPIPE_HEADERS "")
 set (GEN_EVENTPIPE_SOURCES "")
 
 set (SHARED_CONTAINERS_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/containers")
 set (SHARED_EVENTPIPE_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/eventpipe")
+set (SHARED_MINIPAL_SOURCE_PATH "${CLR_SRC_NATIVE_DIR}/minipal")
 include (${SHARED_EVENTPIPE_SOURCE_PATH}/eventpipe.cmake)
 include (${SHARED_CONTAINERS_SOURCE_PATH}/containers.cmake)
 
+list(APPEND MINIPAL_SOURCES
+  utf8.c
+)
+
 if(CLR_CMAKE_HOST_WIN32)
   list(APPEND SHARED_DIAGNOSTIC_SERVER_SOURCES
       ds-ipc-pal-namedpipe.c
@@ -50,6 +56,7 @@ list(APPEND EVENTPIPE_HEADERS
 
 addprefix(CONTAINER_SOURCES ${SHARED_CONTAINERS_SOURCE_PATH} "${SHARED_CONTAINER_SOURCES}")
 addprefix(CONTAINER_HEADERS ${SHARED_CONTAINERS_SOURCE_PATH} "${SHARED_CONTAINER_HEADERS}")
+addprefix(MINIPAL_SOURCES ${SHARED_MINIPAL_SOURCE_PATH} "${MINIPAL_SOURCES}")
 
 addprefix(EVENTPIPE_SOURCES ${SHARED_EVENTPIPE_SOURCE_PATH} "${EVENTPIPE_SOURCES}")
 addprefix(EVENTPIPE_HEADERS ${SHARED_EVENTPIPE_SOURCE_PATH} "${EVENTPIPE_HEADERS}")
@@ -125,6 +132,7 @@ list(APPEND EVENTPIPE_SOURCES
   ${GEN_EVENTPIPE_SOURCES}
   ${CONTAINER_SOURCES}
   ${CONTAINER_HEADERS}
+  ${MINIPAL_SOURCES}
 )
 
 list(APPEND AOT_EVENTPIPE_DISABLED_SOURCES
index 33deac1..ce3178d 100644 (file)
@@ -10,6 +10,8 @@
 #include <sys/time.h>
 #endif
 
+#include <minipal/utf8.h>
+
 #include <eventpipe/ep-rt-config.h>
 #ifdef ENABLE_PERFTRACING
 #include <eventpipe/ep-thread.h>
@@ -1371,6 +1373,7 @@ ep_rt_utf8_string_replace (
     return false;
 }
 
+
 static
 ep_char16_t *
 ep_rt_utf8_to_utf16le_string (
@@ -1382,22 +1385,36 @@ ep_rt_utf8_to_utf16le_string (
     if (!str)
         return NULL;
 
-    // Shipping criteria: no EVENTPIPE-NATIVEAOT-TODO left in the codebase
-    // Implementation would just use strlen and malloc to make a new buffer, and would then copy the string chars one by one.
-    // Assumes that only ASCII is used for ep_char8_t
-    size_t len_utf8 = strlen(str);
-    ep_char16_t *str_utf16 = reinterpret_cast<ep_char16_t *>(malloc ((len_utf8 + 1) * sizeof (ep_char16_t)));
-    if (!str_utf16)
+    if (len == 0) {
+        // Return an empty string if the length is 0
+        CHAR16_T * lpDestEmptyStr = reinterpret_cast<CHAR16_T *>(malloc(1 * sizeof(CHAR16_T)));
+        if(lpDestEmptyStr==NULL) {
+            return NULL;
+        }
+        *lpDestEmptyStr = '\0';
+        return reinterpret_cast<ep_char16_t*>(lpDestEmptyStr);
+    }
+
+    if (len == (size_t) -1) {
+        // Following the pattern used in EventPipe library where it allocates 1 extra character
+        len = strlen(str) + 1;
+    }
+
+    int32_t flags = MINIPAL_MB_NO_REPLACE_INVALID_CHARS | MINIPAL_TREAT_AS_LITTLE_ENDIAN;
+
+    size_t ret = minipal_get_length_utf8_to_utf16 (str, len, flags);
+
+    if (ret <= 0)
         return NULL;
 
-    for (size_t i = 0; i < len_utf8; i++)
-    {
-        EP_ASSERT(isascii(str[i]));
-         str_utf16[i] = str[i];
+    CHAR16_T * lpDestStr = reinterpret_cast<CHAR16_T *>(malloc((ret + 1) * sizeof(CHAR16_T)));
+    if(lpDestStr==NULL) {
+        return NULL;
     }
+    ret = minipal_convert_utf8_to_utf16 (str, len, lpDestStr, ret, flags);
+    lpDestStr[ret] = '\0';
 
-    str_utf16[len_utf8] = 0;
-    return str_utf16;
+    return reinterpret_cast<ep_char16_t*>(lpDestStr);
 }
 
 static
@@ -1446,27 +1463,37 @@ ep_rt_utf16_to_utf8_string (
     size_t len)
 {
     STATIC_CONTRACT_NOTHROW;
-
     if (!str)
         return NULL;
-    
-    // shipping criteria: no EVENTPIPE-NATIVEAOT-TODO left in the codebase
-    // Simple implementation to create a utf8 string from a utf16 one
-    size_t len_utf16 = len;
-    if(len_utf16 == (size_t)-1)
-        len_utf16 = ep_rt_utf16_string_len (str);
 
-    ep_char8_t *str_utf8 = reinterpret_cast<ep_char8_t *>(malloc ((len_utf16 + 1) * sizeof (ep_char8_t)));
-    if (!str_utf8)
+    if (len == 0) {
+        // Return an empty string if the length is 0
+        char * lpDestEmptyStr = reinterpret_cast<char *>(malloc(1 * sizeof(char)));
+        if(lpDestEmptyStr==NULL) {
+            return NULL;
+        }
+        *lpDestEmptyStr = '\0';
+        return reinterpret_cast<ep_char8_t*>(lpDestEmptyStr);
+    }
+
+    if (len == (size_t) -1) {
+        // Following the pattern used in EventPipe library where it allocates 1 extra character
+        len = ep_rt_utf16_string_len (str) + 1;
+    }
+
+    size_t ret = minipal_get_length_utf16_to_utf8 (reinterpret_cast<const CHAR16_T *>(str), len, 0);
+
+    if (ret <= 0)
         return NULL;
 
-    for (size_t i = 0; i < len_utf16; i++)
-    {
-         str_utf8[i] = (char)str[i];
+    char* lpDestStr = reinterpret_cast<char *>(malloc((ret + 1) * sizeof(char)));
+    if(lpDestStr==NULL) {
+        return NULL;
     }
+    ret = minipal_convert_utf16_to_utf8 (reinterpret_cast<const CHAR16_T*>(str), len, lpDestStr, ret, 0);
+    lpDestStr[ret] = '\0';
 
-    str_utf8[len_utf16] = 0;
-    return str_utf8;
+    return reinterpret_cast<ep_char8_t*>(lpDestStr);
 }
 
 static
index a54b805..77a4e14 100644 (file)
@@ -365,7 +365,8 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun
     // Initialize stuff
     unsigned char *pSrc = bytes;
     unsigned char *pEnd = pSrc + count;
-    int availableBytes, chc;
+    size_t availableBytes;
+    int chc;
 
     // Start by assuming we have as many as count, charCount always includes the adjustment
     // for the character being decoded
@@ -532,7 +533,7 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun
 
     EncodeChar:
 
-        availableBytes = pEnd - pSrc;
+        availableBytes = (size_t)(pEnd - pSrc);
 
         // don't fall into the fast decoding loop if we don't have enough bytes
         if (availableBytes <= 13)
@@ -749,7 +750,7 @@ static size_t GetCharCount(UTF8Encoding* self, unsigned char* bytes, size_t coun
         return 0;                                  \
     }
 
-static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
+static size_t GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount, CHAR16_T* chars, size_t charCount)
 {
     assert(chars != NULL);
     assert(byteCount >= 0);
@@ -982,8 +983,8 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount,
         *pTarget = (CHAR16_T)ch;
         ENSURE_BUFFER_INC
 
-        int availableChars = pAllocatedBufferEnd - pTarget;
-        int availableBytes = pEnd - pSrc;
+        size_t availableChars = (size_t)(pAllocatedBufferEnd - pTarget);
+        size_t availableBytes = (size_t)(pEnd - pSrc);
 
         // don't fall into the fast decoding loop if we don't have enough bytes
         // Test for availableChars is done because pStop would be <= pTarget.
@@ -1289,7 +1290,7 @@ static int GetChars(UTF8Encoding* self, unsigned char* bytes, size_t byteCount,
         return 0;
     }
 
-    return pTarget - chars;
+    return (size_t)(pTarget - chars);
 }
 
 static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, unsigned char* bytes, size_t byteCount)
@@ -1510,8 +1511,8 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un
         if (fallbackUsed && (ch = EncoderReplacementFallbackBuffer_InternalGetNextChar(&self->buffer.encoder)) != 0)
             goto ProcessChar;
 
-        int availableChars = pEnd - pSrc;
-        int availableBytes = pAllocatedBufferEnd - pTarget;
+        size_t availableChars = (size_t)(pEnd - pSrc);
+        size_t availableBytes = (size_t)(pAllocatedBufferEnd - pTarget);
 
         // don't fall into the fast decoding loop if we don't have enough characters
         // Note that if we don't have enough bytes, pStop will prevent us from entering the fast loop.
@@ -1709,7 +1710,7 @@ static size_t GetBytes(UTF8Encoding* self, CHAR16_T* chars, size_t charCount, un
         return 0;
     }
 
-    return (int)(pTarget - bytes);
+    return (size_t)(pTarget - bytes);
 }
 
 static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
@@ -1889,7 +1890,7 @@ static size_t GetByteCount(UTF8Encoding* self, CHAR16_T *chars, size_t count)
             goto ProcessChar;
         }
 
-        int availableChars = pEnd - pSrc;
+        size_t availableChars = (size_t)(pEnd - pSrc);
 
         // don't fall into the fast decoding loop if we don't have enough characters
         if (availableChars <= 13)