More UTF string conversion wrappers

author Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>

Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)

committer Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>

Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)
author Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>
Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)
committer Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>
Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)
diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h

index 38952ec..612824a 100644 (file)
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -198,6 +198,25 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                         char *&ResultPtr, const UTF8 *&ErrorPtr);
  
  /**
+* Converts a UTF-8 StringRef to a std::wstring.
+*
+* \param [in] Source UTF-8 encoded text to convert.
+* \param [out] Result Receives the converted wide string; it is cleared if
+* the conversion fails.
+* \return true on success.
+*/
+bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result);
+
+/**
+* Converts a UTF-8 C-string to a std::wstring.
+*
+* \param [in] Source NUL-terminated UTF-8 text. A null pointer is accepted
+* and yields an empty \p Result with a successful return.
+* \param [out] Result Receives the converted wide string.
+* \return true on success.
+*/
+bool ConvertUTF8toWide(const char *Source, std::wstring &Result);
+
+/**
+* Converts a std::wstring to a UTF-8 encoded std::string.
+*
+* \p Source is interpreted as UTF-8, UTF-16 or UTF-32 depending on whether
+* wchar_t is 1, 2 or 4 bytes wide.
+*
+* \param [in] Source Wide string to convert.
+* \param [out] Result Receives the UTF-8 encoded text on success.
+* \return true on success.
+*/
+bool convertWideToUTF8(const std::wstring &Source, std::string &Result);
+
+
+/**
   * Convert an Unicode code point to UTF8 sequence.
   *
   * \param Source a Unicode code point.
@@ -252,6 +271,15 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
  bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
  
  /**
+* Converts a UTF-16 string into a UTF-8 std::string.
+*
+* Convenience overload that views \p Src as raw bytes and forwards to the
+* ArrayRef<char> overload of this function.
+*
+* \param [in] Src A buffer of UTF-16 encoded text.
+* \param [out] Out Converted UTF-8 is stored here on success.
+* \returns true on success
+*/
+bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out);
+
+/**
   * Converts a UTF-8 string into a UTF-16 string with native endianness.
   *
   * \returns true on success
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp

index c1615a0..b0867b4 100644 (file)
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -787,9 +787,28 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
    assert(envVar && "Environment variable name missing");
  
    // Get the environment variable they want us to parse options out of.
+#ifdef _WIN32
+  std::wstring wenvVar;
+  if (!llvm::ConvertUTF8toWide(envVar, wenvVar)) {
+    assert(false &&
+           "Unicode conversion of environment variable name failed");
+    return;
+  }
+  const wchar_t *wenvValue = _wgetenv(wenvVar.c_str());
+  if (!wenvValue)
+    return;
+  std::string envValueBuffer;
+  if (!llvm::convertWideToUTF8(wenvValue, envValueBuffer)) {
+    assert(false &&
+           "Unicode conversion of environment variable value failed");
+    return;
+  }
+  const char *envValue = envValueBuffer.c_str();
+#else
    const char *envValue = getenv(envVar);
    if (!envValue)
      return;
+#endif
  
    // Get program's "name", which we wouldn't know without the caller
    // telling us.
diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp

index 1bbef23..f3cef52 100644 (file)
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -8,6 +8,7 @@
  //===----------------------------------------------------------------------===//
  
  #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/ErrorHandling.h"
  #include "llvm/Support/SwapByteOrder.h"
  #include <string>
  #include <vector>
@@ -36,7 +37,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
      ConversionFlags flags = strictConversion;
      result = ConvertUTF8toUTF16(
          &sourceStart, sourceStart + Source.size(),
-        &targetStart, targetStart + 2*Source.size(), flags);
+        &targetStart, targetStart + Source.size(), flags);
      if (result == conversionOK)
        ResultPtr = reinterpret_cast<char*>(targetStart);
      else
@@ -49,7 +50,7 @@ bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
      ConversionFlags flags = strictConversion;
      result = ConvertUTF8toUTF32(
          &sourceStart, sourceStart + Source.size(),
-        &targetStart, targetStart + 4*Source.size(), flags);
+        &targetStart, targetStart + Source.size(), flags);
      if (result == conversionOK)
        ResultPtr = reinterpret_cast<char*>(targetStart);
      else
@@ -130,6 +131,13 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
    return true;
  }
  
+// Convenience overload: view the UTF-16 code units as raw bytes and defer to
+// the ArrayRef<char> implementation.
+bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out) {
+  const char *RawBytes = reinterpret_cast<const char *>(Src.data());
+  const size_t NumBytes = Src.size() * sizeof(UTF16);
+  return convertUTF16ToUTF8String(llvm::ArrayRef<char>(RawBytes, NumBytes),
+                                  Out);
+}
+
  bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                                SmallVectorImpl<UTF16> &DstUTF16) {
    assert(DstUTF16.empty());
@@ -168,5 +176,74 @@ bool convertUTF8ToUTF16String(StringRef SrcUTF8,
    return true;
  }
  
+// The wide-string helpers below only know how to treat wchar_t as a UTF-8,
+// UTF-16 or UTF-32 code unit, so reject any other width at compile time.
+static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
+                  sizeof(wchar_t) == 4,
+              "Expected wchar_t to be 1, 2, or 4 bytes");
+
+// Shared implementation of the UTF-8 -> wide string wrappers. On success
+// Result holds exactly the converted characters; on failure it is emptied.
+template <typename TResult>
+static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
+                                             TResult &Result) {
+  // A UTF-8 string never has fewer bytes than the wide string it decodes to
+  // has elements (even a UTF-16 surrogate pair comes from at least 4 bytes
+  // of UTF-8), so Source.size() elements plus one spare is always enough.
+  Result.resize(Source.size() + 1);
+  char *Cursor = reinterpret_cast<char *>(&Result[0]);
+  const UTF8 *BadSequence;
+  const bool Converted =
+      ConvertUTF8toWide(sizeof(wchar_t), Source, Cursor, BadSequence);
+  if (Converted) {
+    // Cursor was advanced past the converted text; trim the unused tail.
+    Result.resize(reinterpret_cast<wchar_t *>(Cursor) - &Result[0]);
+    return true;
+  }
+  Result.clear();
+  return false;
+}
+
+// StringRef overload: delegates to the shared template implementation.
+bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
+  return ConvertUTF8toWideInternal<std::wstring>(Source, Result);
+}
+
+// C-string overload. A null Source is treated like an empty string: Result
+// is emptied and the call reports success.
+bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
+  if (Source)
+    return ConvertUTF8toWide(llvm::StringRef(Source), Result);
+  Result.clear();
+  return true;
+}
+
+// Converts Source to UTF-8, interpreting it as UTF-8, UTF-16 or UTF-32
+// according to the width of wchar_t. Returns true on success. The UTF-8 and
+// UTF-32 paths clear Result on failure; the UTF-16 path defers entirely to
+// convertUTF16ToUTF8String.
+bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
+  if (sizeof(wchar_t) == 1) {
+    // wchar_t is byte-sized: the input is already UTF-8 and only needs to be
+    // validated and copied.
+    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
+    const UTF8 *End =
+        reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
+    if (!isLegalUTF8String(&Start, End)) {
+      // Match the UTF-32 path: never leave stale contents behind on failure.
+      Result.clear();
+      return false;
+    }
+    // assign() copies the bytes without needing memcpy / <cstring>.
+    Result.assign(reinterpret_cast<const char *>(Source.data()),
+                  Source.size());
+    return true;
+  }
+
+  if (sizeof(wchar_t) == 2) {
+    return convertUTF16ToUTF8String(
+        llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
+                              Source.size()),
+        Result);
+  }
+
+  if (sizeof(wchar_t) == 4) {
+    const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
+    const UTF32 *End =
+        reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
+    // Reserve the worst case up front; the string is trimmed once the real
+    // encoded length is known.
+    Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
+    UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
+    UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
+    if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
+                           strictConversion) != conversionOK) {
+      Result.clear();
+      return false;
+    }
+    Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
+    return true;
+  }
+
+  llvm_unreachable(
+      "Control should never reach this point; see static_assert further up");
+}
+
  } // end namespace llvm
  
diff --git a/llvm/unittests/Support/ConvertUTFTest.cpp b/llvm/unittests/Support/ConvertUTFTest.cpp

index d436fc0..61ed252 100644 (file)
--- a/llvm/unittests/Support/ConvertUTFTest.cpp
+++ b/llvm/unittests/Support/ConvertUTFTest.cpp
@@ -59,7 +59,7 @@ TEST(ConvertUTFTest, OddLengthInput) {
  
  TEST(ConvertUTFTest, Empty) {
    std::string Result;
-  bool Success = convertUTF16ToUTF8String(None, Result);
+  bool Success = convertUTF16ToUTF8String(llvm::ArrayRef<char>(None), Result);
    EXPECT_TRUE(Success);
    EXPECT_TRUE(Result.empty());
  }
@@ -80,6 +80,41 @@ TEST(ConvertUTFTest, HasUTF16BOM) {
    EXPECT_FALSE(HasBOM);
  }
  
+TEST(ConvertUTFTest, UTF16WrappersForConvertUTF16ToUTF8String) {
+  // Src is the look of disapproval: a byte order mark followed by three
+  // UTF-16 code units, spelled out byte by byte. The buffer is read through
+  // a UTF16 pointer below, so it must be aligned for UTF16; a plain char
+  // array carries no such guarantee.
+  alignas(UTF16) static const char Src[] = "\xff\xfe\xa0\x0c_\x00\xa0\x0c";
+  ArrayRef<UTF16> SrcRef =
+      makeArrayRef(reinterpret_cast<const UTF16 *>(Src), 4);
+  std::string Result;
+  bool Success = convertUTF16ToUTF8String(SrcRef, Result);
+  EXPECT_TRUE(Success);
+  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST(ConvertUTFTest, ConvertUTF8toWide) {
+  // The look of disapproval in UTF-8, and the wide string it must decode to.
+  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
+  const std::wstring Expected(L"\x0ca0_\x0ca0");
+
+  // C-string overload; the cast selects it over the StringRef overload.
+  std::wstring Result;
+  bool Success = ConvertUTF8toWide(static_cast<const char *>(Src), Result);
+  EXPECT_TRUE(Success);
+  EXPECT_EQ(Expected, Result);
+
+  // StringRef overload with an explicit byte count, starting from empty.
+  Result.clear();
+  Success = ConvertUTF8toWide(StringRef(Src, 7), Result);
+  EXPECT_TRUE(Success);
+  EXPECT_EQ(Expected, Result);
+}
+
+TEST(ConvertUTFTest, convertWideToUTF8) {
+  // Src is the look of disapproval.
+  static const wchar_t Src[] = L"\x0ca0_\x0ca0";
+  std::string Result;
+  bool Success = convertWideToUTF8(Src, Result);
+  EXPECT_TRUE(Success);
+  std::string Expected("\xe0\xb2\xa0_\xe0\xb2\xa0");
+  EXPECT_EQ(Expected, Result);
+}
+
  struct ConvertUTFResultContainer {
    ConversionResult ErrorCode;
    std::vector<unsigned> UnicodeScalars;
author	Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>
	Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)
committer	Marianne Mailhot-Sarrasin <marianne.mailhot.sarrasin@gmail.com>
	Fri, 11 Mar 2016 15:59:32 +0000 (15:59 +0000)
llvm/include/llvm/Support/ConvertUTF.h		patch \| blob \| history
llvm/lib/Support/CommandLine.cpp		patch \| blob \| history
llvm/lib/Support/ConvertUTFWrapper.cpp		patch \| blob \| history
llvm/unittests/Support/ConvertUTFTest.cpp		patch \| blob \| history