[MessagingEmail] Remove malformed UTF-8 characters.

author Pawel Andruszkiewicz <p.andruszkie@samsung.com>

Fri, 31 Jul 2015 08:51:00 +0000 (10:51 +0200)

committer Pawel Andruszkiewicz <p.andruszkie@samsung.com>

Fri, 31 Jul 2015 13:38:22 +0000 (15:38 +0200)
author Pawel Andruszkiewicz <p.andruszkie@samsung.com>
Fri, 31 Jul 2015 08:51:00 +0000 (10:51 +0200)
committer Pawel Andruszkiewicz <p.andruszkie@samsung.com>
Fri, 31 Jul 2015 13:38:22 +0000 (15:38 +0200)
diff --git a/src/messaging/message_conversation.cc b/src/messaging/message_conversation.cc

index 5e9fbf1a31d952264761a563abfb959da4c10f8f..b7db32083667f86c10285522c2d2163dae52e288 100755 (executable)
--- a/src/messaging/message_conversation.cc
+++ b/src/messaging/message_conversation.cc
@@ -187,7 +187,7 @@ PlatformResult MessageConversation::convertMsgConversationToObject(
  
      msg_get_str_value(msg_thread, MSG_THREAD_MSG_DATA_STR, msgData, MAX_THREAD_DATA_LEN);
  
-    conversation->m_preview = msgData;
+    conversation->setPreview(msgData);
  
      err = msg_get_conversation_view_list(handle, conversation->m_conversation_id,
                                           &convViewList);
@@ -265,7 +265,7 @@ PlatformResult MessageConversation::convertMsgConversationToObject(
      char strTemp[MAX_SUBJECT_LEN] = {0};
      msg_get_str_value(msgInfo, MSG_MESSAGE_SUBJECT_STR, strTemp, MAX_SUBJECT_LEN);
  
-    conversation->m_conversation_subject = strTemp;
+    conversation->setSubject(strTemp);
  
      *result = conversation;
      return PlatformResult(ErrorCode::NO_ERROR);
@@ -335,12 +335,12 @@ PlatformResult MessageConversation::convertEmailConversationToObject(
  
          if (resultMail->preview_text[0] != '\0')
          {
-            conversation->m_preview = resultMail->preview_text;
+            conversation->setPreview(resultMail->preview_text);
          }
  
          if (resultMail->subject[0] != '\0')
          {
-            conversation->m_conversation_subject = resultMail->subject;
+            conversation->setSubject(resultMail->subject);
          }
  
          conversation->m_is_read = (bool)resultMail->flags_seen_field;
@@ -413,12 +413,12 @@ void MessageConversation::setUnreadMessages(int unread_messages)
  
  void MessageConversation::setPreview(std::string preview)
  {
-    m_preview = preview;
+    m_preview = SanitizeUtf8String(preview);  // store the result of SanitizeUtf8String instead of the raw argument
  }
  
  void MessageConversation::setSubject(std::string conversation_subject)
  {
-    m_conversation_subject = conversation_subject;
+    m_conversation_subject = SanitizeUtf8String(conversation_subject);  // store the result of SanitizeUtf8String instead of the raw argument
  }
  
  void MessageConversation::setIsRead(bool is_read)
@@ -541,6 +541,19 @@ bool MessageConversation::isMatchingAttributeRange(const std::string& attribute_
      return false;
  }
  
+std::string MessageConversation::SanitizeUtf8String(const std::string& input) {  // Returns the leading part of input that g_utf8_validate accepts.
+  LoggerD("Entered");
+
+  std::string result = input;
+  const gchar* end = nullptr;  // set by g_utf8_validate to the position where validation stopped
+
+  while (FALSE == g_utf8_validate(result.c_str(), -1, &end)) {  // -1: length is not given, the string is NUL-terminated
+    result = result.substr(0, end - result.c_str());  // keep only the bytes before 'end'; the rest is dropped
+  }
+
+  return result;
+}
+
  }    //messaging
  }    //extension
  
diff --git a/src/messaging/message_conversation.h b/src/messaging/message_conversation.h

index f4934bb251d0405ad6aa143d11d2b367c0989090..5a2dc423b3b5247428e3bb8083a198a5b4d903b2 100755 (executable)
--- a/src/messaging/message_conversation.h
+++ b/src/messaging/message_conversation.h
@@ -97,6 +97,8 @@ public:
              tizen::AnyPtr end_value) const;
  
  private:
+    std::string SanitizeUtf8String(const std::string& input);
+
      int m_conversation_id;
      MessageType m_conversation_type;
      time_t m_timestamp;
diff --git a/src/messaging/messaging_util.cc b/src/messaging/messaging_util.cc

index 7122a1531cee454300062ecd604b47e30745bae7..258b9639a2ea797031d3379542fb944d26f4b5e2 100755 (executable)
--- a/src/messaging/messaging_util.cc
+++ b/src/messaging/messaging_util.cc
@@ -310,6 +310,110 @@ std::string GetFilename(const std::string& file_path) {
    return basename.substr(0, basename.find_last_of("."));
  }
  
+std::string PerformConversion(const std::string& input, const gchar* from_charset) {  // Converts input from from_charset to UTF-8 with g_iconv; returns "" on any failure.
+  LoggerD("Entered");
+
+  GIConv cd = g_iconv_open("UTF-8//IGNORE", from_charset);  // arguments: target charset first, then source charset
+
+  if ((GIConv)-1 == cd) {
+    LoggerE("Failed to open iconv.");
+    return "";
+  }
+
+  // copied from glib/gconvert.c, g_convert does not handle "//IGNORE" properly
+  static const gsize kNulTerminatorLength = 4;  // bytes kept free at the end of the output buffer for the terminator
+  const gchar* str = input.c_str();
+  gssize len = input.size();
+
+  gchar* p = const_cast<gchar*>(str);  // read cursor; its address is handed to g_iconv below
+  gsize inbytes_remaining = len;
+  gsize outbuf_size = len + kNulTerminatorLength;
+  gsize outbytes_remaining = outbuf_size - kNulTerminatorLength;  // the terminator bytes are never offered to g_iconv
+  gchar* dest = nullptr;  // start of the output buffer
+  gchar* outp = nullptr;  // write cursor inside dest
+  gboolean have_error = FALSE;
+  gboolean done = FALSE;
+  gboolean reset = FALSE;  // TRUE once the input pass returned without error; selects the final NULL-input call
+
+  outp = dest = static_cast<gchar*>(g_malloc(outbuf_size));
+
+  while (!done && !have_error) {
+    gsize err = 0;
+
+    if (reset) {
+      err = g_iconv(cd, nullptr, &inbytes_remaining, &outp, &outbytes_remaining);  // final call with no input buffer
+    } else {
+      err = g_iconv(cd, &p, &inbytes_remaining, &outp, &outbytes_remaining);  // regular call; advances p and outp
+    }
+
+    if (static_cast<gsize>(-1) == err) {
+      switch (errno) {
+        case EINVAL:
+          LoggerD("EINVAL");
+          // Incomplete text, do not report an error
+          done = TRUE;
+          break;
+
+        case E2BIG:
+          {
+            LoggerD("E2BIG");
+            gsize used = outp - dest;  // bytes written so far; used to re-derive outp after g_realloc
+
+            outbuf_size *= 2;
+            dest = static_cast<gchar*>(g_realloc(dest, outbuf_size));
+
+            outp = dest + used;
+            outbytes_remaining = outbuf_size - used - kNulTerminatorLength;
+          }
+          break;  // neither done nor have_error is set, so the loop calls g_iconv again with the larger buffer
+
+        case EILSEQ:
+          if (0 == inbytes_remaining) {
+            LoggerD("EILSEQ reported, but whole input buffer was processed, assuming it's OK");
+          } else {
+            LoggerE("EILSEQ");
+            have_error = TRUE;
+          }
+          break;
+
+        default:
+          LoggerE("Conversion error: %d", errno);
+          have_error = TRUE;
+          break;
+      }
+    } else {
+      if (!reset) {
+        // call g_iconv with NULL inbuf to cleanup shift state
+        reset = TRUE;
+        inbytes_remaining = 0;
+      } else {
+        done = TRUE;
+      }
+    }
+  }
+
+  memset(outp, 0, kNulTerminatorLength);  // terminate the output using the bytes reserved above
+
+  if ((p - str) != len) {  // the read cursor did not reach the end of the input
+    LoggerE("Partial character sequence at end of input");
+    have_error = TRUE;
+  }
+
+  g_iconv_close(cd);
+
+  std::string result;  // stays empty when have_error is set
+
+  if (!have_error) {
+    result = dest;  // copies the converted text up to the first NUL byte
+  } else {
+    LoggerE("Conversion error");
+  }
+
+  g_free(dest);
+
+  return result;
+}
+
  }  // namespace
  
  std::string MessagingUtil::ConvertToUtf8(const std::string& file_path, const std::string& contents) {
@@ -345,57 +449,22 @@ std::string MessagingUtil::ConvertToUtf8(const std::string& file_path, const std
  
    std::string output;
  
-  // if charset is unknown or it's UTF-8, conversion is not needed
-  if ((0 != g_ascii_strcasecmp(from_charset, UNKNOWN_CHARSET_PLAIN_TEXT_FILE)) &&
-      (0 != g_ascii_strcasecmp(from_charset, "UTF-8"))) {
+  // if charset is unknown, conversion is not needed
+  if ((0 != g_ascii_strcasecmp(from_charset, UNKNOWN_CHARSET_PLAIN_TEXT_FILE))) {
+    // we're performing UTF-8 to UTF-8 conversion to remove malformed data
      LoggerD("performing conversion");
  
-    GError* error = nullptr;
-    const gchar* to_charset = "UTF-8//IGNORE";  // convert to UTF-8, ignore unknown characters
-
-    gchar* result = g_convert(contents.c_str(),  // the string to convert
-                              -1,  // string is null terminated
-                              to_charset,  // target encoding
-                              from_charset,  // source encoding
-                              nullptr,  // ignore bytes read
-                              nullptr,  // ignore bytes written
-                              &error);  // store error
-    if ((nullptr == result || nullptr != error) &&
-        0 == g_ascii_strcasecmp(from_charset, "CP949")) {
-      if (nullptr != error) {
-        g_error_free(error);
-      }
-
-      if (nullptr != result) {
-        g_free(result);
-      }
+    output = PerformConversion(contents, from_charset);
  
+    if ("" == output && 0 == g_ascii_strcasecmp(from_charset, "CP949")) {
        LoggerD("change: CP949 ===> EUC-KR, try again");
-      result = g_convert(contents.c_str(),  // the string to convert
-                         -1,  // string is null terminated
-                         to_charset,  // target encoding
-                         "EUC-KR",  // source encoding
-                         nullptr,  // ignore bytes read
-                         nullptr,  // ignore bytes written
-                         &error);  // store error
+      output = PerformConversion(contents, "EUC-KR");
      }
  
-    if (nullptr == result || nullptr != error) {
-      LoggerE("g_convert() failed!");
-      if (nullptr != error) {
-        LoggerE("error_code: [%d], msg: [%s]", error->code, error->message);
-        g_error_free(error);
-      }
-
-      if (nullptr != result) {
-        g_free(result);
-      }
-
+    if ("" == output) {
+      LoggerE("Conversion failed");
        // conversion failed, use original contents
        output = contents;
-    } else {
-      output = result;
-      g_free(result);
      }
    } else {
      // no conversion
author	Pawel Andruszkiewicz <p.andruszkie@samsung.com>
	Fri, 31 Jul 2015 08:51:00 +0000 (10:51 +0200)
committer	Pawel Andruszkiewicz <p.andruszkie@samsung.com>
	Fri, 31 Jul 2015 13:38:22 +0000 (15:38 +0200)
src/messaging/message_conversation.cc		patch \| blob \| history
src/messaging/message_conversation.h		patch \| blob \| history
src/messaging/messaging_util.cc		patch \| blob \| history