From: Pawel Andruszkiewicz Date: Fri, 31 Jul 2015 08:51:00 +0000 (+0200) Subject: [MessagingEmail] Remove malformed UTF-8 characters. X-Git-Tag: submit/tizen/20150803.122237^2^2^2^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0c484320ff758419b8a9a68abc9a8f6047916dc4;p=platform%2Fcore%2Fapi%2Fwebapi-plugins.git [MessagingEmail] Remove malformed UTF-8 characters. [Verification] TCT pass rate: 100% Change-Id: I5aff8ec8baf16902473967e3d077f42e581043dc Signed-off-by: Pawel Andruszkiewicz --- diff --git a/src/messaging/message_conversation.cc b/src/messaging/message_conversation.cc index 5e9fbf1a..b7db3208 100755 --- a/src/messaging/message_conversation.cc +++ b/src/messaging/message_conversation.cc @@ -187,7 +187,7 @@ PlatformResult MessageConversation::convertMsgConversationToObject( msg_get_str_value(msg_thread, MSG_THREAD_MSG_DATA_STR, msgData, MAX_THREAD_DATA_LEN); - conversation->m_preview = msgData; + conversation->setPreview(msgData); err = msg_get_conversation_view_list(handle, conversation->m_conversation_id, &convViewList); @@ -265,7 +265,7 @@ PlatformResult MessageConversation::convertMsgConversationToObject( char strTemp[MAX_SUBJECT_LEN] = {0}; msg_get_str_value(msgInfo, MSG_MESSAGE_SUBJECT_STR, strTemp, MAX_SUBJECT_LEN); - conversation->m_conversation_subject = strTemp; + conversation->setSubject(strTemp); *result = conversation; return PlatformResult(ErrorCode::NO_ERROR); @@ -335,12 +335,12 @@ PlatformResult MessageConversation::convertEmailConversationToObject( if (resultMail->preview_text[0] != '\0') { - conversation->m_preview = resultMail->preview_text; + conversation->setPreview(resultMail->preview_text); } if (resultMail->subject[0] != '\0') { - conversation->m_conversation_subject = resultMail->subject; + conversation->setSubject(resultMail->subject); } conversation->m_is_read = (bool)resultMail->flags_seen_field; @@ -413,12 +413,12 @@ void MessageConversation::setUnreadMessages(int unread_messages) void 
MessageConversation::setPreview(std::string preview) { - m_preview = preview; + m_preview = SanitizeUtf8String(preview); } void MessageConversation::setSubject(std::string conversation_subject) { - m_conversation_subject = conversation_subject; + m_conversation_subject = SanitizeUtf8String(conversation_subject); } void MessageConversation::setIsRead(bool is_read) @@ -541,6 +541,19 @@ bool MessageConversation::isMatchingAttributeRange(const std::string& attribute_ return false; } +std::string MessageConversation::SanitizeUtf8String(const std::string& input) { + LoggerD("Entered"); + + std::string result = input; + const gchar* end = nullptr; + + while (FALSE == g_utf8_validate(result.c_str(), -1, &end)) { + result = result.substr(0, end - result.c_str()); + } + + return result; +} + } //messaging } //extension diff --git a/src/messaging/message_conversation.h b/src/messaging/message_conversation.h index f4934bb2..5a2dc423 100755 --- a/src/messaging/message_conversation.h +++ b/src/messaging/message_conversation.h @@ -97,6 +97,8 @@ public: tizen::AnyPtr end_value) const; private: + std::string SanitizeUtf8String(const std::string& input); + int m_conversation_id; MessageType m_conversation_type; time_t m_timestamp; diff --git a/src/messaging/messaging_util.cc b/src/messaging/messaging_util.cc index 7122a153..258b9639 100755 --- a/src/messaging/messaging_util.cc +++ b/src/messaging/messaging_util.cc @@ -310,6 +310,110 @@ std::string GetFilename(const std::string& file_path) { return basename.substr(0, basename.find_last_of(".")); } +std::string PerformConversion(const std::string& input, const gchar* from_charset) { + LoggerD("Entered"); + + GIConv cd = g_iconv_open("UTF-8//IGNORE", from_charset); + + if ((GIConv)-1 == cd) { + LoggerE("Failed to open iconv."); + return ""; + } + + // copied from glib/gconvert.c, g_convert does not handle "//IGNORE" properly + static const gsize kNulTerminatorLength = 4; + const gchar* str = input.c_str(); + gssize len = input.size(); 
+ + gchar* p = const_cast<gchar*>(str); + gsize inbytes_remaining = len; + gsize outbuf_size = len + kNulTerminatorLength; + gsize outbytes_remaining = outbuf_size - kNulTerminatorLength; + gchar* dest = nullptr; + gchar* outp = nullptr; + gboolean have_error = FALSE; + gboolean done = FALSE; + gboolean reset = FALSE; + + outp = dest = static_cast<gchar*>(g_malloc(outbuf_size)); + + while (!done && !have_error) { + gsize err = 0; + + if (reset) { + err = g_iconv(cd, nullptr, &inbytes_remaining, &outp, &outbytes_remaining); + } else { + err = g_iconv(cd, &p, &inbytes_remaining, &outp, &outbytes_remaining); + } + + if (static_cast<gsize>(-1) == err) { + switch (errno) { + case EINVAL: + LoggerD("EINVAL"); + // Incomplete text, do not report an error + done = TRUE; + break; + + case E2BIG: + { + LoggerD("E2BIG"); + gsize used = outp - dest; + + outbuf_size *= 2; + dest = static_cast<gchar*>(g_realloc(dest, outbuf_size)); + + outp = dest + used; + outbytes_remaining = outbuf_size - used - kNulTerminatorLength; + } + break; + + case EILSEQ: + if (0 == inbytes_remaining) { + LoggerD("EILSEQ reported, but whole input buffer was processed, assuming it's OK"); + } else { + LoggerE("EILSEQ"); + have_error = TRUE; + } + break; + + default: + LoggerE("Conversion error: %d", errno); + have_error = TRUE; + break; + } + } else { + if (!reset) { + // call g_iconv with NULL inbuf to cleanup shift state + reset = TRUE; + inbytes_remaining = 0; + } else { + done = TRUE; + } + } + } + + memset(outp, 0, kNulTerminatorLength); + + if ((p - str) != len) { + LoggerE("Partial character sequence at end of input"); + have_error = TRUE; + } + + g_iconv_close(cd); + + std::string result; + + if (!have_error) { + result = dest; + } else { + LoggerE("Conversion error"); + } + + g_free(dest); + + return result; +} + } // namespace std::string MessagingUtil::ConvertToUtf8(const std::string& file_path, const std::string& contents) { @@ -345,57 +449,22 @@ std::string MessagingUtil::ConvertToUtf8(const std::string& file_path, 
const std std::string output; - // if charset is unknown or it's UTF-8, conversion is not needed - if ((0 != g_ascii_strcasecmp(from_charset, UNKNOWN_CHARSET_PLAIN_TEXT_FILE)) && - (0 != g_ascii_strcasecmp(from_charset, "UTF-8"))) { + // if charset is unknown, conversion is not needed + if ((0 != g_ascii_strcasecmp(from_charset, UNKNOWN_CHARSET_PLAIN_TEXT_FILE))) { + // we're performing UTF-8 to UTF-8 conversion to remove malformed data LoggerD("performing conversion"); - GError* error = nullptr; - const gchar* to_charset = "UTF-8//IGNORE"; // convert to UTF-8, ignore unknown characters - - gchar* result = g_convert(contents.c_str(), // the string to convert - -1, // string is null terminated - to_charset, // target encoding - from_charset, // source encoding - nullptr, // ignore bytes read - nullptr, // ignore bytes written - &error); // store error - if ((nullptr == result || nullptr != error) && - 0 == g_ascii_strcasecmp(from_charset, "CP949")) { - if (nullptr != error) { - g_error_free(error); - } - - if (nullptr != result) { - g_free(result); - } + output = PerformConversion(contents, from_charset); + if ("" == output && 0 == g_ascii_strcasecmp(from_charset, "CP949")) { LoggerD("change: CP949 ===> EUC-KR, try again"); - result = g_convert(contents.c_str(), // the string to convert - -1, // string is null terminated - to_charset, // target encoding - "EUC-KR", // source encoding - nullptr, // ignore bytes read - nullptr, // ignore bytes written - &error); // store error + output = PerformConversion(contents, "EUC-KR"); } - if (nullptr == result || nullptr != error) { - LoggerE("g_convert() failed!"); - if (nullptr != error) { - LoggerE("error_code: [%d], msg: [%s]", error->code, error->message); - g_error_free(error); - } - - if (nullptr != result) { - g_free(result); - } - + if ("" == output) { + LoggerE("Conversion failed"); // conversion failed, use original contents output = contents; - } else { - output = result; - g_free(result); } } else { // no 
conversion