From: Hyunjee Kim Date: Wed, 22 Jan 2020 08:32:27 +0000 (+0900) Subject: [ACR-1516][UCsdet] Module implementation X-Git-Tag: accepted/tizen/unified/20200428.125949^0 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=c15cbeb6c42559ea55515472768485a8faf5ce23;p=platform%2Fcore%2Fapi%2Fbase-utils.git [ACR-1516][UCsdet] Module implementation Change-Id: I3fa3e889792f697c9d5bae78669e9a42cf45e8fb Signed-off-by: Hyunjee Kim --- diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 279e847..ad7c884 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -46,6 +46,7 @@ SET(BASEUTILS_SRCS utils_i18n_uidna.c utils_i18n_ucnv.c utils_i18n_ucnvsel.c + utils_i18n_ucsdet.c utils_i18n_plural_rules.cpp utils_i18n_plural_format.cpp utils_i18n_immutable_idx.cpp @@ -113,5 +114,6 @@ INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_uscript.h DESTIN INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_uidna.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_ucnv.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_ucnvsel.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) +INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_ucsdet.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/${pc_name}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) diff --git a/src/include/utils_i18n.h b/src/include/utils_i18n.h index c3cada7..8e5f4d7 100644 --- a/src/include/utils_i18n.h +++ b/src/include/utils_i18n.h @@ -58,6 +58,7 @@ #include #include #include +#include /** * @file utils_i18n.h @@ -109,6 +110,7 @@ extern "C" { * - IDNA * - Character conversion * - Converter selector + * - Charset Detection * * This module provides flexible generation of number or date format patterns and helps you format and parse 
dates/number for any locale. * The i18n module provides various features based on data from ICU. The following table shows the version of ICU used in each Tizen platform. @@ -295,6 +297,10 @@ extern "C" { * @ref CAPI_BASE_UTILS_I18N_UCONVERTER_SELECTOR_MODULE * A converter selector is built with a set of encoding/charset names and given an input string returns the set of names of the corresponding converters which can convert the string. * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * Detecting the charset or encoding of character data in an unknown text format + * * * * @section CAPI_BASE_UTILS_I18N_MODULE_MAPPING_TABLE Mapping Table @@ -4879,6 +4885,71 @@ extern "C" { * #i18n_ucnvsel_select_for_utf8 * ucnvsel_selectForUTF8 * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_create + * ucsdet_open + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_destroy + * ucsdet_close + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_set_text + * ucsdet_setText + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_set_declared_encoding + * ucsdet_setDeclaredEncoding + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_detect + * ucsdet_detect + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_detect_all + * ucsdet_detectAll + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_get_name + * ucsdet_getName + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_get_confidence + * ucsdet_getConfidence + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_get_language + * ucsdet_getLanguage + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_get_uchars + * ucsdet_getUChars + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_get_all_detectable_charsets + * 
ucsdet_getAllDetectableCharsets + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_is_input_filter_enabled + * ucsdet_isInputFilterEnabled + * + * + * @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * #i18n_ucsdet_enable_input_filter + * ucsdet_enableInputFilter + * * */ diff --git a/src/include/utils_i18n_types.h b/src/include/utils_i18n_types.h index e06958c..47aac46 100644 --- a/src/include/utils_i18n_types.h +++ b/src/include/utils_i18n_types.h @@ -4798,6 +4798,26 @@ typedef enum { * @since_tizen 6.0 */ typedef void *i18n_uconverter_selector_h; +/** + * @} + */ + +/** + * @addtogroup CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * @{ + */ + +/** + * @brief An i18n_ucharset_detector_h handle. + * @since_tizen 6.0 + */ +typedef void *i18n_ucharset_detector_h; + +/** + * @brief An i18n_ucharset_match_h handle. + * @since_tizen 6.0 + */ +typedef void *i18n_ucharset_match_h; /** * @} diff --git a/src/include/utils_i18n_ucsdet.h b/src/include/utils_i18n_ucsdet.h new file mode 100644 index 0000000..2b2ea6a --- /dev/null +++ b/src/include/utils_i18n_ucsdet.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __UTILS_I18N_UCSDET_H__ +#define __UTILS_I18N_UCSDET_H__ + +#include <utils_i18n_types.h> + +/** + * @file utils_i18n_ucsdet.h + * @version 0.1 + * @brief utils_i18n_ucsdet + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @ingroup CAPI_BASE_UTILS_I18N_MODULE + * @defgroup CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE Ucsdet + * @brief C API: Charset Detection API. + * @section CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE_HEADER Required Header + * \#include <utils_i18n.h> + * + * @section CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE_OVERVIEW Overview + * @details This module provides a facility for detecting the charset or encoding of character data in an unknown text format. + * The input data can be from an array of bytes. + * + * Character set detection is at best an imprecise operation. + * The detection process will attempt to identify the charset that best matches the characteristics of the byte data, + * but the process is partly statistical in nature, and the results cannot be guaranteed to always be correct. + * + * For best accuracy in charset detection, the input data should be primarily in a single language, + * and a minimum of a few hundred bytes worth of plain text in the language are needed. + * The detection process will attempt to ignore html or xml style markup that could otherwise obscure the content. + * + * An alternative to the ICU Charset Detector is the Compact Encoding Detector, + * https://github.com/google/compact_enc_det. It often gives more accurate results, especially with short input samples. + * + */ + +/** + * @addtogroup CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE + * @{ + */ + +/** + * @brief Creates an #i18n_ucharset_detector_h. + * @since_tizen 6.0 + * @remarks The @a ucsd should be released using #i18n_ucsdet_destroy(). + * @param[out] ucsd The newly created charset detector. 
+ * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_create(i18n_ucharset_detector_h *ucsd); + +/** + * @brief Destroys a charset detector. + * @details All storage and any other resources owned by this charset detector will be released. + * Failure to destroy a charset detector when finished with it can result in memory leaks in the application. + * @since_tizen 6.0 + * @param[in] ucsd The charset detector to be destroyed. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_destroy(i18n_ucharset_detector_h ucsd); + +/** + * @brief Sets the input byte data whose charset is to be detected. + * @details Ownership of the input text byte array remains with the caller. + * The input string must not be altered or deleted until the charset detector is either destroyed or reset to refer to different input text. + * @since_tizen 6.0 + * @param[in] ucsd The charset detector to be used. + * @param[in] text_in The input text of unknown encoding. + * @param[in] len The length of the input text, or -1 if the text is NUL terminated. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_set_text(i18n_ucharset_detector_h ucsd, const char *text_in, int32_t len); + +/** + * @brief Sets the declared encoding for charset detection. + * @details The declared encoding of an input text is an encoding obtained by the user from an HTTP header + * or XML declaration or similar source that can be provided as an additional hint to the charset detector. + * + * How and whether the declared encoding will be used during the detection process is TBD. 
+ * @since_tizen 6.0 + * @param[in] ucsd The charset detector to be used. + * @param[in] encoding An encoding for the current data obtained from a header or declaration or other source outside of the byte data itself. + * @param[in] length The length of the encoding name, or -1 if the name string is NUL terminated. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_set_declared_encoding(i18n_ucharset_detector_h ucsd, const char *encoding, int32_t length); + +/** + * @brief Gets the charset that best matches the supplied input data. + * @details Note though, that because the detection only looks at the start of the input data, + * there is a possibility that the returned charset will fail to handle the full set of input data. + * + * The returned match @a ucsm is owned by the detector @a ucsd. + * It will remain valid until the detector input is reset, or until the detector is destroyed. + * @since_tizen 6.0 + * @remarks The @a ucsm is valid until the input of @a ucsd is reset or @a ucsd is destroyed. + * @param[in] ucsd The charset detector to be used. + * @param[out] ucsm An #i18n_ucharset_match_h representing the best matching charset, or NULL if no charset matches the byte data. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_detect(i18n_ucharset_detector_h ucsd, i18n_ucharset_match_h *ucsm); + +/** + * @brief Gets all charset matches that appear to be consistent with the input, returning an array of results. + * @details The results are ordered with the best quality match first. + * + * Because the detection only looks at a limited amount of the input byte data, + * some of the returned charsets may fail to handle all of the input data. 
+ * @since_tizen 6.0 + * @param[in] ucsd The charset detector to be used. + * @param[out] matches_found Pointer to a variable that will be set to the number of charsets + * identified that are consistent with the input data. + * @param[out] ucsm A pointer to an array of pointers to #i18n_ucharset_match_h. + * This array, and the #i18n_ucharset_match_h instances it contains, are owned by the @ref CAPI_BASE_UTILS_I18N_UCHARSET_DETECTION_MODULE, + * and will remain valid until the detector @a ucsd is destroyed or modified. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_detect_all(i18n_ucharset_detector_h ucsd, int32_t *matches_found, i18n_ucharset_match_h **ucsm); + +/** + * @brief Gets the name of the charset represented by an #i18n_ucharset_match_h. + * @details The storage for the returned name string is owned by @a ucsm, + * and will remain valid while @a ucsm is valid. + * + * The name returned is suitable for use with the ICU conversion APIs. + * @since_tizen 6.0 + * @remarks The @a name is owned by @a ucsm and must not be released by the caller. + * @param[in] ucsm The charset match object. + * @param[out] name The name of the matching charset. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_get_name(const i18n_ucharset_match_h ucsm, const char **name); + +/** + * @brief Gets a confidence number for the quality of the match of the byte data with the charset. + * @details Confidence numbers range from zero to 100, with 100 representing complete confidence and zero representing no confidence. + * + * The confidence values are somewhat arbitrary. 
+ * They define an ordering within the results for any single detection operation + * but are not generally comparable between the results for different input. + * + * A confidence value of ten does have a general meaning - it is used for charsets + * that can represent the input data, but for which there is no other indication that suggests that the charset is the correct one. + * Pure 7 bit ASCII data, for example, is compatible with a great many charsets, + * most of which will appear as possible matches with a confidence of 10. + * @since_tizen 6.0 + * @param[in] ucsm The charset match object. + * @param[out] number A confidence number for the charset match. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_get_confidence(const i18n_ucharset_match_h ucsm, int32_t *number); + +/** + * @brief Gets the RFC 3066 code for the language of the input data. + * @details The Charset Detection service is intended primarily for detecting charsets, not language. + * For some, but not all, charsets, a language is identified as a byproduct of the detection process, + * and that is what is returned by this function. + * + * CAUTION:\n + * 1. Language information is not available for input data encoded in all charsets. In particular, no language is identified for UTF-8 input data.\n + * 2. Closely related languages may sometimes be confused. + * If more accurate language detection is required, a linguistic analysis package should be used.\n + * + * The storage for the returned @a code is owned by @a ucsm, and will remain valid while @a ucsm is valid. + * @since_tizen 6.0 + * @remarks The @a code is owned by @a ucsm and must not be released by the caller. + * @param[in] ucsm The charset match object. + * @param[out] code The RFC 3066 code for the language of the input data, or an empty string if the language could not be determined. 
+ * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_get_language(const i18n_ucharset_match_h ucsm, const char **code); + +/** + * @brief Gets the entire input text as an #i18n_uchar string, placing it into a caller-supplied buffer. + * @details A terminating NUL character will be appended to the buffer if space is available. + * + * The number of #i18n_uchar characters in the output string, not including the terminating NUL, is returned. + * + * If the supplied buffer is smaller than required to hold the output, + * the contents of the buffer are undefined. + * The full output string length (the number of #i18n_uchar characters) is returned as always, + * and can be used to allocate a buffer of the correct size. + * @since_tizen 6.0 + * @param[in] ucsm The charset match object. + * @param[out] buf An #i18n_uchar buffer to be filled with the converted text data. + * @param[in] cap The capacity of the buffer in #i18n_uchar. + * @param[out] number The number of #i18n_uchar in the output string. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_get_uchars(const i18n_ucharset_match_h ucsm, i18n_uchar *buf, int32_t cap, int32_t *number); + +/** + * @brief Gets an iterator over the set of all detectable charsets - over the charsets that are known to the charset detection service. + * @details The returned @a iterator provides access to the names of the charsets. + * + * The state of the Charset detector that is passed in does not affect the result of this function, + * but requiring a valid charset detector as a parameter ensures + * that the charset detection service has been safely initialized and that the required detection data is available. 
+ * + * Note: Multiple different charset encodings in the same family may use a single shared name in this implementation. + * For example, this method returns an array including "ISO-8859-1" (ISO Latin 1), + * but not including "windows-1252" (Windows Latin 1). + * However, actual detection result could be "windows-1252" + * when the input data matches Latin 1 code points with any points only available in "windows-1252". + * @since_tizen 6.0 + * @remarks The @a iterator should be released using #i18n_uenumeration_destroy(). + * @param[in] ucsd A Charset detector. + * @param[out] iterator An iterator providing access to the detectable charset names. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_get_all_detectable_charsets(i18n_ucharset_detector_h ucsd, i18n_uenumeration_h *iterator); + +/** + * @brief Gets whether input filtering is enabled for this charset detector. + * @details Input filtering removes text that appears to be HTML or XML markup + * from the input before applying the code page detection heuristics. + * @since_tizen 6.0 + * @param[in] ucsd The charset detector to check. + * @param[out] result TRUE if filtering is enabled. + * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_is_input_filter_enabled(i18n_ucharset_detector_h ucsd, i18n_ubool *result); + +/** + * @brief Enables filtering of input text. + * @details If filtering is enabled, text within angle brackets ("<" and ">") + * will be removed before detection, which will remove most HTML or XML markup. + * @since_tizen 6.0 + * @param[in] ucsd The charset detector to check. + * @param[in] filter True to enable input text filtering. + * @param[out] previous_setting The previous setting. 
+ * @return @c 0 on success, otherwise a negative error value + * @retval #I18N_ERROR_NONE Successful + * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter + * + */ +int i18n_ucsdet_enable_input_filter(i18n_ucharset_detector_h ucsd, i18n_ubool filter, i18n_ubool *previous_setting); + + +#ifdef __cplusplus +} +#endif + +/** + * @} + * @} + */ +#endif /* __UTILS_I18N_UCSDET_H__*/ diff --git a/src/utils_i18n_ucsdet.c b/src/utils_i18n_ucsdet.c new file mode 100644 index 0000000..8d1388d --- /dev/null +++ b/src/utils_i18n_ucsdet.c @@ -0,0 +1,189 @@ +/* +* Copyright (c) 2020 Samsung Electronics Co., Ltd All Rights Reserved +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ + +#include + +#include +#include + +int i18n_ucsdet_create(i18n_ucharset_detector_h *ucsd) +{ + retv_if(ucsd == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *ucsd = (UCharsetDetector *)ucsdet_open(&icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_destroy(i18n_ucharset_detector_h ucsd) +{ + retv_if(ucsd == NULL, I18N_ERROR_INVALID_PARAMETER); + + ucsdet_close((UCharsetDetector *)ucsd); + + return I18N_ERROR_NONE; +} + +int i18n_ucsdet_set_text(i18n_ucharset_detector_h ucsd, const char *text_in, int32_t len) +{ + retv_if(ucsd == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + ucsdet_setText(ucsd, text_in, len, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_set_declared_encoding(i18n_ucharset_detector_h ucsd, const char *encoding, int32_t length) +{ + retv_if(ucsd == NULL || encoding == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + ucsdet_setDeclaredEncoding(ucsd, encoding, length, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_detect(i18n_ucharset_detector_h ucsd, i18n_ucharset_match_h *ucsm) +{ + retv_if(ucsd == NULL || ucsm == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *ucsm = ucsdet_detect(ucsd, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_detect_all(i18n_ucharset_detector_h ucsd, int32_t *matches_found, i18n_ucharset_match_h **ucsm) +{ + retv_if(ucsd == NULL || matches_found == NULL || ucsm == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + 
+ *ucsm = ucsdet_detectAll(ucsd, matches_found, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_get_name(const i18n_ucharset_match_h ucsm, const char **name) +{ + retv_if(ucsm == NULL || name == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *name = ucsdet_getName(ucsm, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_get_confidence(const i18n_ucharset_match_h ucsm, int32_t *number) +{ + retv_if(ucsm == NULL || number == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *number = ucsdet_getConfidence(ucsm, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_get_language (const i18n_ucharset_match_h ucsm, const char **code) +{ + retv_if(ucsm == NULL || code == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *code = ucsdet_getLanguage(ucsm, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_get_uchars(const i18n_ucharset_match_h ucsm, i18n_uchar *buf, int32_t cap, int32_t *number) +{ + retv_if(ucsm == NULL || (cap > 0 && buf == NULL) || cap < 0 || number == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *number = ucsdet_getUChars(ucsm, buf, cap, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_get_all_detectable_charsets(i18n_ucharset_detector_h ucsd, i18n_uenumeration_h *iterator) +{ + retv_if(iterator == NULL, I18N_ERROR_INVALID_PARAMETER); + + i18n_error_code_e i18n_error; + UErrorCode icu_error = U_ZERO_ERROR; + + *iterator = 
ucsdet_getAllDetectableCharsets(ucsd, &icu_error); + ERR_MAPPING(icu_error, i18n_error); + I18N_ERR(i18n_error); + + return i18n_error; +} + +int i18n_ucsdet_is_input_filter_enabled(i18n_ucharset_detector_h ucsd, i18n_ubool *result) +{ + retv_if(ucsd == NULL || result == NULL, I18N_ERROR_INVALID_PARAMETER); + + *result = ucsdet_isInputFilterEnabled(ucsd); + + return I18N_ERROR_NONE; +} + +int i18n_ucsdet_enable_input_filter(i18n_ucharset_detector_h ucsd, i18n_ubool filter, i18n_ubool *previous_setting) +{ + retv_if(ucsd == NULL || previous_setting == NULL, I18N_ERROR_INVALID_PARAMETER); + + *previous_setting = ucsdet_enableInputFilter(ucsd, filter); + + return I18N_ERROR_NONE; +} + +