From e3ba82fe0944413c8787d58ae762c907386e4607 Mon Sep 17 00:00:00 2001 From: Hyunjee Kim Date: Wed, 15 Jan 2020 19:46:55 +0900 Subject: [PATCH] [ACR-1490][UScript] Module implementation Change-Id: I6ad347c263e918fc966882525dc06aa9548c2ffa Signed-off-by: Hyunjee Kim --- src/CMakeLists.txt | 2 + src/include/utils_i18n.h | 61 +++++++++++ src/include/utils_i18n_types.h | 50 ++++++++- src/include/utils_i18n_uscript.h | 217 +++++++++++++++++++++++++++++++++++++++ src/utils_i18n_uscript.c | 106 +++++++++++++++++++ 5 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 src/include/utils_i18n_uscript.h create mode 100644 src/utils_i18n_uscript.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 23e85f9..e7f9817 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,6 +42,7 @@ SET(BASEUTILS_SRCS utils_i18n_uchar_iter.c utils_i18n_unumsys.c utils_i18n_utext.c + utils_i18n_uscript.c utils_i18n_plural_rules.cpp utils_i18n_plural_format.cpp utils_i18n_immutable_idx.cpp @@ -105,5 +106,6 @@ INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_simple_date_form INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_loc_disp_names.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_unumsys.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_utext.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) +INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n_uscript.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${INC_DIR}/utils_i18n.h DESTINATION ${INCLUDE_INSTALL_DIR}/base) INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/${pc_name}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) diff --git a/src/include/utils_i18n.h b/src/include/utils_i18n.h index 1852758..5f7b95d 100644 --- a/src/include/utils_i18n.h +++ b/src/include/utils_i18n.h @@ -54,6 +54,7 @@ #include #include #include +#include /** * @file 
utils_i18n.h @@ -101,6 +102,7 @@ extern "C" { * - locale display names * - numbering system * - utext + * - unicode Script Information * * This module provides flexible generation of number or date format patterns and helps you format and parse dates/number for any locale. * The i18n module provides various features based on data from ICU. The following table shows the version of ICU used in each Tizen platform. @@ -271,6 +273,10 @@ extern "C" { * @ref CAPI_BASE_UTILS_I18N_UTEXT_MODULE * Abstract Unicode Text API. * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * Unicode Script Information + * * * * @section CAPI_BASE_UTILS_I18N_MODULE_MAPPING_TABLE Mapping Table @@ -4442,6 +4448,61 @@ extern "C" { * #i18n_utext_freeze * utext_freeze * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_codes + * uscript_getCode + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_name + * uscript_getName + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_short_name + * uscript_getShortName + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_script + * uscript_getScript + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_has_script + * uscript_hasScript + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_script_extensions + * uscript_getScriptExtensions + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_sample_string + * uscript_getSampleString + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_get_usage + * uscript_getUsage + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_is_right_to_left + * uscript_isRightToLeft + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_breaks_between_letters + * uscript_breaksBetweenLetters + * + * + * @ref CAPI_BASE_UTILS_I18N_USCRIPT_MODULE + * #i18n_uscript_is_cased + * uscript_isCased + * * */ diff --git 
a/src/include/utils_i18n_types.h b/src/include/utils_i18n_types.h index 8884623..b7d59ac 100644 --- a/src/include/utils_i18n_types.h +++ b/src/include/utils_i18n_types.h @@ -2888,7 +2888,7 @@ typedef void *i18n_uenumeration_h; /** - * @addtogroup CAPI_BASE_UTILS_I18N_UCHAR_MODULE + * @addtogroup CAPI_BASE_UTILS_I18N_USCRIPT_MODULE * @{ */ @@ -3060,10 +3060,56 @@ typedef enum { I18N_USCRIPT_ANATOLIAN_HIEROGLYPHS = 156,/**< Hluw */ I18N_USCRIPT_KHOJKI = 157,/**< Khoj */ I18N_USCRIPT_TIRHUTA = 158,/**< Tirh */ - I18N_USCRIPT_CODE_LIMIT = 159 /**< Count of i18n_uscript_code_e enumerators*/ + I18N_USCRIPT_CAUCASIAN_ALBANIAN = 159,/**< Aghb (Since 6.0)*/ + I18N_USCRIPT_MAHAJANI = 160,/**< Mahj (Since 6.0)*/ + I18N_USCRIPT_AHOM = 161,/**< Ahom (Since 6.0)*/ + I18N_USCRIPT_HATRAN = 162,/**< Hatr (Since 6.0)*/ + I18N_USCRIPT_MODI = 163,/**< Modi (Since 6.0)*/ + I18N_USCRIPT_MULTANI = 164,/**< Mult (Since 6.0)*/ + I18N_USCRIPT_PAU_CIN_HAU = 165,/**< Pauc (Since 6.0)*/ + I18N_USCRIPT_SIDDHAM = 166,/**< Sidd (Since 6.0)*/ + I18N_USCRIPT_ADLAM = 167,/**< Adlm (Since 6.0)*/ + I18N_USCRIPT_BHAIKSUKI = 168,/**< Bhks (Since 6.0)*/ + I18N_USCRIPT_MARCHEN = 169,/**< Marc (Since 6.0)*/ + I18N_USCRIPT_NEWA = 170,/**< Newa (Since 6.0)*/ + I18N_USCRIPT_OSAGE = 171,/**< Osge (Since 6.0)*/ + I18N_USCRIPT_HAN_WITH_BOPOMOFO = 172,/**< Hanb (Since 6.0)*/ + I18N_USCRIPT_JAMO = 173,/**< Jamo (Since 6.0)*/ + I18N_USCRIPT_SYMBOLS_EMOJI = 174,/**< Zsye (Since 6.0)*/ + I18N_USCRIPT_MASARAM_GONDI = 175,/**< Gonm (Since 6.0)*/ + I18N_USCRIPT_SOYOMBO = 176,/**< Soyo (Since 6.0)*/ + I18N_USCRIPT_ZANABAZAR_SQUARE = 177,/**< Zanb (Since 6.0)*/ + I18N_USCRIPT_DOGRA = 178,/**< Dogr (Since 6.0)*/ + I18N_USCRIPT_GUNJALA_GONDI = 179,/**< Gong (Since 6.0)*/ + I18N_USCRIPT_MAKASAR = 180,/**< Maka (Since 6.0)*/ + I18N_USCRIPT_MEDEFAIDRIN = 181,/**< Medf (Since 6.0)*/ + I18N_USCRIPT_HANIFI_ROHINGYA = 182,/**< Rohg (Since 6.0)*/ + I18N_USCRIPT_SOGDIAN = 183,/**< Sogd (Since 6.0)*/ + I18N_USCRIPT_OLD_SOGDIAN = 
184,/**< Sogo (Since 6.0)*/ + I18N_USCRIPT_ELYMAIC = 185,/**< Elym (Since 6.0)*/ + I18N_USCRIPT_NYIAKENG_PUACHUE_HMONG = 186,/**< Hmnp (Since 6.0)*/ + I18N_USCRIPT_NANDINAGARI = 187,/**< Nand (Since 6.0)*/ + I18N_USCRIPT_WANCHO = 188,/**< Wcho (Since 6.0)*/ + I18N_USCRIPT_CODE_LIMIT = 189 /**< Count of i18n_uscript_code_e enumerators*/ } i18n_uscript_code_e; /** + * @brief Script usage constants. + * See UAX #31 Unicode Identifier and Pattern Syntax. + * http://www.unicode.org/reports/tr31/#Table_Candidate_Characters_for_Exclusion_from_Identifiers + * @since_tizen 6.0 + */ +typedef enum { + I18N_USCRIPT_USAGE_NOT_ENCODED, /**< Not encoded in Unicode.*/ + I18N_USCRIPT_USAGE_UNKNOWN, /**< Unknown script usage.*/ + I18N_USCRIPT_USAGE_EXCLUDED, /**< Candidate for Exclusion from Identifiers.*/ + I18N_USCRIPT_USAGE_LIMITED_USE, /**< Limited Use script.*/ + I18N_USCRIPT_USAGE_ASPIRATIONAL, /**< Aspirational Use script.*/ + I18N_USCRIPT_USAGE_RECOMMENDED /**< Recommended script.*/ +} i18n_uscript_usage_e; + + +/** * @} */ diff --git a/src/include/utils_i18n_uscript.h b/src/include/utils_i18n_uscript.h new file mode 100644 index 0000000..85e73cd --- /dev/null +++ b/src/include/utils_i18n_uscript.h @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __UTILS_I18N_USCRIPT_H__
+#define __UTILS_I18N_USCRIPT_H__
+
+#include <utils_i18n_types.h>
+
+/**
+ * @file utils_i18n_uscript.h
+ * @version 0.1
+ * @brief utils_i18n_uscript
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @ingroup CAPI_BASE_UTILS_I18N_MODULE
+ * @defgroup CAPI_BASE_UTILS_I18N_USCRIPT_MODULE Uscript
+ * @brief Unicode Script Information.
+ * @section CAPI_BASE_UTILS_I18N_USCRIPT_MODULE_HEADER Required Header
+ *	\#include <utils_i18n.h>
+ *
+ * @section CAPI_BASE_UTILS_I18N_USCRIPT_MODULE_OVERVIEW Overview
+ * @details Unicode Script Information.
+ *
+ */
+
+/**
+ * @addtogroup CAPI_BASE_UTILS_I18N_USCRIPT_MODULE
+ * @{
+ */
+
+/**
+ * @brief Gets the script codes associated with the specified language.
+ * @details The language is described using either locale, ISO 15924 name or ISO 15924 abbreviation.
+ *          Example: If "Malayalam" or "Mlym" is given, the expected result is #I18N_USCRIPT_MALAYALAM.
+ *          Note: To search by short or long script alias only,
+ *          use #i18n_uchar_get_property_value_enum() instead.
+ *          That does a fast lookup with no access of the locale data.
+ * @since_tizen 6.0
+ * @remarks @a codes must be allocated before calling the function.
+ *          If the required capacity is greater than the capacity of @a codes, then the @a length is set to the required capacity and #I18N_ERROR_BUFFER_OVERFLOW is returned.
+ * @param[in] language The language, for which the script codes are to be retrieved
+ * @param[in,out] codes The array of codes associated with the specified language
+ * @param[in] capacity Capacity of the @a codes array
+ * @param[out] length The number of items written to the array, or the required capacity if the array's capacity is insufficient
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #I18N_ERROR_NONE Successful
+ * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter
+ * @retval #I18N_ERROR_OUT_OF_MEMORY Out of memory
+ * @retval #I18N_ERROR_BUFFER_OVERFLOW The supplied array @a codes is of insufficient capacity
+ *
+ */
+int i18n_uscript_get_codes(const char* language, i18n_uscript_code_e* codes, int32_t capacity, int32_t *length);
+
+/**
+ * @brief Gets the script name for the specified script code.
+ * @details Returns the long Unicode script name, if there is one. Otherwise returns the 4-letter ISO 15924 script code.
+ *          Example: If #I18N_USCRIPT_MALAYALAM is given, the expected output is "Malayalam".
+ * @since_tizen 6.0
+ * @param[in] script_code Uscript code enum
+ * @return long script name as given in PropertyValueAliases.txt, or the 4-letter code, or NULL if @a script_code is invalid
+ *
+ */
+const char* i18n_uscript_get_name(i18n_uscript_code_e script_code);
+
+/**
+ * @brief Gets the short script name for the specified script code.
+ * @details Returns the 4-letter ISO 15924 script code, which is the same as the short Unicode script name if Unicode has names for the script.
+ *          Example: If #I18N_USCRIPT_MALAYALAM is given, the expected output is "Mlym".
+ * @since_tizen 6.0
+ * @param[in] script_code Uscript code enum
+ * @return short script name (4-letter code), or NULL if @a script_code is invalid
+ *
+ */
+const char* i18n_uscript_get_short_name(i18n_uscript_code_e script_code);
+
+/**
+ * @brief Gets the script code associated with the given @a codepoint.
+ * @details If the specified @a codepoint is invalid, the script code returned is equal to 0.
+ *          Example: If 0x0D02 is given, the expected output is #I18N_USCRIPT_MALAYALAM.
+ * @since_tizen 6.0
+ * @param[in] codepoint #i18n_uchar32 @a codepoint
+ * @param[out] script_code The code of the script, that the specified @a codepoint belongs to.
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #I18N_ERROR_NONE Successful
+ * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter
+ * @retval #I18N_ERROR_OUT_OF_MEMORY Out of memory
+ *
+ */
+int i18n_uscript_get_script(i18n_uchar32 codepoint, i18n_uscript_code_e *script_code);
+
+/**
+ * @brief Gets a value indicating whether the Script Extensions of the specified @a codepoint contain the specified script.
+ * @details If @a codepoint does not have explicit Script Extensions, then this tests whether @a codepoint has the Script property value @a script_code.
+ *          Some characters are commonly used in multiple scripts. For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
+ * @since_tizen 6.0
+ * @param[in] codepoint Code point
+ * @param[in] script_code Script code
+ * @return TRUE if @a script_code is in Script Extensions (@a codepoint)
+ *
+ */
+i18n_ubool i18n_uscript_has_script(i18n_uchar32 codepoint, i18n_uscript_code_e script_code);
+
+/**
+ * @brief Gets the Script Extensions for the specified @a codepoint.
+ * @details
+ * - If @a codepoint does have Script Extensions, then the Script property value
+ *   (normally Common or Inherited) is not included.
+ * - If @a codepoint does not have Script Extensions, then the one Script code is written to the output array.
+ * - If @a codepoint is not a valid code point, then the one #I18N_USCRIPT_UNKNOWN code is written.
+ *
+ * Some characters are commonly used in multiple scripts.
+ * For more information, see UAX #24: http://www.unicode.org/reports/tr24/.
+ *
+ * @since_tizen 6.0
+ * @remarks @a scripts must be allocated before calling the function.
+ *          If the required capacity is greater than the capacity of @a scripts, then the @a length is set to the required capacity and #I18N_ERROR_BUFFER_OVERFLOW is returned.
+ * @param[in] codepoint Code point
+ * @param[in,out] scripts The array of Script Extensions for the specified @a codepoint
+ * @param[in] capacity Capacity of the @a scripts array
+ * @param[out] length The number of items written to the array, or the required capacity if the array's capacity is insufficient
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #I18N_ERROR_NONE Successful
+ * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter
+ * @retval #I18N_ERROR_OUT_OF_MEMORY Out of memory
+ * @retval #I18N_ERROR_BUFFER_OVERFLOW The supplied array @a scripts is of insufficient capacity
+ *
+ */
+int i18n_uscript_get_script_extensions(i18n_uchar32 codepoint, i18n_uscript_code_e *scripts, int32_t capacity, int32_t *length);
+
+/**
+ * @brief Gets the script sample character string.
+ * @details This string normally consists of one code point but might be longer. The string is empty if the script is not encoded.
+ * @since_tizen 6.0
+ * @remarks @a sample must be allocated before calling the function.
+ *          If the required capacity is greater than the capacity of @a sample, then the @a length is set to the required capacity and #I18N_ERROR_BUFFER_OVERFLOW is returned.
+ * @param[in] script Script code
+ * @param[in,out] sample The sample string for the specified @a script
+ * @param[in] capacity The number of #i18n_uchar characters that @a sample can hold
+ * @param[out] length The number of #i18n_uchar characters written to the string, or the required capacity if the string's capacity is insufficient
+ * @return @c 0 on success, otherwise a negative error value
+ * @retval #I18N_ERROR_NONE Successful
+ * @retval #I18N_ERROR_INVALID_PARAMETER Invalid function parameter
+ * @retval #I18N_ERROR_OUT_OF_MEMORY Out of memory
+ * @retval #I18N_ERROR_BUFFER_OVERFLOW The supplied array @a sample is of insufficient capacity
+ *
+ */
+int i18n_uscript_get_sample_string(i18n_uscript_code_e script, i18n_uchar *sample, int32_t capacity, int32_t *length);
+
+
+/**
+ * @brief Returns the script usage according to UAX #31 Unicode Identifier and Pattern Syntax.
+ * @details Returns #I18N_USCRIPT_USAGE_NOT_ENCODED if the script is not encoded in Unicode.
+ * @since_tizen 6.0
+ * @param[in] script Script code
+ * @return script usage
+ *
+ */
+i18n_uscript_usage_e i18n_uscript_get_usage(i18n_uscript_code_e script);
+
+/**
+ * @brief Gets a value indicating whether the script is written right-to-left.
+ * @details For example, Arab and Hebr.
+ * @since_tizen 6.0
+ * @param[in] script Script code
+ * @return TRUE if the script is right-to-left
+ *
+ */
+i18n_ubool i18n_uscript_is_right_to_left(i18n_uscript_code_e script);
+
+/**
+ * @brief Gets a value indicating whether the script allows line breaks between letters (excluding hyphenation).
+ * @details Such a script typically requires dictionary-based line breaking. For example, Hani and Thai.
+ * @since_tizen 6.0
+ * @param[in] script Script code
+ * @return TRUE if the script allows line breaks between letters
+ *
+ */
+i18n_ubool i18n_uscript_breaks_between_letters(i18n_uscript_code_e script);
+
+/**
+ * @brief Gets a value indicating whether case distinctions are customary in modern usage of the script.
+ * @details For example, Latn and Cyrl.
+ * @since_tizen 6.0
+ * @param[in] script Script code
+ * @return TRUE if the script is cased
+ *
+ */
+i18n_ubool i18n_uscript_is_cased(i18n_uscript_code_e script);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+/**
+ * @}
+ * @}
+ */
+#endif /* __UTILS_I18N_USCRIPT_H__*/
diff --git a/src/utils_i18n_uscript.c b/src/utils_i18n_uscript.c
new file mode 100644
index 0000000..e89a637
--- /dev/null
+++ b/src/utils_i18n_uscript.c
@@ -0,0 +1,106 @@
+/*
+* Copyright (c) 2020 Samsung Electronics Co., Ltd All Rights Reserved
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+#include <unicode/uscript.h>
+
+#include <utils_i18n_uscript.h>
+#include <utils_i18n_private.h>
+
+/*
+ * NOTE(review): the three include targets above were missing from the patch
+ * text as submitted. They were restored from the names this file uses: the ICU
+ * uscript_* API, the public module header, and the private header presumed to
+ * provide retv_if / ERR_MAPPING / I18N_ERR. Confirm before merging.
+ */
+
+/*
+ * Looks up the script codes for @language through uscript_getCode().
+ *
+ * The count returned by ICU is stored in *length; the ICU status is converted
+ * by ERR_MAPPING into the i18n error code that is returned. @codes is passed
+ * straight to ICU, which validates it together with @capacity.
+ */
+int i18n_uscript_get_codes(const char *language, i18n_uscript_code_e *codes, int32_t capacity, int32_t *length)
+{
+	/* *length is written unconditionally below, so reject NULL up front. */
+	retv_if(language == NULL || length == NULL, I18N_ERROR_INVALID_PARAMETER);
+
+	i18n_error_code_e i18n_error;
+	UErrorCode icu_error = U_ZERO_ERROR;
+
+	*length = uscript_getCode(language, (UScriptCode *)codes, capacity, &icu_error);
+	ERR_MAPPING(icu_error, i18n_error);
+	I18N_ERR(i18n_error);
+
+	return i18n_error;
+}
+
+/* Returns whatever uscript_getName() yields for @script_code. */
+const char *i18n_uscript_get_name(i18n_uscript_code_e script_code)
+{
+	return uscript_getName((UScriptCode)script_code);
+}
+
+/* Returns whatever uscript_getShortName() yields for @script_code. */
+const char *i18n_uscript_get_short_name(i18n_uscript_code_e script_code)
+{
+	return uscript_getShortName((UScriptCode)script_code);
+}
+
+/*
+ * Stores the script of @codepoint, as reported by uscript_getScript(), in
+ * *script_code and returns the mapped ICU status.
+ */
+int i18n_uscript_get_script(i18n_uchar32 codepoint, i18n_uscript_code_e *script_code)
+{
+	/* The output pointer is dereferenced below. */
+	retv_if(script_code == NULL, I18N_ERROR_INVALID_PARAMETER);
+
+	i18n_error_code_e i18n_error;
+	UErrorCode icu_error = U_ZERO_ERROR;
+
+	*script_code = (i18n_uscript_code_e)uscript_getScript(codepoint, &icu_error);
+	ERR_MAPPING(icu_error, i18n_error);
+	I18N_ERR(i18n_error);
+
+	return i18n_error;
+}
+
+/* Forwards to uscript_hasScript(). */
+i18n_ubool i18n_uscript_has_script(i18n_uchar32 codepoint, i18n_uscript_code_e script_code)
+{
+	return uscript_hasScript(codepoint, (UScriptCode)script_code);
+}
+
+/*
+ * Fills @scripts through uscript_getScriptExtensions() and stores the count
+ * returned by ICU in *length. Returns the mapped ICU status.
+ */
+int i18n_uscript_get_script_extensions(i18n_uchar32 codepoint, i18n_uscript_code_e *scripts, int32_t capacity, int32_t *length)
+{
+	retv_if(length == NULL, I18N_ERROR_INVALID_PARAMETER);
+
+	i18n_error_code_e i18n_error;
+	UErrorCode icu_error = U_ZERO_ERROR;
+
+	/* Write through the pointer; assigning to 'length' itself would drop the result. */
+	*length = uscript_getScriptExtensions(codepoint, (UScriptCode *)scripts, capacity, &icu_error);
+	ERR_MAPPING(icu_error, i18n_error);
+	I18N_ERR(i18n_error);
+
+	return i18n_error;
+}
+
+/*
+ * Copies the sample string of @script into @sample through
+ * uscript_getSampleString() and stores the length returned by ICU in *length.
+ * Returns the mapped ICU status.
+ */
+int i18n_uscript_get_sample_string(i18n_uscript_code_e script, i18n_uchar *sample, int32_t capacity, int32_t *length)
+{
+	/*
+	 * Out-of-range script codes are rejected with an i18n error code; the
+	 * script enumerator I18N_USCRIPT_INVALID_CODE is not a valid return value
+	 * for this function.
+	 */
+	retv_if(script >= I18N_USCRIPT_CODE_LIMIT || script <= I18N_USCRIPT_INVALID_CODE, I18N_ERROR_INVALID_PARAMETER);
+	retv_if(length == NULL, I18N_ERROR_INVALID_PARAMETER);
+
+	i18n_error_code_e i18n_error;
+	UErrorCode icu_error = U_ZERO_ERROR;
+
+	/* Write through the pointer; assigning to 'length' itself would drop the result. */
+	*length = uscript_getSampleString((UScriptCode)script, sample, capacity, &icu_error);
+	ERR_MAPPING(icu_error, i18n_error);
+	I18N_ERR(i18n_error);
+
+	return i18n_error;
+}
+
+/*
+ * Forwards to uscript_getUsage(). The cast relies on i18n_uscript_usage_e
+ * listing its enumerators in the same order as ICU's UScriptUsage.
+ */
+i18n_uscript_usage_e i18n_uscript_get_usage(i18n_uscript_code_e script)
+{
+	return (i18n_uscript_usage_e)uscript_getUsage((UScriptCode)script);
+}
+
+/* Forwards to uscript_isRightToLeft(). */
+i18n_ubool i18n_uscript_is_right_to_left(i18n_uscript_code_e script)
+{
+	return uscript_isRightToLeft((UScriptCode)script);
+}
+
+/* Forwards to uscript_breaksBetweenLetters(). */
+i18n_ubool i18n_uscript_breaks_between_letters(i18n_uscript_code_e script)
+{
+	return uscript_breaksBetweenLetters((UScriptCode)script);
+}
+
+/* Forwards to uscript_isCased(). */
+i18n_ubool i18n_uscript_is_cased(i18n_uscript_code_e script)
+{
+	return uscript_isCased((UScriptCode)script);
+}
+
-- 
2.7.4