revise packaging
[platform/core/pim/contacts-service.git] / common / ctsvc_normalize.c
1 /*
2  * Contacts Service
3  *
4  * Copyright (c) 2010 - 2015 Samsung Electronics Co., Ltd. All rights reserved.
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  */
19
20 #include <ctype.h>
21 #include <unicode/ulocdata.h>
22 #include <unicode/ustring.h>
23 #include <unicode/unorm.h>
24 #include <unicode/ucol.h>
25 #include <unicode/uset.h>
26
27 #include "contacts.h"
28 #include "ctsvc_internal.h"
29 #include "ctsvc_normalize.h"
30 #include "ctsvc_localize.h"
31 #include "ctsvc_localize_utils.h"
32
33 #define CTSVC_COMBINING_DIACRITICAL_MARKS_START 0x0300
34 #define CTSVC_COMBINING_DIACRITICAL_MARKS_END   0x036f
35
36 typedef struct {
37         UChar letter;
38         char start;
39         char end;
40 } hiragana_group_letter;
41
42 static hiragana_group_letter hiragana_group[13] = {
43         {0x3042, 0x41, 0x4a}, /* ぁ あ ぃ い ぅ う ぇ え ぉ お */
44         {0x3042, 0x94, 0x94}, /* ゔ */
45         {0x304b, 0x4b, 0x54}, /* か が き ぎ く ぐ け げ こ ご */
46         {0x304b, 0x95, 0x96}, /* ゕ ゖ */
47         {0x3055, 0x55, 0x5e}, /* さ ざ し じ す ず せ ぜ そ ぞ */
48         {0x305f, 0x5f, 0x69}, /* た だ ち ぢ っ つ づ て で と ど */
49         {0x306a, 0x6a, 0x6e}, /* な に ぬ ね の */
50         {0x306f, 0x6f, 0x7d}, /* は ば ぱ ひ び ぴ ふ ぶ ぷ へ べ ぺ ほ ぼ ぽ */
51         {0x307e, 0x7e, 0x82}, /* ま み む め も */
52         {0x3084, 0x83, 0x88}, /* ゃ や ゅ ゆ ょ よ*/
53         {0x3089, 0x89, 0x8d}, /* ら り る れ ろ */
54         {0x308f, 0x8e, 0x92}, /* ゎ わ */
55         {0x3093, 0x93, 0x93}, /* ゐ ゑ を */
56 };
57
58 static int __ctsvc_remove_special_char(const char *src, char *dest, int dest_size)
59 {
60         int s_pos = 0, d_pos = 0, char_type, src_size;
61
62         if (NULL == src) {
63                 /* LCOV_EXCL_START */
64                 ERR("The parameter(src) is NULL");
65                 dest[d_pos] = '\0';
66                 return 0;
67                 /* LCOV_EXCL_STOP */
68         }
69         src_size = strlen(src);
70
71         while (src[s_pos] != 0) {
72                 char_type = ctsvc_check_utf8(src[s_pos]);
73
74                 if (0 < char_type && char_type < dest_size - d_pos && char_type <= src_size - s_pos) {
75                         memcpy(dest+d_pos, src+s_pos, char_type);
76                         d_pos += char_type;
77                         s_pos += char_type;
78                 } else {
79                         /* LCOV_EXCL_START */
80                         ERR("The parameter(src:%s) has invalid character set", src);
81                         dest[d_pos] = '\0';
82                         return CONTACTS_ERROR_INVALID_PARAMETER;
83                         /* LCOV_EXCL_STOP */
84                 }
85         }
86
87         dest[d_pos] = '\0';
88         return d_pos;
89 }
90
91 static inline int __ctsvc_collation_str(const char *src, char **dest)
92 {
93         int32_t size = 0;
94         UErrorCode status = U_ZERO_ERROR;
95         UChar *tmp_result = NULL;
96         UCollator *collator;
97
98         char *region = strdup(ctsvc_get_langset());
99         if (NULL == region) {
100                 /* LCOV_EXCL_START */
101                 ERR("strdup() Fail");
102                 return CONTACTS_ERROR_OUT_OF_MEMORY;
103                 /* LCOV_EXCL_STOP */
104         }
105
106         char *dot = strchr(region, '.');
107         if (dot)
108                 *dot = '\0';
109
110         collator = ucol_open(region, &status);
111         if (U_FAILURE(status)) {
112                 /* LCOV_EXCL_START */
113                 ERR("ucol_open Fail(%s)", u_errorName(status));
114                 free(region);
115                 return CONTACTS_ERROR_SYSTEM;
116                 /* LCOV_EXCL_STOP */
117         }
118
119         /* TODO: ucol_setAttribute is not called */
120         if (U_FAILURE(status)) {
121                 /* LCOV_EXCL_START */
122                 ERR("ucol_setAttribute Fail(%s)", u_errorName(status));
123                 free(region);
124                 ucol_close(collator);
125                 return CONTACTS_ERROR_SYSTEM;
126                 /* LCOV_EXCL_STOP */
127         }
128
129         u_strFromUTF8(NULL, 0, &size, src, strlen(src), &status);
130         if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
131                 /* LCOV_EXCL_START */
132                 ERR("u_strFromUTF8 to get the dest length Fail(%s)", u_errorName(status));
133                 free(region);
134                 ucol_close(collator);
135                 return CONTACTS_ERROR_SYSTEM;
136                 /* LCOV_EXCL_STOP */
137         }
138         status = U_ZERO_ERROR;
139         tmp_result = calloc(1, sizeof(UChar) * (size + 1));
140         u_strFromUTF8(tmp_result, size + 1, NULL, src, -1, &status);
141         if (U_FAILURE(status)) {
142                 /* LCOV_EXCL_START */
143                 ERR("u_strFromUTF8 Fail(%s)", u_errorName(status));
144                 free(region);
145                 free(tmp_result);
146                 ucol_close(collator);
147                 return CONTACTS_ERROR_SYSTEM;
148                 /* LCOV_EXCL_STOP */
149         }
150
151         size = ucol_getSortKey(collator, tmp_result, -1, NULL, 0);
152         *dest = calloc(1, sizeof(uint8_t) * (size + 1));
153         size = ucol_getSortKey(collator, tmp_result, -1, (uint8_t *)*dest, size + 1);
154
155         ucol_close(collator);
156         free(tmp_result);
157         free(region);
158         return CONTACTS_ERROR_NONE;
159 }
160
161 int ctsvc_collation_str(char *src, char **dest)
162 {
163         int ret;
164         char temp[SAFE_STRLEN(src) + 1];
165
166         ret = __ctsvc_remove_special_char(src, temp, sizeof(temp));
167         WARN_IF(ret < CONTACTS_ERROR_NONE, "__ctsvc_remove_special_char() Fail(%d)", ret);
168
169         return __ctsvc_collation_str(temp, dest);
170 }
171
172 static int __ctsvc_normalize_str(const char *src, char **dest, bool is_index)
173 {
174         int32_t tmp_size = 100;
175         int32_t upper_size;
176         int32_t size = 100;
177         UErrorCode status = 0;
178         UChar *tmp_result = NULL;
179         UChar *tmp_upper = NULL;
180         UChar *result = NULL;
181
182         tmp_result = calloc(1, sizeof(UChar)*(tmp_size+1));
183         if (NULL == tmp_result) {
184                 /* LCOV_EXCL_START */
185                 ERR("calloc() Fail");
186                 return CONTACTS_ERROR_OUT_OF_MEMORY;
187                 /* LCOV_EXCL_STOP */
188         }
189
190         u_strFromUTF8(tmp_result, tmp_size + 1, &tmp_size, src, -1, &status);
191         if (status == U_BUFFER_OVERFLOW_ERROR) {
192                 status = U_ZERO_ERROR;
193                 free(tmp_result);
194                 tmp_result = calloc(1, sizeof(UChar) * (tmp_size + 1));
195                 if (NULL == tmp_result) {
196                         /* LCOV_EXCL_START */
197                         ERR("calloc() Fail");
198                         return CONTACTS_ERROR_OUT_OF_MEMORY;
199                         /* LCOV_EXCL_STOP */
200                 }
201
202                 u_strFromUTF8(tmp_result, tmp_size + 1, NULL, src, -1, &status);
203                 if (U_FAILURE(status)) {
204                         /* LCOV_EXCL_START */
205                         ERR("u_strFromUTF8()Fail(%s)", u_errorName(status));
206                         free(tmp_result);
207                         return CONTACTS_ERROR_SYSTEM;
208                         /* LCOV_EXCL_STOP */
209                 }
210         } else if (U_FAILURE(status)) {
211                 /* LCOV_EXCL_START */
212                 ERR("u_strFromUTF8() Fail(%s)", u_errorName(status));
213                 free(tmp_result);
214                 return CONTACTS_ERROR_SYSTEM;
215                 /* LCOV_EXCL_STOP */
216         }
217
218         tmp_upper = calloc(1, sizeof(UChar)*(tmp_size+1));
219         if (NULL == tmp_upper) {
220                 /* LCOV_EXCL_START */
221                 ERR("calloc() Fail");
222                 free(tmp_result);
223                 return CONTACTS_ERROR_OUT_OF_MEMORY;
224                 /* LCOV_EXCL_STOP */
225         }
226
227         upper_size = u_strToUpper(tmp_upper, tmp_size+1, tmp_result, -1, NULL, &status);
228         if (status == U_BUFFER_OVERFLOW_ERROR) {
229                 status = U_ZERO_ERROR;
230                 free(tmp_upper);
231                 tmp_upper = calloc(1, sizeof(UChar) * (upper_size + 1));
232                 if (NULL == tmp_upper) {
233                         /* LCOV_EXCL_START */
234                         ERR("calloc() Fail");
235                         free(tmp_result);
236                         return CONTACTS_ERROR_OUT_OF_MEMORY;
237                         /* LCOV_EXCL_STOP */
238                 }
239
240                 u_strFromUTF8(tmp_upper, upper_size + 1, NULL, src, -1, &status);
241                 if (U_FAILURE(status)) {
242                         /* LCOV_EXCL_START */
243                         ERR("u_strFromUTF8()Fail(%s)", u_errorName(status));
244                         free(tmp_result);
245                         free(tmp_upper);
246                         return CONTACTS_ERROR_SYSTEM;
247                         /* LCOV_EXCL_STOP */
248                 }
249         } else if (U_FAILURE(status)) {
250                 /* LCOV_EXCL_START */
251                 ERR("u_strToUpper() Fail(%s)", u_errorName(status));
252                 free(tmp_result);
253                 free(tmp_upper);
254                 return CONTACTS_ERROR_SYSTEM;
255                 /* LCOV_EXCL_STOP */
256         }
257
258         result = calloc(1, sizeof(UChar)*(size+1));
259         if (NULL == result) {
260                 /* LCOV_EXCL_START */
261                 ERR("calloc() Fail");
262                 free(tmp_result);
263                 free(tmp_upper);
264                 return CONTACTS_ERROR_OUT_OF_MEMORY;
265                 /* LCOV_EXCL_STOP */
266         }
267
268         size = unorm_normalize(tmp_upper, -1, UNORM_NFD, 0, result, size+1, &status);
269         if (status == U_BUFFER_OVERFLOW_ERROR) {
270                 status = U_ZERO_ERROR;
271                 free(result);
272                 result = calloc(1, sizeof(UChar)*(size + 1));
273                 if (NULL == result) {
274                         /* LCOV_EXCL_START */
275                         ERR("calloc() Fail");
276                         free(tmp_result);
277                         free(tmp_upper);
278                         return CONTACTS_ERROR_OUT_OF_MEMORY;
279                         /* LCOV_EXCL_STOP */
280                 }
281
282                 unorm_normalize(tmp_upper, -1, UNORM_NFD, 0, result, size+1, &status);
283                 if (U_FAILURE(status)) {
284                         /* LCOV_EXCL_START */
285                         ERR("unorm_normalize() Fail(%s)", u_errorName(status));
286                         free(tmp_result);
287                         free(tmp_upper);
288                         free(result);
289                         return CONTACTS_ERROR_SYSTEM;
290                         /* LCOV_EXCL_STOP */
291                 }
292         } else if (U_FAILURE(status)) {
293                 /* LCOV_EXCL_START */
294                 ERR("unorm_normalize() Fail(%s)", u_errorName(status));
295                 free(tmp_result);
296                 free(tmp_upper);
297                 free(result);
298                 return CONTACTS_ERROR_SYSTEM;
299                 /* LCOV_EXCL_STOP */
300         }
301
302         ctsvc_check_language(result);
303         if (is_index)
304                 ctsvc_extra_index_normalize(result, size);
305         else
306                 ctsvc_extra_normalize(result, size);
307
308         /* remove diacritical : U+3000 ~ U+034F */
309         int i, j;
310         UChar *temp_result = NULL;
311         temp_result = calloc(1, sizeof(UChar)*(size+1));
312         if (NULL == temp_result) {
313                 /* LCOV_EXCL_START */
314                 ERR("calloc() Fail");
315                 free(tmp_result);
316                 free(tmp_upper);
317                 free(result);
318                 return CONTACTS_ERROR_OUT_OF_MEMORY;
319                 /* LCOV_EXCL_STOP */
320         }
321
322         bool replaced = false;
323         for (i = 0, j = 0; i < size; i++) {
324                 if (CTSVC_COMPARE_BETWEEN((UChar)CTSVC_COMBINING_DIACRITICAL_MARKS_START,
325                                         result[i], (UChar)CTSVC_COMBINING_DIACRITICAL_MARKS_END)) {
326                         replaced = true;
327                 } else {
328                         temp_result[j++] = result[i];
329                 }
330         }
331
332         if (replaced) {
333                 temp_result[j] = 0x0;
334                 free(result);
335                 result = temp_result;
336         } else {
337                 free(temp_result);
338         }
339
340         u_strToUTF8(NULL, 0, &size, result, -1, &status);
341         status = U_ZERO_ERROR;
342         *dest = calloc(1, sizeof(char) * (size+1));
343         if (NULL == *dest) {
344                 /* LCOV_EXCL_START */
345                 ERR("calloc() Fail");
346                 free(tmp_result);
347                 free(tmp_upper);
348                 free(result);
349                 return CONTACTS_ERROR_OUT_OF_MEMORY;
350                 /* LCOV_EXCL_STOP */
351         }
352
353         u_strToUTF8(*dest, size+1, NULL, result, -1, &status);
354         if (U_FAILURE(status)) {
355                 /* LCOV_EXCL_START */
356                 ERR("u_strToUTF8() Fail(%s)", u_errorName(status));
357                 free(*dest);
358                 *dest = NULL;
359                 free(tmp_result);
360                 free(tmp_upper);
361                 free(result);
362                 return CONTACTS_ERROR_SYSTEM;
363                 /* LCOV_EXCL_STOP */
364         }
365         free(tmp_result);
366         free(tmp_upper);
367         free(result);
368         return CONTACTS_ERROR_NONE;
369 }
370
371 static int __ctsvc_convert_halfwidth_ascii_and_symbol(const char *src, UChar *dest, int dest_size, int* str_size)
372 {
373         int i;
374         int32_t size = dest_size;
375         UErrorCode status = 0;
376
377         u_strFromUTF8(dest, dest_size, &size, src, strlen(src), &status);
378         if (U_FAILURE(status)) {
379                 /* LCOV_EXCL_START */
380                 ERR("u_strFromUTF8() Fail(%s)", u_errorName(status));
381                 return CONTACTS_ERROR_SYSTEM;
382                 /* LCOV_EXCL_STOP */
383         }
384
385         *str_size = size;
386
387         /* full width -> half width */
388         for (i = 0; i < size; i++) {
389                 /* FF00 ~ FF60: Fullwidth ASCII variants */
390                 if (CTSVC_COMPARE_BETWEEN((UChar)0xFF00, dest[i], (UChar)0xFF60)) {
391                         int unicode_value1 = 0;
392                         int unicode_value2 = 0;
393                         unicode_value1 = 0x0;
394                         unicode_value2 = (0xFF & dest[i]) + 0x20;
395                         dest[i] = unicode_value1 << 8 | unicode_value2;
396                 } else if (CTSVC_COMPARE_BETWEEN((UChar)0xFFE0, dest[i], (UChar)0xFFE6)) {
397                         /* FFE0~FFE6: Fullwidth symbol variants */
398                         if (dest[i] == (UChar)0xFFE0)
399                                 dest[i] = (UChar)0x00A2;
400                         else if (dest[i] == (UChar)0xFFE1)
401                                 dest[i] = (UChar)0x00A3;
402                         else if (dest[i] == (UChar)0xFFE2)
403                                 dest[i] = (UChar)0x00AC;
404                         else if (dest[i] == (UChar)0xFFE3)
405                                 dest[i] = (UChar)0x00AF;
406                         else if (dest[i] == (UChar)0xFFE4)
407                                 dest[i] = (UChar)0x00A6;
408                         else if (dest[i] == (UChar)0xFFE5)
409                                 dest[i] = (UChar)0x00A5;
410                         else if (dest[i] == (UChar)0xFFE6)
411                                 dest[i] = (UChar)0x20A9;
412                         /* else */
413                 } /* else */
414
415         }
416
417         dest[size] = 0x00;
418         return CONTACTS_ERROR_NONE;
419 }
420
421 #define LARGE_BUFFER_SIZE 100
422
423 int ctsvc_get_halfwidth_string(const char *src, char **dest, int *dest_size)
424 {
425         UChar unicodes[LARGE_BUFFER_SIZE+1];
426         int ustr_size = 0;
427
428         if (CONTACTS_ERROR_NONE != __ctsvc_convert_halfwidth_ascii_and_symbol(src, unicodes, LARGE_BUFFER_SIZE, &ustr_size)) {
429                 /* LCOV_EXCL_START */
430                 ERR("__ctsvc_convert_halfwidth_ascii_and_symbol() Fail");
431                 return CONTACTS_ERROR_SYSTEM;
432                 /* LCOV_EXCL_STOP */
433         }
434
435         UErrorCode status = 0;
436
437         /* pre-flighting */
438         int size = 0;
439         u_strToUTF8(NULL, 0, &size, unicodes, -1, &status);
440         status = U_ZERO_ERROR;
441         *dest = calloc(1, sizeof(char) * (size+1));
442
443         u_strToUTF8(*dest, size+1, dest_size, unicodes, ustr_size, &status);
444         if (U_FAILURE(status)) {
445                 /* LCOV_EXCL_START */
446                 ERR("u_strToUTF8() Fail(%s)", u_errorName(status));
447
448                 free(*dest);
449                 *dest = NULL;
450                 *dest_size = 0;
451
452                 return CONTACTS_ERROR_SYSTEM;
453                 /* LCOV_EXCL_STOP */
454         }
455
456         return CONTACTS_ERROR_NONE;
457 }
458
459 int ctsvc_normalize_str(const char *src, char **dest)
460 {
461         int ret = CONTACTS_ERROR_NONE;
462         char temp[strlen(src) + 1];
463
464         ret = __ctsvc_remove_special_char(src, temp, strlen(src) + 1);
465         RETVM_IF(ret < CONTACTS_ERROR_NONE, ret, "__ctsvc_remove_special_char() Fail(%d)", ret);
466
467         ret = __ctsvc_normalize_str(temp, dest, false);
468         return ret;
469 }
470
471 static void __ctsvc_convert_japanese_group_letter(char *dest)
472 {
473         int i, size, dest_len;
474         UErrorCode status = 0;
475         UChar tmp_result[2];
476         UChar result[2] = {0x00};
477         int unicode_value;
478
479         dest_len = strlen(dest) + 1;
480         u_strFromUTF8(tmp_result, array_sizeof(tmp_result), NULL, dest, -1, &status);
481         RETM_IF(U_FAILURE(status), "u_strFromUTF8() Fail(%s)", u_errorName(status));
482
483         unicode_value = (0xFF & (tmp_result[0]));
484
485         for (i = 0; i < 13; i++) {
486                 if (hiragana_group[i].start <= unicode_value
487                                 && unicode_value <= hiragana_group[i].end)
488                         result[0] = hiragana_group[i].letter;
489         }
490
491         u_strToUTF8(dest, dest_len, &size, result, -1, &status);
492         RETM_IF(U_FAILURE(status), "u_strToUTF8() Fail(%s)", u_errorName(status));
493
494 }
495
/*
 * Tell whether src starts with the byte sequence E2 80 A6 (UTF-8 encoding of
 * U+2026, horizontal ellipsis).
 *
 * The bytes are compared as unsigned char: a plain char may be signed, in
 * which case it can never compare equal to 0xe2, 0x80 or 0xa6.
 *
 * @param src  NUL-terminated string; bytes after an early terminator are not
 *             read because the comparison short-circuits.
 * @return true when the first three bytes match, false otherwise.
 */
static bool __ctsvc_check_range_out_index(const char src[])
{
        const unsigned char *bytes = (const unsigned char *)src;

        if (bytes[0] == 0xe2 && bytes[1] == 0x80 && bytes[2] == 0xa6)
                return true;

        return false;
}
503
504 int ctsvc_normalize_index(const char *src, char **dest)
505 {
506         int ret = CONTACTS_ERROR_NONE;
507         char first_str[10] = {0};
508         int length = 0;
509
510         if (first_str[0] == '\0' || __ctsvc_check_range_out_index(first_str)) {
511                 length = ctsvc_check_utf8(src[0]);
512                 RETVM_IF(length <= 0, CONTACTS_ERROR_INTERNAL, "check_utf8() Fail");
513
514                 memset(first_str, 0x00, sizeof(first_str));
515                 strncpy(first_str, src, length);
516                 if (length != strlen(first_str)) {
517                         /* LCOV_EXCL_START */
518                         ERR("length : %d, first_str : %s, strlne : %d", length, first_str, strlen(first_str));
519                         return CONTACTS_ERROR_INVALID_PARAMETER;
520                         /* LCOV_EXCL_STOP */
521                 }
522         }
523         ret = __ctsvc_normalize_str(first_str, dest, true);
524         RETVM_IF(dest == NULL, ret, "__ctsvc_normalize_str() Fail");
525
526         if ((*dest)[0] != '\0') {
527                 length = ctsvc_check_utf8((*dest)[0]);
528                 (*dest)[length] = '\0';
529         }
530
531         if (ret == CTSVC_LANG_JAPANESE)
532                 __ctsvc_convert_japanese_group_letter(*dest);
533
534         return ret;
535 }
536
537