2 * Copyright 1998-2008 The OpenLDAP Foundation.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted only as authorized by the OpenLDAP
9 * A copy of this license is available in file LICENSE in the
10 * top-level directory of the distribution or, alternatively, at
11 * <https://www.OpenLDAP.org/license.html>.
13 /* Copyright 2001 Computing Research Labs, New Mexico State University
15 * Permission is hereby granted, free of charge, to any person obtaining a
16 * copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
22 * The above copyright notice and this permission notice shall be included in
23 * all copies or substantial portions of the Software.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
29 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
30 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
31 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
35 * This work is part of OpenLDAP Software <https://www.openldap.org/>.
36 * $OpenLDAP: pkg/ldap/libraries/liblunicode/ucdata/ucpgba.c,v 1.9 2008/01/07 23:20:05 kurt Exp $
37 * $Id: ucpgba.c,v 1.5 2001/01/02 18:46:20 mleisher Exp $
42 #include "k5-unicode.h"
/*
 * These macros are used while reordering RTL runs of text for the special
 * case of non-spacing characters being in runs of weakly directional text.
 * They check for weak and non-spacing, and digits and non-spacing.
 *
 * Both forward the character code to ucisprop() together with two property
 * masks.  The argument is parenthesized so that any expression can be
 * passed safely.
 */
#define ISWEAKSPECIAL(cc)  ucisprop((cc), UC_EN|UC_ES|UC_MN, UC_ET|UC_AN|UC_CS)
#define ISDIGITSPECIAL(cc) ucisprop((cc), UC_ND|UC_MN, 0)
/*
 * These macros are used while breaking a string into runs of text in
 * different directions.  Descriptions:
 *
 * ISLTR_LTR - Test for members of an LTR run in an LTR context.  This looks
 *             for characters with ltr, non-spacing, weak, and neutral
 *             properties.
 *
 * ISRTL_RTL - Test for members of an RTL run in an RTL context.  This looks
 *             for characters with rtl, non-spacing, weak, and neutral
 *             properties.
 *
 * ISRTL_NEUTRAL  - Test for RTL or neutral characters.
 *
 * ISWEAK_NEUTRAL - Test for weak or neutral characters.
 *
 * Each macro forwards the character code and two property masks to
 * ucisprop().  Note that ISLTR_LTR does not include UC_AN in its second
 * mask while ISRTL_RTL does.  The argument is parenthesized so that any
 * expression can be passed safely.
 */
#define ISLTR_LTR(cc) ucisprop((cc), UC_L|UC_MN|UC_EN|UC_ES,\
                               UC_ET|UC_CS|UC_B|UC_S|UC_WS|UC_ON)

#define ISRTL_RTL(cc) ucisprop((cc), UC_R|UC_MN|UC_EN|UC_ES,\
                               UC_ET|UC_AN|UC_CS|UC_B|UC_S|UC_WS|UC_ON)

#define ISRTL_NEUTRAL(cc) ucisprop((cc), UC_R, UC_B|UC_S|UC_WS|UC_ON)
#define ISWEAK_NEUTRAL(cc) ucisprop((cc), UC_EN|UC_ES, \
                                    UC_B|UC_S|UC_WS|UC_ON|UC_ET|UC_AN|UC_CS)
/*
 * This table is temporarily hard-coded here until it can be constructed
 * automatically somehow.
 *
 * Layout: a flat list of (character, counterpart) pairs.  Even indexes hold
 * the character to look up and the following odd index holds its
 * replacement.  Every pair is listed in both directions, so each group of
 * four entries reads a, b, b, a.  The table is only ever read, hence const.
 */
static const unsigned long _symmetric_pairs[] = {
    0x0028, 0x0029, 0x0029, 0x0028, 0x003C, 0x003E, 0x003E, 0x003C,
    0x005B, 0x005D, 0x005D, 0x005B, 0x007B, 0x007D, 0x007D, 0x007B,
    0x2045, 0x2046, 0x2046, 0x2045, 0x207D, 0x207E, 0x207E, 0x207D,
    0x208D, 0x208E, 0x208E, 0x208D, 0x3008, 0x3009, 0x3009, 0x3008,
    0x300A, 0x300B, 0x300B, 0x300A, 0x300C, 0x300D, 0x300D, 0x300C,
    0x300E, 0x300F, 0x300F, 0x300E, 0x3010, 0x3011, 0x3011, 0x3010,
    0x3014, 0x3015, 0x3015, 0x3014, 0x3016, 0x3017, 0x3017, 0x3016,
    0x3018, 0x3019, 0x3019, 0x3018, 0x301A, 0x301B, 0x301B, 0x301A,
    0xFD3E, 0xFD3F, 0xFD3F, 0xFD3E, 0xFE59, 0xFE5A, 0xFE5A, 0xFE59,
    0xFE5B, 0xFE5C, 0xFE5C, 0xFE5B, 0xFE5D, 0xFE5E, 0xFE5E, 0xFE5D,
    0xFF08, 0xFF09, 0xFF09, 0xFF08, 0xFF3B, 0xFF3D, 0xFF3D, 0xFF3B,
    0xFF5B, 0xFF5D, 0xFF5D, 0xFF5B, 0xFF62, 0xFF63, 0xFF63, 0xFF62,
};

/* Number of entries (not pairs) in _symmetric_pairs. */
static const int _symmetric_pairs_size =
    (int) (sizeof(_symmetric_pairs) / sizeof(_symmetric_pairs[0]));
108 * This routine looks up the other form of a symmetric pair.
111 _ucsymmetric_pair(unsigned long c)
115 for (i = 0; i < _symmetric_pairs_size; i += 2) {
116 if (_symmetric_pairs[i] == c)
117 return _symmetric_pairs[i+1];
/*
 * This routine creates a new run, copies the text into it, links it into the
 * logical text order chain and returns it to the caller to be linked into
 * the visual text order chain.
 */
/*
 * _add_run: create a run covering src[start, end) with the given direction,
 * fill its chars and positions arrays and append it to the logical chain of
 * str.
 *
 *   str       - string whose logical_first / logical_last chain receives
 *               the run
 *   src       - source text, read from index start up to but excluding end
 *   direction - compared against UCPGBA_RTL to choose between the reversed
 *               and the direct copy
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the local declarations, the else keywords and the final return are not
 * shown, and every line still begins with its number from the original
 * file.  The comments below cover only the statements that are visible.
 */
128 _add_run(ucstring_t *str, unsigned long *src,
129 unsigned long start, unsigned long end, int direction)
/* The malloc result is dereferenced on the next line without a NULL check. */
134 run = (ucrun_t *) malloc(sizeof(ucrun_t));
135 run->visual_next = run->visual_prev = 0;
136 run->direction = direction;
/*
 * A single allocation of 2 * (end - start) elements backs both arrays:
 * chars uses the first half and positions points at the second half.
 * This malloc result is not checked either.
 */
140 run->chars = (unsigned long *)
141 malloc(sizeof(unsigned long) * ((end - start) << 1));
142 run->positions = run->chars + (end - start);
148 if (direction == UCPGBA_RTL) {
150 * Copy the source text into the run in reverse order and select
151 * replacements for the pairwise punctuation and the <> characters.
/*
 * i counts up through the run while t counts down from end - 1, so
 * positions[i] records the source index of the i-th stored character.
 * The loop ends when start, incremented on every pass, reaches end.
 */
153 for (i = 0, t = end - 1; start < end; start++, t--, i++) {
154 run->positions[i] = t;
/*
 * Characters reported by ucissymmetric, plus < and >, are stored as the
 * counterpart returned by _ucsymmetric_pair.
 */
155 if (ucissymmetric(src[t]) || src[t] == '<' || src[t] == '>')
156 run->chars[i] = _ucsymmetric_pair(src[t]);
/* Presumably the else branch (the else line is not shown): plain copy. */
158 run->chars[i] = src[t];
162 * Copy the source text into the run directly.
/* Direct copy: element i - start of the run maps to source index i. */
164 for (i = start; i < end; i++) {
165 run->positions[i - start] = i;
166 run->chars[i - start] = src[i];
171 * Add the run to the logical list for cursor traversal.
/* An empty logical chain gets the run as both its first and last element. */
173 if (str->logical_first == 0)
174 str->logical_first = str->logical_last = run;
/*
 * Presumably an else block (not shown): link the run after the current last
 * run and make it the new last.
 * NOTE(review): no visible line sets logical_next of the new run, or its
 * logical_prev when the chain was empty; confirm they are initialised on
 * the lines missing from this listing.
 */
176 run->logical_prev = str->logical_last;
177 str->logical_last->logical_next = run;
178 str->logical_last = run;
/*
 * _ucadd_rtl_segment: split source[start, ...) into runs and splice each
 * run into the visual chain of str.  The upper bound used throughout is a
 * variable named end; its declaration (presumably a fourth parameter on the
 * continuation line of the signature) is not shown.
 *
 * Visible steps, repeated while s is below end:
 *   1. advance e over characters accepted by ISRTL_NEUTRAL and add the span
 *      s..e as an RTL run;
 *   2. advance e over characters accepted by ISWEAKSPECIAL and add that
 *      span as an LTR run;
 *   3. advance e over weak non-digit characters without adding a run, so
 *      they join the next pass or the trailing RTL run added after the loop.
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the local declarations, the else keywords, the guards around each
 * _add_run call and the statement controlled by the digit test are not
 * shown, and every line still begins with its number from the original
 * file.  The comments below cover only the statements that are visible.
 */
185 _ucadd_rtl_segment(ucstring_t *str, unsigned long *source, unsigned long start,
192 * This is used to splice runs into strings with overall LTR direction.
193 * The `lrun' variable will never be NULL because at least one LTR run was
194 * added before this RTL run.
/*
 * lrun is captured once, before any run of this segment is added, and is
 * not reassigned in the visible lines.
 */
196 lrun = str->visual_last;
198 for (e = s = start; s < end;) {
/* Empty-bodied loop: only moves e past the RTL and neutral characters. */
199 for (; e < end && ISRTL_NEUTRAL(source[e]); e++) ;
202 run = _add_run(str, source, s, e, UCPGBA_RTL);
205 * Add the run to the visual list for cursor traversal.
207 if (str->visual_first != 0) {
/*
 * String with overall LTR direction: insert the run directly after lrun.
 * Because lrun stays fixed, a later run of this segment lands in front of
 * the runs inserted before it.
 */
208 if (str->direction == UCPGBA_LTR) {
209 run->visual_prev = lrun;
210 run->visual_next = lrun->visual_next;
211 if (lrun->visual_next != 0)
212 lrun->visual_next->visual_prev = run;
213 lrun->visual_next = run;
/* Only the first insertion after lrun can move visual_last. */
214 if (lrun == str->visual_last)
215 str->visual_last = run;
/* Presumably the else branch (not shown): put the run at the visual front. */
217 run->visual_next = str->visual_first;
218 str->visual_first->visual_prev = run;
219 str->visual_first = run;
/* Presumably the else of the visual_first test: first run in the chain. */
222 str->visual_first = str->visual_last = run;
226 * Handle digits in a special way. This makes sure the weakly
227 * directional characters appear on the expected sides of a number
228 * depending on whether that number is Arabic or not.
/* s restarts at e, so the next run begins where the previous one stopped. */
230 for (s = e; e < end && ISWEAKSPECIAL(source[e]); e++) {
/*
 * True when the current character fails ISDIGITSPECIAL and it is either
 * the last character or followed by one that also fails ISDIGITSPECIAL.
 * The statement this test controls is not shown in this listing.
 */
231 if (!ISDIGITSPECIAL(source[e]) &&
232 (e + 1 == end || !ISDIGITSPECIAL(source[e + 1])))
/* The collected span is added with LTR direction. */
237 run = _add_run(str, source, s, e, UCPGBA_LTR);
240 * Add the run to the visual list for cursor traversal.
/* Same splice as for the RTL run above. */
242 if (str->visual_first != 0) {
243 if (str->direction == UCPGBA_LTR) {
244 run->visual_prev = lrun;
245 run->visual_next = lrun->visual_next;
246 if (lrun->visual_next != 0)
247 lrun->visual_next->visual_prev = run;
248 lrun->visual_next = run;
249 if (lrun == str->visual_last)
250 str->visual_last = run;
252 run->visual_next = str->visual_first;
253 str->visual_first->visual_prev = run;
254 str->visual_first = run;
257 str->visual_first = str->visual_last = run;
261 * Collect all weak non-digit sequences for an RTL segment. These
262 * will appear as part of the next RTL segment or will be added as
263 * an RTL segment by themselves.
/*
 * Moves e over weak characters that are not digits; s stays at the start of
 * that span.  The rest of the loop header is not shown in this listing.
 */
265 for (s = e; e < end && ucisweak(source[e]) && !ucisdigit(source[e]);
270 * Capture any weak non-digit sequences that occur at the end of the RTL
/* Trailing span s..e is added as an RTL run and spliced in the same way. */
274 run = _add_run(str, source, s, e, UCPGBA_RTL);
277 * Add the run to the visual list for cursor traversal.
279 if (str->visual_first != 0) {
280 if (str->direction == UCPGBA_LTR) {
281 run->visual_prev = lrun;
282 run->visual_next = lrun->visual_next;
283 if (lrun->visual_next != 0)
284 lrun->visual_next->visual_prev = run;
285 lrun->visual_next = run;
286 if (lrun == str->visual_last)
287 str->visual_last = run;
289 run->visual_next = str->visual_first;
290 str->visual_first->visual_prev = run;
291 str->visual_first = run;
294 str->visual_first = str->visual_last = run;
299 _ucadd_ltr_segment(ucstring_t *str, unsigned long *source, unsigned long start,
304 run = _add_run(str, source, start, end, UCPGBA_LTR);
307 * Add the run to the visual list for cursor traversal.
309 if (str->visual_first != 0) {
310 if (str->direction == UCPGBA_LTR) {
311 run->visual_prev = str->visual_last;
312 str->visual_last->visual_next = run;
313 str->visual_last = run;
315 run->visual_next = str->visual_first;
316 str->visual_first->visual_prev = run;
317 str->visual_first = run;
320 str->visual_first = str->visual_last = run;
/*
 * ucstring_create: allocate a ucstring_t for source[start, end), decide its
 * overall direction and break the text into LTR and RTL segments, which are
 * handed to _ucadd_ltr_segment and _ucadd_rtl_segment.
 *
 *   source            - source text
 *   start, end        - bounds used by the strong-character scan
 *   default_direction - direction used when no strong character is found
 *   cursor_motion     - stored unchanged in the new string
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * some declarations, the tests guarding several statements, the statements
 * that update ld and e inside the scanning loops, and the return statements
 * are not shown, and every line still begins with its number from the
 * original file.  The comments below cover only the statements that are
 * visible.
 */
324 ucstring_create(unsigned long *source, unsigned long start, unsigned long end,
325 int default_direction, int cursor_motion)
328 unsigned long s, e, ld;
/* The malloc result is dereferenced below without a visible NULL check. */
331 str = (ucstring_t *) malloc(sizeof(ucstring_t));
334 * Set the initial values.
336 str->cursor_motion = cursor_motion;
337 str->logical_first = str->logical_last = 0;
338 str->visual_first = str->visual_last = str->cursor = 0;
339 str->source = source;
344 * If the length of the string is 0, then just return it at this point.
350 * This flag indicates whether the collection loop for RTL is called
351 * before the LTR loop the first time.
356 * Look for the first character in the string that has strong
/* Empty-bodied loop: leaves s at the first strong character, or at end. */
359 for (s = start; s < end && !ucisstrong(source[s]); s++) ;
363 * If the string contains no characters with strong directionality, use
364 * the default direction.
/*
 * Two alternative assignments of the overall direction; the test selecting
 * between them is not shown.  The second derives the direction from the
 * strong character found by the scan above.
 */
366 str->direction = default_direction;
368 str->direction = ucisrtl(source[s]) ? UCPGBA_RTL : UCPGBA_LTR;
370 if (str->direction == UCPGBA_RTL)
372 * Set the flag that causes the RTL collection loop to run first.
377 * This loop now separates the string into runs based on directionality.
/*
 * NOTE(review): s and e are initialised to 0 here, while the scan for the
 * first strong character above begins at start.  Confirm whether the run
 * loop is meant to begin at start as well.
 */
379 for (s = e = 0; s < end; s = e) {
382 * Determine the next run of LTR text.
/* e advances over characters accepted by ISLTR_LTR. */
386 while (e < end && ISLTR_LTR(source[e])) {
/*
 * Digits outside the range 0x660..0x669 (the Arabic-Indic digits in
 * Unicode) are singled out; the statement this test controls is not shown.
 */
387 if (ucisdigit(source[e]) &&
388 !(0x660 <= source[e] && source[e] <= 0x669))
/*
 * When the string is not LTR overall, trailing weak or neutral characters
 * are examined from the end of the run, never going below ld.  The loop
 * body is not shown.
 */
392 if (str->direction != UCPGBA_LTR) {
393 while (e > ld && ISWEAK_NEUTRAL(source[e - 1]))
398 * Add the LTR segment to the string.
401 _ucadd_ltr_segment(str, source, s, e);
405 * Determine the next run of RTL text.
/* Mirror image of the LTR scan, using ISRTL_RTL. */
408 while (e < end && ISRTL_RTL(source[e])) {
409 if (ucisdigit(source[e]) &&
410 !(0x660 <= source[e] && source[e] <= 0x669))
414 if (str->direction != UCPGBA_RTL) {
415 while (e > ld && ISWEAK_NEUTRAL(source[e - 1]))
420 * Add the RTL segment to the string.
423 _ucadd_rtl_segment(str, source, s, e);
426 * Clear the flag that allowed the RTL collection loop to run first
427 * for strings with overall RTL directionality.
433 * Set up the initial cursor run.
/* The cursor starts on the first run of the logical chain. */
435 str->cursor = str->logical_first;
/*
 * Initial index inside that run: the run length for an RTL run, 0
 * otherwise.  The test guarding this assignment is not shown.
 */
437 str->cursor->cursor = (str->cursor->direction == UCPGBA_RTL) ?
438 str->cursor->end - str->cursor->start : 0;
/*
 * ucstring_free: walk the visual chain of s and release the character
 * buffer of each non-empty run.
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations and everything after the chars release (for example the
 * release of the run records and of s itself) are not shown, and every line
 * still begins with its number from the original file.
 */
444 ucstring_free(ucstring_t *s)
/*
 * r visits every run through visual_next.  l starts at 0; its later use is
 * on lines missing from this listing.
 */
451 for (l = 0, r = s->visual_first; r != 0; r = r->visual_next) {
/*
 * Only runs with end above start have their chars buffer freed.  positions
 * is not freed separately; _add_run in this file makes it point into the
 * chars allocation.
 */
452 if (r->end > r->start)
453 free((char *) r->chars);
/*
 * ucstring_set_cursor_motion: replace the cursor_motion setting of str.
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declaration of n, any guard on str and the return statement are not
 * shown, and every line still begins with its number from the original
 * file.
 */
465 ucstring_set_cursor_motion(ucstring_t *str, int cursor_motion)
/*
 * The previous setting is saved in n before it is overwritten; presumably n
 * is what the function returns -- TODO confirm.
 */
472 n = str->cursor_motion;
473 str->cursor_motion = cursor_motion;
/*
 * _ucstring_visual_cursor_right: move the cursor of str to the right along
 * the visual chain (visual_next).
 *
 * When the walk cannot continue because there is no next visual run, the
 * function returns (cnt != count).
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations of cnt, size and cursor, the enclosing loop, the step
 * taken when the current run is not exhausted and the final return are not
 * shown, and every line still begins with its number from the original
 * file.  cnt is presumably a countdown that starts at count -- TODO confirm.
 */
478 _ucstring_visual_cursor_right(ucstring_t *str, int count)
487 cursor = str->cursor;
489 size = cursor->end - cursor->start;
/*
 * The current run is left when it is an RTL run whose index is at size - 1,
 * or when index + 1 exceeds size.
 */
490 if ((cursor->direction == UCPGBA_RTL && cursor->cursor + 1 == size) ||
491 cursor->cursor + 1 > size) {
493 * If the next run is NULL, then the cursor is already on the
494 * far right end already.
496 if (cursor->visual_next == 0)
498 * If movement occured, then report it.
/* Non-zero only if cnt has changed from count. */
500 return (cnt != count);
503 * Move to the next run.
/* The new run is entered at index -1 if it is RTL and at index 0 otherwise. */
505 str->cursor = cursor = cursor->visual_next;
506 cursor->cursor = (cursor->direction == UCPGBA_RTL) ? -1 : 0;
507 size = cursor->end - cursor->start;
/*
 * _ucstring_logical_cursor_right: move the cursor of str along the logical
 * chain.  Which neighbour is taken depends on the overall direction of the
 * string, and the exhaustion test depends on the direction of the current
 * run.
 *
 * Visible behaviour:
 *   - string RTL: leaving a run moves to logical_prev, and the walk stops
 *     at logical_first;
 *   - otherwise:  leaving a run moves to logical_next, and the walk stops
 *     at logical_last.
 * At a stop the function returns (cnt != count).
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations, the enclosing loop, the else keywords, the in-run steps,
 * the second line of every conditional assignment to cursor->cursor and the
 * final return are not shown, and every line still begins with its number
 * from the original file.  cnt is presumably a countdown that starts at
 * count -- TODO confirm.
 */
516 _ucstring_logical_cursor_right(ucstring_t *str, int count)
525 cursor = str->cursor;
527 size = cursor->end - cursor->start;
528 if (str->direction == UCPGBA_RTL) {
529 if (cursor->direction == UCPGBA_RTL) {
/* RTL run in an RTL string: exhausted when index + 1 equals the run size. */
530 if (cursor->cursor + 1 == size) {
531 if (cursor == str->logical_first)
533 * Already at the beginning of the string.
535 return (cnt != count);
/*
 * Step to the previous logical run.  The index chosen for it depends on
 * whether that run is LTR; both candidate values are on a missing line.
 */
537 str->cursor = cursor = cursor->logical_prev;
538 size = cursor->end - cursor->start;
539 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* Presumably the non-RTL run case: exhausted when the index is 0. */
544 if (cursor->cursor == 0) {
545 if (cursor == str->logical_first)
547 * At the beginning of the string already.
549 return (cnt != count);
551 str->cursor = cursor = cursor->logical_prev;
552 size = cursor->end - cursor->start;
553 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* From here on, presumably the branch for strings that are not RTL overall. */
559 if (cursor->direction == UCPGBA_RTL) {
/* RTL run in such a string: exhausted when the index is 0. */
560 if (cursor->cursor == 0) {
561 if (cursor == str->logical_last)
563 * Already at the end of the string.
565 return (cnt != count);
567 str->cursor = cursor = cursor->logical_next;
568 size = cursor->end - cursor->start;
569 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* Presumably the non-RTL run case: exhausted when index + 1 exceeds size. */
574 if (cursor->cursor + 1 > size) {
575 if (cursor == str->logical_last)
577 * Already at the end of the string.
579 return (cnt != count);
/*
 * NOTE(review): unlike the three branches above, size is not recomputed
 * between these two statements (their original numbers, 581 and 582, are
 * consecutive).  If the missing second line of the conditional uses size,
 * it would see the size of the run just left -- TODO confirm.
 */
581 str->cursor = cursor = cursor->logical_next;
582 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
594 ucstring_cursor_right(ucstring_t *str, int count)
598 return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
599 _ucstring_visual_cursor_right(str, count) :
600 _ucstring_logical_cursor_right(str, count);
/*
 * _ucstring_visual_cursor_left: move the cursor of str to the left along
 * the visual chain (visual_prev).
 *
 * When the walk cannot continue because there is no previous visual run,
 * the function returns (cnt != count).
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations of cnt, size and cursor, the enclosing loop, the second
 * line of the conditional assignment to cursor->cursor, the step taken when
 * the current run is not exhausted and the final return are not shown, and
 * every line still begins with its number from the original file.  cnt is
 * presumably a countdown that starts at count -- TODO confirm.
 */
604 _ucstring_visual_cursor_left(ucstring_t *str, int count)
613 cursor = str->cursor;
615 size = cursor->end - cursor->start;
/*
 * The current run is left when it is an LTR run whose index is 0, or when
 * index - 1 is below -1.
 */
616 if ((cursor->direction == UCPGBA_LTR && cursor->cursor == 0) ||
617 cursor->cursor - 1 < -1) {
619 * If the preceding run is NULL, then the cursor is already on the
620 * far left end already.
622 if (cursor->visual_prev == 0)
624 * If movement occured, then report it.
/* Non-zero only if cnt has changed from count. */
626 return (cnt != count);
629 * Move to the previous run.
/*
 * size is refreshed for the new run before its index is chosen.  The index
 * depends on whether the run is RTL; both candidate values are on a missing
 * line.
 */
631 str->cursor = cursor = cursor->visual_prev;
632 size = cursor->end - cursor->start;
633 cursor->cursor = (cursor->direction == UCPGBA_RTL) ?
/*
 * _ucstring_logical_cursor_left: move the cursor of str along the logical
 * chain in the direction opposite to _ucstring_logical_cursor_right.
 *
 * Visible behaviour:
 *   - string RTL: leaving a run moves to logical_next, and the walk stops
 *     at logical_last;
 *   - otherwise:  leaving a run moves to logical_prev, and the walk stops
 *     at logical_first.
 * At a stop the function returns (cnt != count).
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations, the enclosing loop, the else keywords, the in-run steps,
 * the second line of every conditional assignment to cursor->cursor and the
 * final return are not shown, and every line still begins with its number
 * from the original file.  cnt is presumably a countdown that starts at
 * count -- TODO confirm.
 */
643 _ucstring_logical_cursor_left(ucstring_t *str, int count)
652 cursor = str->cursor;
654 size = cursor->end - cursor->start;
655 if (str->direction == UCPGBA_RTL) {
656 if (cursor->direction == UCPGBA_RTL) {
/* RTL run in an RTL string: exhausted when the index is -1. */
657 if (cursor->cursor == -1) {
658 if (cursor == str->logical_last)
660 * Already at the end of the string.
662 return (cnt != count);
/*
 * Step to the next logical run.  The index chosen for it depends on whether
 * that run is LTR; both candidate values are on a missing line.
 */
664 str->cursor = cursor = cursor->logical_next;
665 size = cursor->end - cursor->start;
666 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* Presumably the non-RTL run case: exhausted when index + 1 exceeds size. */
671 if (cursor->cursor + 1 > size) {
672 if (cursor == str->logical_last)
674 * At the end of the string already.
676 return (cnt != count);
678 str->cursor = cursor = cursor->logical_next;
679 size = cursor->end - cursor->start;
680 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* From here on, presumably the branch for strings that are not RTL overall. */
686 if (cursor->direction == UCPGBA_RTL) {
/* RTL run in such a string: exhausted when index + 1 equals the run size. */
687 if (cursor->cursor + 1 == size) {
688 if (cursor == str->logical_first)
690 * Already at the beginning of the string.
692 return (cnt != count);
694 str->cursor = cursor = cursor->logical_prev;
695 size = cursor->end - cursor->start;
696 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
/* Presumably the non-RTL run case: exhausted when the index is 0. */
701 if (cursor->cursor == 0) {
702 if (cursor == str->logical_first)
704 * Already at the beginning of the string.
706 return (cnt != count);
/*
 * NOTE(review): unlike the three branches above, size is not recomputed
 * between these two statements (their original numbers, 708 and 709, are
 * consecutive).  If the missing second line of the conditional uses size,
 * it would see the size of the run just left -- TODO confirm.
 */
708 str->cursor = cursor = cursor->logical_prev;
709 cursor->cursor = (cursor->direction == UCPGBA_LTR) ?
721 ucstring_cursor_left(ucstring_t *str, int count)
725 return (str->cursor_motion == UCPGBA_CURSOR_VISUAL) ?
726 _ucstring_visual_cursor_left(str, count) :
727 _ucstring_logical_cursor_left(str, count);
/*
 * ucstring_cursor_info: report the direction of the run the cursor is on
 * and a position in the source text for the cursor.
 *
 *   str       - string to query
 *   direction - receives the direction of the current cursor run
 *   position  - receives a source offset; see the three cases below
 *
 * NOTE(review): this listing is incomplete.  The return type, the braces,
 * the declarations, the statement taken when an argument is null, the
 * assignment of c and the conditions that select between the three
 * assignments to *position are not shown, and every line still begins with
 * its number from the original file.  c is presumably the cursor index of
 * the current run -- TODO confirm.
 */
731 ucstring_cursor_info(ucstring_t *str, int *direction, unsigned long *position)
/* All three pointers are checked before any of them is dereferenced. */
737 if (str == 0 || direction == 0 || position == 0)
740 cursor = str->cursor;
742 *direction = cursor->direction;
745 size = cursor->end - cursor->start;
/*
 * First case: the start of the run for an RTL run, otherwise the source
 * position recorded for index c - 1.
 */
748 *position = (cursor->direction == UCPGBA_RTL) ?
749 cursor->start : cursor->positions[c - 1];
/* Second case: the end of the run for an RTL run, otherwise its start. */
751 *position = (cursor->direction == UCPGBA_RTL) ?
752 cursor->end : cursor->start;
/* Third case: the source position recorded for index c. */
754 *position = cursor->positions[c];