locale: Introduce translate_unicode_codepoint into linereader.c
authorFlorian Weimer <fweimer@redhat.com>
Tue, 5 Jul 2022 07:05:22 +0000 (09:05 +0200)
committerFlorian Weimer <fweimer@redhat.com>
Tue, 5 Jul 2022 07:06:39 +0000 (09:06 +0200)
This will permit reusing the Unicode character processing for
different character encodings, not just the current <U...> encoding.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
locale/programs/linereader.c

index d5367e0..f7292f0 100644 (file)
@@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
   return &lr->token;
 }
 
+/* Process a decoded Unicode codepoint WCH in a string, placing the
+   multibyte sequence into LRB.  Return false if the character is not
+   found in CHARMAP/REPERTOIRE.  */
+static bool
+translate_unicode_codepoint (struct localedef_t *locale,
+                            const struct charmap_t *charmap,
+                            const struct repertoire_t *repertoire,
+                            uint32_t wch, struct lr_buffer *lrb)
+{
+  /* See whether the charmap contains the Uxxxxxxxx names.  */
+  char utmp[10];
+  snprintf (utmp, sizeof (utmp), "U%08X", wch);
+  struct charseq *seq = charmap_find_value (charmap, utmp, 9);
+
+  if (seq == NULL)
+    {
+      /* No, this isn't the case.  Now determine from
+        the repertoire the name of the character and
+        find it in the charmap.  */
+      if (repertoire != NULL)
+       {
+         const char *symbol = repertoire_find_symbol (repertoire, wch);
+         if (symbol != NULL)
+           seq = charmap_find_value (charmap, symbol, strlen (symbol));
+       }
+
+      if (seq == NULL)
+       {
+#ifndef NO_TRANSLITERATION
+         /* Transliterate if possible.  */
+         if (locale != NULL)
+           {
+             if ((locale->avail & CTYPE_LOCALE) == 0)
+               {
+                 /* Load the CTYPE data now.  */
+                 int old_needed = locale->needed;
+
+                 locale->needed = 0;
+                 locale = load_locale (LC_CTYPE, locale->name,
+                                       locale->repertoire_name,
+                                       charmap, locale);
+                 locale->needed = old_needed;
+               }
+
+             uint32_t *translit;
+             if ((locale->avail & CTYPE_LOCALE) != 0
+                 && ((translit = find_translit (locale, charmap, wch))
+                     != NULL))
+               /* The CTYPE data contains a matching
+                  transliteration.  */
+               {
+                 for (int i = 0; translit[i] != 0; ++i)
+                   {
+                     snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
+                     seq = charmap_find_value (charmap, utmp, 9);
+                     assert (seq != NULL);
+                     adds (lrb, seq->bytes, seq->nbytes);
+                   }
+                 return true;
+               }
+           }
+#endif /* NO_TRANSLITERATION */
+
+         /* Not a known name.  */
+         return false;
+       }
+    }
+
+  if (seq != NULL)
+    {
+      adds (lrb, seq->bytes, seq->nbytes);
+      return true;
+    }
+  else
+    return false;
+}
+
 
 static struct token *
 get_string (struct linereader *lr, const struct charmap_t *charmap,
@@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
     }
   else
     {
-      int illegal_string = 0;
+      bool illegal_string = false;
       size_t buf2act = 0;
       size_t buf2max = 56 * sizeof (uint32_t);
       int ch;
@@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
            {
              /* <> is no correct name.  Ignore it and also signal an
                 error.  */
-             illegal_string = 1;
+             illegal_string = true;
              continue;
            }
 
@@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
              if (cp == &lrb.buf[lrb.act])
                {
-                 char utmp[10];
-
                  /* Yes, it is.  */
                  addc (&lrb, '\0');
                  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
@@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
                  if (return_widestr)
                    ADDWC (wch);
 
-                 /* See whether the charmap contains the Uxxxxxxxx names.  */
-                 snprintf (utmp, sizeof (utmp), "U%08X", wch);
-                 seq = charmap_find_value (charmap, utmp, 9);
-
-                 if (seq == NULL)
-                   {
-                    /* No, this isn't the case.  Now determine from
-                       the repertoire the name of the character and
-                       find it in the charmap.  */
-                     if (repertoire != NULL)
-                       {
-                         const char *symbol;
-
-                         symbol = repertoire_find_symbol (repertoire, wch);
-
-                         if (symbol != NULL)
-                           seq = charmap_find_value (charmap, symbol,
-                                                     strlen (symbol));
-                       }
-
-                     if (seq == NULL)
-                       {
-#ifndef NO_TRANSLITERATION
-                         /* Transliterate if possible.  */
-                         if (locale != NULL)
-                           {
-                             uint32_t *translit;
-
-                             if ((locale->avail & CTYPE_LOCALE) == 0)
-                               {
-                                 /* Load the CTYPE data now.  */
-                                 int old_needed = locale->needed;
-
-                                 locale->needed = 0;
-                                 locale = load_locale (LC_CTYPE,
-                                                       locale->name,
-                                                       locale->repertoire_name,
-                                                       charmap, locale);
-                                 locale->needed = old_needed;
-                               }
-
-                             if ((locale->avail & CTYPE_LOCALE) != 0
-                                 && ((translit = find_translit (locale,
-                                                                charmap, wch))
-                                     != NULL))
-                               /* The CTYPE data contains a matching
-                                  transliteration.  */
-                               {
-                                 int i;
-
-                                 for (i = 0; translit[i] != 0; ++i)
-                                   {
-                                     char utmp[10];
-
-                                     snprintf (utmp, sizeof (utmp), "U%08X",
-                                               translit[i]);
-                                     seq = charmap_find_value (charmap, utmp,
-                                                               9);
-                                     assert (seq != NULL);
-                                     adds (&lrb, seq->bytes, seq->nbytes);
-                                   }
-
-                                 continue;
-                               }
-                           }
-#endif /* NO_TRANSLITERATION */
-
-                         /* Not a known name.  */
-                         illegal_string = 1;
-                       }
-                   }
-
-                 if (seq != NULL)
-                   adds (&lrb, seq->bytes, seq->nbytes);
-
+                 if (!translate_unicode_codepoint (locale, charmap,
+                                                   repertoire, wch, &lrb))
+                   illegal_string = true;
                  continue;
                }
            }
@@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
              /* This name is not in the charmap.  */
              lr_error (lr, _("symbol `%.*s' not in charmap"),
                        (int) (lrb.act - startidx), &lrb.buf[startidx]);
-             illegal_string = 1;
+             illegal_string = true;
            }
 
          if (return_widestr)
@@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
                  /* This name is not in the repertoire map.  */
                  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
                            (int) (lrb.act - startidx), &lrb.buf[startidx]);
-                 illegal_string = 1;
+                 illegal_string = true;
                }
              else
                ADDWC (wch);
@@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
       if (ch == '\n' || ch == EOF)
        {
          lr_error (lr, _("unterminated string"));
-         illegal_string = 1;
+         illegal_string = true;
        }
 
       if (illegal_string)