lib/sh/casemod.c

   1 /* casemod.c -- functions to change case of strings */
   2
   3 /* Copyright (C) 2008,2009 Free Software Foundation, Inc.
   4
   5    This file is part of GNU Bash, the Bourne Again SHell.
   6
   7    Bash is free software: you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation, either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    Bash is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with Bash.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #if defined (HAVE_CONFIG_H)
  22 #  include <config.h>
  23 #endif
  24
  25 #if defined (HAVE_UNISTD_H)
  26 #  include <unistd.h>
  27 #endif /* HAVE_UNISTD_H */
  28
  29 #include <stdc.h>
  30
  31 #include <bashansi.h>
  32 #include <bashintl.h>
  33 #include <bashtypes.h>
  34
  35 #include <stdio.h>
  36 #include <ctype.h>
  37 #include <xmalloc.h>
  38
  39 #include <shmbchar.h>
  40 #include <shmbutil.h>
  41 #include <chartypes.h>
  42 #include <typemax.h>
  43
  44 #include <glob/strmatch.h>
  45
  46 #define _to_wupper(wc)  (iswlower (wc) ? towupper (wc) : (wc))  /* upper-case WC only if it is lower case; WC is evaluated more than once */
  47 #define _to_wlower(wc)  (iswupper (wc) ? towlower (wc) : (wc))  /* lower-case WC only if it is upper case; WC is evaluated more than once */
  48
  49 #if !defined (HANDLE_MULTIBYTE)  /* no multibyte support: express the helpers used below with single-byte operations */
  50 #  define cval(s, i)    ((s)[(i)])  /* the byte at index I of S */
  51 #  define iswalnum(c)   (isalnum(c))
  52 #  define TOGGLE(x)     (ISUPPER (x) ? tolower (x) : (TOUPPER (x)))  /* flip the case of X */
  53 #else
  54 #  define TOGGLE(x)     (iswupper (x) ? towlower (x) : (_to_wupper(x)))  /* flip the case of wide character X */
  55 #endif
  56
  57 /* Operation codes for the FLAGS argument of sh_modcase.  These must agree with the defines in externs.h */
  58 #define CASE_NOOP       0x0000  /* leave characters unchanged */
  59 #define CASE_LOWER      0x0001  /* convert every character to lower case */
  60 #define CASE_UPPER      0x0002  /* convert every character to upper case */
  61 #define CASE_CAPITALIZE 0x0004  /* first character to upper case, the rest to lower case */
  62 #define CASE_UNCAP      0x0008  /* first character to lower case, the rest to upper case */
  63 #define CASE_TOGGLE     0x0010  /* flip the case of the first character of each word only */
  64 #define CASE_TOGGLEALL  0x0020  /* flip the case of every character */
  65 #define CASE_UPFIRST    0x0040  /* first character to upper case, the rest unchanged */
  66 #define CASE_LOWFIRST   0x0080  /* first character to lower case, the rest unchanged */
  67
  68 #define CASE_USEWORDS   0x1000          /* modifier bit: make the `first character' operations act on each word of the passed string instead of on the string as a whole */
  69
  70 extern char *substring __P((char *, int, int));  /* defined elsewhere; sh_modcase passes (string, start, end) byte offsets and frees the result, so it presumably returns allocated memory -- confirm against the definition */
  71
  72 #ifndef UCHAR_MAX  /* fallback for when none of the headers included above defined UCHAR_MAX */
  73 #  define UCHAR_MAX     TYPE_MAXIMUM(unsigned char)  /* TYPE_MAXIMUM presumably comes from <typemax.h> -- confirm */
  74 #endif
  75
  76 #if defined (HANDLE_MULTIBYTE)
  77 static wchar_t
  78 cval (s, i)
  79      char *s;
  80      int i;
  81 {
  82   size_t tmp;
  83   wchar_t wc;
  84   int l;
  85   mbstate_t mps;
  86
  87   if (MB_CUR_MAX == 1 || is_basic (s[i]))
  88     return ((wchar_t)s[i]);
  89   l = strlen (s);
  90   if (i >= (l - 1))
  91     return ((wchar_t)s[i]);
  92   memset (&mps, 0, sizeof (mbstate_t));
  93   tmp = mbrtowc (&wc, s + i, l - i, &mps);
  94   if (MB_INVALIDCH (tmp) || MB_NULLWCH (tmp))
  95     return ((wchar_t)s[i]);
  96   return wc;
  97 }
  98 #endif
  99
 100 /* Modify the case of characters in STRING matching PAT based on the value of
 101    FLAGS.  If PAT is null, modify the case of each character */
 102 char *
 103 sh_modcase (string, pat, flags)
 104      const char *string;
 105      char *pat;
 106      int flags;
 107 {
 108   int start, next, end;
 109   int inword, c, nc, nop, match, usewords;
 110   char *ret, *s;
 111   wchar_t wc;
 112 #if defined (HANDLE_MULTIBYTE)
 113   wchar_t nwc;
 114   char mb[MB_LEN_MAX+1];
 115   int mlen;
 116   size_t m;
 117   mbstate_t state;
 118 #endif
 119
 120   if (string == 0 || *string == 0)
 121     {
 122       ret = (char *)xmalloc (1);
 123       ret[0] = '\0';
 124       return ret;
 125     }
 126
 127 #if defined (HANDLE_MULTIBYTE)
 128   memset (&state, 0, sizeof (mbstate_t));
 129 #endif
 130
 131   start = 0;
 132   end = strlen (string);
 133
 134   ret = (char *)xmalloc (end + 1);
 135   strcpy (ret, string);
 136
 137   /* See if we are supposed to split on alphanumerics and operate on each word */
 138   usewords = (flags & CASE_USEWORDS);
 139   flags &= ~CASE_USEWORDS;
 140
 141   inword = 0;
 142   while (start < end)
 143     {
 144       wc = cval (ret, start);
 145
 146       if (iswalnum (wc) == 0)
 147         {
 148           inword = 0;
 149 #if 0
 150           ADVANCE_CHAR (ret, end, start);
 151           continue;
 152 #endif
 153         }
 154
 155       if (pat)
 156         {
 157           next = start;
 158           ADVANCE_CHAR (ret, end, next);
 159           s = substring (ret, start, next);
 160           match = strmatch (pat, s, FNM_EXTMATCH) != FNM_NOMATCH;
 161           free (s);
 162           if (match == 0)
 163             {
 164               start = next;
 165               inword = 1;
 166               continue;
 167             }
 168         }
 169
 170       /* XXX - for now, the toggling operators work on the individual
 171          words in the string, breaking on alphanumerics.  Should I
 172          leave the capitalization operators to do that also? */
 173       if (flags == CASE_CAPITALIZE)
 174         {
 175           if (usewords)
 176             nop = inword ? CASE_LOWER : CASE_UPPER;
 177           else
 178             nop = (start > 0) ? CASE_LOWER : CASE_UPPER;
 179           inword = 1;
 180         }
 181       else if (flags == CASE_UNCAP)
 182         {
 183           if (usewords)
 184             nop = inword ? CASE_UPPER : CASE_LOWER;
 185           else
 186             nop = (start > 0) ? CASE_UPPER : CASE_LOWER;
 187           inword = 1;
 188         }
 189       else if (flags == CASE_UPFIRST)
 190         {
 191           if (usewords)
 192             nop = inword ? CASE_NOOP : CASE_UPPER;
 193           else
 194             nop = (start > 0) ? CASE_NOOP : CASE_UPPER;
 195           inword = 1;
 196         }
 197       else if (flags == CASE_LOWFIRST)
 198         {
 199           if (usewords)
 200             nop = inword ? CASE_NOOP : CASE_LOWER;
 201           else
 202             nop = (start > 0) ? CASE_NOOP : CASE_LOWER;
 203           inword = 1;
 204         }
 205       else if (flags == CASE_TOGGLE)
 206         {
 207           nop = inword ? CASE_NOOP : CASE_TOGGLE;
 208           inword = 1;
 209         }
 210       else
 211         nop = flags;
 212
 213       /* Need to check UCHAR_MAX since wc may have already been converted to a
 214          wide character by cval() */
 215       if (MB_CUR_MAX == 1 || (wc <= UCHAR_MAX && is_basic ((int)wc)))
 216         {
 217 singlebyte:
 218           switch (nop)
 219           {
 220           default:
 221           case CASE_NOOP:  nc = wc; break;
 222           case CASE_UPPER:  nc = TOUPPER (wc); break;
 223           case CASE_LOWER:  nc = TOLOWER (wc); break;
 224           case CASE_TOGGLEALL:
 225           case CASE_TOGGLE: nc = TOGGLE (wc); break;
 226           }
 227           ret[start] = nc;
 228         }
 229 #if defined (HANDLE_MULTIBYTE)
 230       else
 231         {
 232           m = mbrtowc (&wc, string + start, end - start, &state);
 233           if (MB_INVALIDCH (m))
 234             {
 235               wc = (unsigned char)string[start];
 236               goto singlebyte;
 237             }
 238           else if (MB_NULLWCH (m))
 239             wc = L'\0';
 240           switch (nop)
 241           {
 242           default:
 243           case CASE_NOOP:  nwc = wc; break;
 244           case CASE_UPPER:  nwc = _to_wupper (wc); break;
 245           case CASE_LOWER:  nwc = _to_wlower (wc); break;
 246           case CASE_TOGGLEALL:
 247           case CASE_TOGGLE: nwc = TOGGLE (wc); break;
 248           }
 249           if  (nwc != wc)       /*  just skip unchanged characters */
 250             {
 251               mlen = wcrtomb (mb, nwc, &state);
 252               if (mlen > 0)
 253                 mb[mlen] = '\0';
 254               /* Assume the same width */
 255               strncpy (ret + start, mb, mlen);
 256             }
 257         }
 258 #endif
 259
 260       /*  This assumes that the upper and lower case versions are the same width. */
 261       ADVANCE_CHAR (ret, end, start);
 262     }
 263
 264   return ret;
 265 }