glib/gshell.c

   1 /* gshell.c - Shell-related utilities
   2  *
   3  *  Copyright 2000 Red Hat, Inc.
   4  *  g_execvpe implementation based on GNU libc execvp:
   5  *   Copyright 1991, 92, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
   6  *
   7  * GLib is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * GLib is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with GLib; see the file COPYING.LIB.  If not, write
  19  * to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include "config.h"
  24
  25 #include <string.h>
  26
  27 #include "gshell.h"
  28
  29 #include "gslist.h"
  30 #include "gstrfuncs.h"
  31 #include "gstring.h"
  32 #include "gtestutils.h"
  33 #include "glibintl.h"
  34 #include "gthread.h"
  35
  36 /**
  37  * SECTION:shell
  38  * @title: Shell-related Utilities
  39  * @short_description: shell-like commandline handling
  40  *
  41  * GLib provides the functions g_shell_quote() and g_shell_unquote()
  42  * to handle shell-like quoting in strings. The function g_shell_parse_argv()
  43  * parses a string similar to the way a POSIX shell (/bin/sh) would.
  44  *
  45  * Note that string handling in shells has many obscure and historical
  46  * corner-cases which these functions do not necessarily reproduce. They
  47  * are good enough in practice, though.
  48  */
  49
  50 /**
  51  * G_SHELL_ERROR:
  52  *
  53  * Error domain for shell functions. Errors in this domain will be from
  54  * the #GShellError enumeration. See #GError for information on error
  55  * domains.
  56  **/
  57
  58 /**
  59  * GShellError:
  60  * @G_SHELL_ERROR_BAD_QUOTING: Mismatched or otherwise mangled quoting.
  61  * @G_SHELL_ERROR_EMPTY_STRING: String to be parsed was empty.
  62  * @G_SHELL_ERROR_FAILED: Some other error.
  63  *
  64  * Error codes returned by shell functions.
  65  **/
/* Expands to the definition of g_shell_error_quark(), which returns the
 * GQuark for the string "g-shell-error-quark". Presumably this is the
 * function the G_SHELL_ERROR domain macro resolves to - confirm against
 * gshell.h.
 */
   66 G_DEFINE_QUARK (g-shell-error-quark, g_shell_error)
  67
   68 /* Single quotes preserve the literal string exactly. Escape
  69  * sequences are not allowed; not even \' - if you want a '
  70  * in the quoted text, you have to do something like 'foo'\''bar'
  71  *
  72  * Double quotes allow $ ` " \ and newline to be escaped with backslash.
  73  * Otherwise double quotes preserve things literally.
  74  */
  75
  76 static gboolean
  77 unquote_string_inplace (gchar* str, gchar** end, GError** err)
  78 {
  79   gchar* dest;
  80   gchar* s;
  81   gchar quote_char;
  82
  83   g_return_val_if_fail(end != NULL, FALSE);
  84   g_return_val_if_fail(err == NULL || *err == NULL, FALSE);
  85   g_return_val_if_fail(str != NULL, FALSE);
  86
  87   dest = s = str;
  88
  89   quote_char = *s;
  90
  91   if (!(*s == '"' || *s == '\''))
  92     {
  93       g_set_error_literal (err,
  94                            G_SHELL_ERROR,
  95                            G_SHELL_ERROR_BAD_QUOTING,
  96                            _("Quoted text doesn't begin with a quotation mark"));
  97       *end = str;
  98       return FALSE;
  99     }
 100
 101   /* Skip the initial quote mark */
 102   ++s;
 103
 104   if (quote_char == '"')
 105     {
 106       while (*s)
 107         {
 108           g_assert(s > dest); /* loop invariant */
 109
 110           switch (*s)
 111             {
 112             case '"':
 113               /* End of the string, return now */
 114               *dest = '\0';
 115               ++s;
 116               *end = s;
 117               return TRUE;
 118               break;
 119
 120             case '\\':
 121               /* Possible escaped quote or \ */
 122               ++s;
 123               switch (*s)
 124                 {
 125                 case '"':
 126                 case '\\':
 127                 case '`':
 128                 case '$':
 129                 case '\n':
 130                   *dest = *s;
 131                   ++s;
 132                   ++dest;
 133                   break;
 134
 135                 default:
 136                   /* not an escaped char */
 137                   *dest = '\\';
 138                   ++dest;
 139                   /* ++s already done. */
 140                   break;
 141                 }
 142               break;
 143
 144             default:
 145               *dest = *s;
 146               ++dest;
 147               ++s;
 148               break;
 149             }
 150
 151           g_assert(s > dest); /* loop invariant */
 152         }
 153     }
 154   else
 155     {
 156       while (*s)
 157         {
 158           g_assert(s > dest); /* loop invariant */
 159
 160           if (*s == '\'')
 161             {
 162               /* End of the string, return now */
 163               *dest = '\0';
 164               ++s;
 165               *end = s;
 166               return TRUE;
 167             }
 168           else
 169             {
 170               *dest = *s;
 171               ++dest;
 172               ++s;
 173             }
 174
 175           g_assert(s > dest); /* loop invariant */
 176         }
 177     }
 178
 179   /* If we reach here this means the close quote was never encountered */
 180
 181   *dest = '\0';
 182
 183   g_set_error_literal (err,
 184                        G_SHELL_ERROR,
 185                        G_SHELL_ERROR_BAD_QUOTING,
 186                        _("Unmatched quotation mark in command line or other shell-quoted text"));
 187   *end = s;
 188   return FALSE;
 189 }
 190
 191 /**
 192  * g_shell_quote:
 193  * @unquoted_string: a literal string
 194  *
 195  * Quotes a string so that the shell (/bin/sh) will interpret the
 196  * quoted string to mean @unquoted_string. If you pass a filename to
 197  * the shell, for example, you should first quote it with this
 198  * function.  The return value must be freed with g_free(). The
 199  * quoting style used is undefined (single or double quotes may be
 200  * used).
 201  *
 202  * Returns: quoted string
 203  **/
 204 gchar*
 205 g_shell_quote (const gchar *unquoted_string)
 206 {
 207   /* We always use single quotes, because the algorithm is cheesier.
 208    * We could use double if we felt like it, that might be more
 209    * human-readable.
 210    */
 211
 212   const gchar *p;
 213   GString *dest;
 214
 215   g_return_val_if_fail (unquoted_string != NULL, NULL);
 216
 217   dest = g_string_new ("'");
 218
 219   p = unquoted_string;
 220
 221   /* could speed this up a lot by appending chunks of text at a
 222    * time.
 223    */
 224   while (*p)
 225     {
 226       /* Replace literal ' with a close ', a \', and a open ' */
 227       if (*p == '\'')
 228         g_string_append (dest, "'\\''");
 229       else
 230         g_string_append_c (dest, *p);
 231
 232       ++p;
 233     }
 234
 235   /* close the quote */
 236   g_string_append_c (dest, '\'');
 237
 238   return g_string_free (dest, FALSE);
 239 }
 240
 241 /**
 242  * g_shell_unquote:
 243  * @quoted_string: shell-quoted string
 244  * @error: error return location or NULL
 245  *
 246  * Unquotes a string as the shell (/bin/sh) would. Only handles
 247  * quotes; if a string contains file globs, arithmetic operators,
 248  * variables, backticks, redirections, or other special-to-the-shell
 249  * features, the result will be different from the result a real shell
 250  * would produce (the variables, backticks, etc. will be passed
 251  * through literally instead of being expanded). This function is
 252  * guaranteed to succeed if applied to the result of
 253  * g_shell_quote(). If it fails, it returns %NULL and sets the
 254  * error. The @quoted_string need not actually contain quoted or
 255  * escaped text; g_shell_unquote() simply goes through the string and
 256  * unquotes/unescapes anything that the shell would. Both single and
 257  * double quotes are handled, as are escapes including escaped
 258  * newlines. The return value must be freed with g_free(). Possible
 259  * errors are in the #G_SHELL_ERROR domain.
 260  *
 261  * Shell quoting rules are a bit strange. Single quotes preserve the
  262  * literal string exactly. Escape sequences are not allowed; not even
 263  * \' - if you want a ' in the quoted text, you have to do something
 264  * like 'foo'\''bar'.  Double quotes allow $, `, ", \, and newline to
 265  * be escaped with backslash. Otherwise double quotes preserve things
 266  * literally.
 267  *
 268  * Returns: an unquoted string
 269  **/
 270 gchar*
 271 g_shell_unquote (const gchar *quoted_string,
 272                  GError     **error)
 273 {
 274   gchar *unquoted;
 275   gchar *end;
 276   gchar *start;
 277   GString *retval;
 278
 279   g_return_val_if_fail (quoted_string != NULL, NULL);
 280
 281   unquoted = g_strdup (quoted_string);
 282
 283   start = unquoted;
 284   end = unquoted;
 285   retval = g_string_new (NULL);
 286
 287   /* The loop allows cases such as
 288    * "foo"blah blah'bar'woo foo"baz"la la la\'\''foo'
 289    */
 290   while (*start)
 291     {
 292       /* Append all non-quoted chars, honoring backslash escape
 293        */
 294
 295       while (*start && !(*start == '"' || *start == '\''))
 296         {
 297           if (*start == '\\')
 298             {
 299               /* all characters can get escaped by backslash,
 300                * except newline, which is removed if it follows
 301                * a backslash outside of quotes
 302                */
 303
 304               ++start;
 305               if (*start)
 306                 {
 307                   if (*start != '\n')
 308                     g_string_append_c (retval, *start);
 309                   ++start;
 310                 }
 311             }
 312           else
 313             {
 314               g_string_append_c (retval, *start);
 315               ++start;
 316             }
 317         }
 318
 319       if (*start)
 320         {
 321           if (!unquote_string_inplace (start, &end, error))
 322             {
 323               goto error;
 324             }
 325           else
 326             {
 327               g_string_append (retval, start);
 328               start = end;
 329             }
 330         }
 331     }
 332
 333   g_free (unquoted);
 334   return g_string_free (retval, FALSE);
 335
 336  error:
 337   g_assert (error == NULL || *error != NULL);
 338
 339   g_free (unquoted);
 340   g_string_free (retval, TRUE);
 341   return NULL;
 342 }
 343
  344 /* g_shell_parse_argv() does a semi-arbitrary weird subset of the way
 345  * the shell parses a command line. We don't do variable expansion,
 346  * don't understand that operators are tokens, don't do tilde expansion,
 347  * don't do command substitution, no arithmetic expansion, IFS gets ignored,
 348  * don't do filename globs, don't remove redirection stuff, etc.
 349  *
 350  * READ THE UNIX98 SPEC on "Shell Command Language" before changing
 351  * the behavior of this code.
 352  *
 353  * Steps to parsing the argv string:
 354  *
 355  *  - tokenize the string (but since we ignore operators,
 356  *    our tokenization may diverge from what the shell would do)
 357  *    note that tokenization ignores the internals of a quoted
 358  *    word and it always splits on spaces, not on IFS even
 359  *    if we used IFS. We also ignore "end of input indicator"
 360  *    (I guess this is control-D?)
 361  *
 362  *    Tokenization steps, from UNIX98 with operator stuff removed,
 363  *    are:
 364  *
 365  *    1) "If the current character is backslash, single-quote or
 366  *        double-quote (\, ' or ") and it is not quoted, it will affect
 367  *        quoting for subsequent characters up to the end of the quoted
 368  *        text. The rules for quoting are as described in Quoting
 369  *        . During token recognition no substitutions will be actually
 370  *        performed, and the result token will contain exactly the
 371  *        characters that appear in the input (except for newline
 372  *        character joining), unmodified, including any embedded or
 373  *        enclosing quotes or substitution operators, between the quote
 374  *        mark and the end of the quoted text. The token will not be
 375  *        delimited by the end of the quoted field."
 376  *
 377  *    2) "If the current character is an unquoted newline character,
 378  *        the current token will be delimited."
 379  *
 380  *    3) "If the current character is an unquoted blank character, any
 381  *        token containing the previous character is delimited and the
 382  *        current character will be discarded."
 383  *
 384  *    4) "If the previous character was part of a word, the current
 385  *        character will be appended to that word."
 386  *
 387  *    5) "If the current character is a "#", it and all subsequent
 388  *        characters up to, but excluding, the next newline character
 389  *        will be discarded as a comment. The newline character that
 390  *        ends the line is not considered part of the comment. The
 391  *        "#" starts a comment only when it is at the beginning of a
 392  *        token. Since the search for the end-of-comment does not
 393  *        consider an escaped newline character specially, a comment
 394  *        cannot be continued to the next line."
 395  *
 396  *    6) "The current character will be used as the start of a new word."
 397  *
 398  *
 399  *  - for each token (word), perform portions of word expansion, namely
 400  *    field splitting (using default whitespace IFS) and quote
 401  *    removal.  Field splitting may increase the number of words.
 402  *    Quote removal does not increase the number of words.
 403  *
 404  *   "If the complete expansion appropriate for a word results in an
 405  *   empty field, that empty field will be deleted from the list of
 406  *   fields that form the completely expanded command, unless the
 407  *   original word contained single-quote or double-quote characters."
 408  *    - UNIX98 spec
 409  *
 410  *
 411  */
 412
 413 static inline void
 414 ensure_token (GString **token)
 415 {
 416   if (*token == NULL)
 417     *token = g_string_new (NULL);
 418 }
 419
 420 static void
 421 delimit_token (GString **token,
 422                GSList **retval)
 423 {
 424   if (*token == NULL)
 425     return;
 426
 427   *retval = g_slist_prepend (*retval, g_string_free (*token, FALSE));
 428
 429   *token = NULL;
 430 }
 431
 432 static GSList*
 433 tokenize_command_line (const gchar *command_line,
 434                        GError **error)
 435 {
 436   gchar current_quote;
 437   const gchar *p;
 438   GString *current_token = NULL;
 439   GSList *retval = NULL;
 440   gboolean quoted;
 441
 442   current_quote = '\0';
 443   quoted = FALSE;
 444   p = command_line;
 445
 446   while (*p)
 447     {
 448       if (current_quote == '\\')
 449         {
 450           if (*p == '\n')
 451             {
 452               /* we append nothing; backslash-newline become nothing */
 453             }
 454           else
 455             {
 456               /* we append the backslash and the current char,
 457                * to be interpreted later after tokenization
 458                */
 459               ensure_token (&current_token);
 460               g_string_append_c (current_token, '\\');
 461               g_string_append_c (current_token, *p);
 462             }
 463
 464           current_quote = '\0';
 465         }
 466       else if (current_quote == '#')
 467         {
 468           /* Discard up to and including next newline */
 469           while (*p && *p != '\n')
 470             ++p;
 471
 472           current_quote = '\0';
 473
 474           if (*p == '\0')
 475             break;
 476         }
 477       else if (current_quote)
 478         {
 479           if (*p == current_quote &&
 480               /* check that it isn't an escaped double quote */
 481               !(current_quote == '"' && quoted))
 482             {
 483               /* close the quote */
 484               current_quote = '\0';
 485             }
 486
 487           /* Everything inside quotes, and the close quote,
 488            * gets appended literally.
 489            */
 490
 491           ensure_token (&current_token);
 492           g_string_append_c (current_token, *p);
 493         }
 494       else
 495         {
 496           switch (*p)
 497             {
 498             case '\n':
 499               delimit_token (&current_token, &retval);
 500               break;
 501
 502             case ' ':
 503             case '\t':
 504               /* If the current token contains the previous char, delimit
 505                * the current token. A nonzero length
 506                * token should always contain the previous char.
 507                */
 508               if (current_token &&
 509                   current_token->len > 0)
 510                 {
 511                   delimit_token (&current_token, &retval);
 512                 }
 513
 514               /* discard all unquoted blanks (don't add them to a token) */
 515               break;
 516
 517
 518               /* single/double quotes are appended to the token,
 519                * escapes are maybe appended next time through the loop,
 520                * comment chars are never appended.
 521                */
 522
 523             case '\'':
 524             case '"':
 525               ensure_token (&current_token);
 526               g_string_append_c (current_token, *p);
 527
 528               /* FALL THRU */
 529             case '\\':
 530               current_quote = *p;
 531               break;
 532
 533             case '#':
 534               if (p == command_line)
 535                 { /* '#' was the first char */
 536                   current_quote = *p;
 537                   break;
 538                 }
 539               switch(*(p-1))
 540                 {
 541                   case ' ':
 542                   case '\n':
 543                   case '\0':
 544                     current_quote = *p;
 545                     break;
 546                   default:
 547                     ensure_token (&current_token);
 548                     g_string_append_c (current_token, *p);
 549                     break;
 550                 }
 551               break;
 552
 553             default:
 554               /* Combines rules 4) and 6) - if we have a token, append to it,
 555                * otherwise create a new token.
 556                */
 557               ensure_token (&current_token);
 558               g_string_append_c (current_token, *p);
 559               break;
 560             }
 561         }
 562
 563       /* We need to count consecutive backslashes mod 2,
 564        * to detect escaped doublequotes.
 565        */
 566       if (*p != '\\')
 567         quoted = FALSE;
 568       else
 569         quoted = !quoted;
 570
 571       ++p;
 572     }
 573
 574   delimit_token (&current_token, &retval);
 575
 576   if (current_quote)
 577     {
 578       if (current_quote == '\\')
 579         g_set_error (error,
 580                      G_SHELL_ERROR,
 581                      G_SHELL_ERROR_BAD_QUOTING,
 582                      _("Text ended just after a '\\' character."
 583                        " (The text was '%s')"),
 584                      command_line);
 585       else
 586         g_set_error (error,
 587                      G_SHELL_ERROR,
 588                      G_SHELL_ERROR_BAD_QUOTING,
 589                      _("Text ended before matching quote was found for %c."
 590                        " (The text was '%s')"),
 591                      current_quote, command_line);
 592
 593       goto error;
 594     }
 595
 596   if (retval == NULL)
 597     {
 598       g_set_error_literal (error,
 599                            G_SHELL_ERROR,
 600                            G_SHELL_ERROR_EMPTY_STRING,
 601                            _("Text was empty (or contained only whitespace)"));
 602
 603       goto error;
 604     }
 605
 606   /* we appended backward */
 607   retval = g_slist_reverse (retval);
 608
 609   return retval;
 610
 611  error:
 612   g_assert (error == NULL || *error != NULL);
 613
 614   g_slist_free_full (retval, g_free);
 615
 616   return NULL;
 617 }
 618
 619 /**
 620  * g_shell_parse_argv:
 621  * @command_line: command line to parse
 622  * @argcp: (out) (optional): return location for number of args, or %NULL
 623  * @argvp: (out) (optional) (array length=argcp zero-terminated=1): return
 624  *   location for array of args, or %NULL
 625  * @error: (optional): return location for error, or %NULL
 626  *
 627  * Parses a command line into an argument vector, in much the same way
 628  * the shell would, but without many of the expansions the shell would
 629  * perform (variable expansion, globs, operators, filename expansion,
 630  * etc. are not supported). The results are defined to be the same as
 631  * those you would get from a UNIX98 /bin/sh, as long as the input
 632  * contains none of the unsupported shell expansions. If the input
 633  * does contain such expansions, they are passed through
 634  * literally. Possible errors are those from the #G_SHELL_ERROR
 635  * domain. Free the returned vector with g_strfreev().
 636  *
 637  * Returns: %TRUE on success, %FALSE if error set
 638  **/
 639 gboolean
 640 g_shell_parse_argv (const gchar *command_line,
 641                     gint        *argcp,
 642                     gchar     ***argvp,
 643                     GError     **error)
 644 {
 645   /* Code based on poptParseArgvString() from libpopt */
 646   gint argc = 0;
 647   gchar **argv = NULL;
 648   GSList *tokens = NULL;
 649   gint i;
 650   GSList *tmp_list;
 651
 652   g_return_val_if_fail (command_line != NULL, FALSE);
 653
 654   tokens = tokenize_command_line (command_line, error);
 655   if (tokens == NULL)
 656     return FALSE;
 657
 658   /* Because we can't have introduced any new blank space into the
 659    * tokens (we didn't do any new expansions), we don't need to
 660    * perform field splitting. If we were going to honor IFS or do any
 661    * expansions, we would have to do field splitting on each word
 662    * here. Also, if we were going to do any expansion we would need to
 663    * remove any zero-length words that didn't contain quotes
 664    * originally; but since there's no expansion we know all words have
 665    * nonzero length, unless they contain quotes.
 666    *
 667    * So, we simply remove quotes, and don't do any field splitting or
 668    * empty word removal, since we know there was no way to introduce
 669    * such things.
 670    */
 671
 672   argc = g_slist_length (tokens);
 673   argv = g_new0 (gchar*, argc + 1);
 674   i = 0;
 675   tmp_list = tokens;
 676   while (tmp_list)
 677     {
 678       argv[i] = g_shell_unquote (tmp_list->data, error);
 679
 680       /* Since we already checked that quotes matched up in the
 681        * tokenizer, this shouldn't be possible to reach I guess.
 682        */
 683       if (argv[i] == NULL)
 684         goto failed;
 685
 686       tmp_list = g_slist_next (tmp_list);
 687       ++i;
 688     }
 689
 690   g_slist_free_full (tokens, g_free);
 691
 692   if (argcp)
 693     *argcp = argc;
 694
 695   if (argvp)
 696     *argvp = argv;
 697   else
 698     g_strfreev (argv);
 699
 700   return TRUE;
 701
 702  failed:
 703
 704   g_assert (error == NULL || *error != NULL);
 705   g_strfreev (argv);
 706   g_slist_free_full (tokens, g_free);
 707
 708   return FALSE;
 709 }