glib/gshell.c

   1 /* gshell.c - Shell-related utilities
   2  *
   3  *  Copyright 2000 Red Hat, Inc.
   4  *  g_execvpe implementation based on GNU libc execvp:
   5  *   Copyright 1991, 92, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
   6  *
   7  * GLib is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public License as
   9  * published by the Free Software Foundation; either version 2 of the
  10  * License, or (at your option) any later version.
  11  *
  12  * GLib is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with GLib; see the file COPYING.LIB.  If not, write
  19  * to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  20  * Boston, MA 02111-1307, USA.
  21  */
  22
  23 #include "config.h"
  24
  25 #include <string.h>
  26
  27 #include "gshell.h"
  28
  29 #include "gslist.h"
  30 #include "gstrfuncs.h"
  31 #include "gstring.h"
  32 #include "gtestutils.h"
  33 #include "glibintl.h"
  34
  35 /**
  36  * SECTION:shell
  37  * @title: Shell-related Utilities
  38  * @short_description: shell-like commandline handling
  39  **/
  40
  41 /**
  42  * G_SHELL_ERROR:
  43  *
  44  * Error domain for shell functions. Errors in this domain will be from
  45  * the #GShellError enumeration. See #GError for information on error
  46  * domains.
  47  **/
  48
  49 /**
  50  * GShellError:
  51  * @G_SHELL_ERROR_BAD_QUOTING: Mismatched or otherwise mangled quoting.
  52  * @G_SHELL_ERROR_EMPTY_STRING: String to be parsed was empty.
  53  * @G_SHELL_ERROR_FAILED: Some other error.
  54  *
  55  * Error codes returned by shell functions.
  56  **/
/* Error-domain quark for the shell utilities, built from the string
 * "g-shell-error-quark".
 * NOTE(review): G_DEFINE_QUARK is declared outside this file; it
 * presumably expands to the definition of g_shell_error_quark(), the
 * function behind G_SHELL_ERROR - confirm against the macro definition.
 */
G_DEFINE_QUARK ("g-shell-error-quark", g_shell_error)
  58
  59 /* Single quotes preserve the literal string exactly. escape
  60  * sequences are not allowed; not even \' - if you want a '
  61  * in the quoted text, you have to do something like 'foo'\''bar'
  62  *
  63  * Double quotes allow $ ` " \ and newline to be escaped with backslash.
  64  * Otherwise double quotes preserve things literally.
  65  */
  66
  67 static gboolean
  68 unquote_string_inplace (gchar* str, gchar** end, GError** err)
  69 {
  70   gchar* dest;
  71   gchar* s;
  72   gchar quote_char;
  73
  74   g_return_val_if_fail(end != NULL, FALSE);
  75   g_return_val_if_fail(err == NULL || *err == NULL, FALSE);
  76   g_return_val_if_fail(str != NULL, FALSE);
  77
  78   dest = s = str;
  79
  80   quote_char = *s;
  81
  82   if (!(*s == '"' || *s == '\''))
  83     {
  84       g_set_error_literal (err,
  85                            G_SHELL_ERROR,
  86                            G_SHELL_ERROR_BAD_QUOTING,
  87                            _("Quoted text doesn't begin with a quotation mark"));
  88       *end = str;
  89       return FALSE;
  90     }
  91
  92   /* Skip the initial quote mark */
  93   ++s;
  94
  95   if (quote_char == '"')
  96     {
  97       while (*s)
  98         {
  99           g_assert(s > dest); /* loop invariant */
 100
 101           switch (*s)
 102             {
 103             case '"':
 104               /* End of the string, return now */
 105               *dest = '\0';
 106               ++s;
 107               *end = s;
 108               return TRUE;
 109               break;
 110
 111             case '\\':
 112               /* Possible escaped quote or \ */
 113               ++s;
 114               switch (*s)
 115                 {
 116                 case '"':
 117                 case '\\':
 118                 case '`':
 119                 case '$':
 120                 case '\n':
 121                   *dest = *s;
 122                   ++s;
 123                   ++dest;
 124                   break;
 125
 126                 default:
 127                   /* not an escaped char */
 128                   *dest = '\\';
 129                   ++dest;
 130                   /* ++s already done. */
 131                   break;
 132                 }
 133               break;
 134
 135             default:
 136               *dest = *s;
 137               ++dest;
 138               ++s;
 139               break;
 140             }
 141
 142           g_assert(s > dest); /* loop invariant */
 143         }
 144     }
 145   else
 146     {
 147       while (*s)
 148         {
 149           g_assert(s > dest); /* loop invariant */
 150
 151           if (*s == '\'')
 152             {
 153               /* End of the string, return now */
 154               *dest = '\0';
 155               ++s;
 156               *end = s;
 157               return TRUE;
 158             }
 159           else
 160             {
 161               *dest = *s;
 162               ++dest;
 163               ++s;
 164             }
 165
 166           g_assert(s > dest); /* loop invariant */
 167         }
 168     }
 169
 170   /* If we reach here this means the close quote was never encountered */
 171
 172   *dest = '\0';
 173
 174   g_set_error_literal (err,
 175                        G_SHELL_ERROR,
 176                        G_SHELL_ERROR_BAD_QUOTING,
 177                        _("Unmatched quotation mark in command line or other shell-quoted text"));
 178   *end = s;
 179   return FALSE;
 180 }
 181
 182 /**
 183  * g_shell_quote:
 184  * @unquoted_string: a literal string
 185  *
 186  * Quotes a string so that the shell (/bin/sh) will interpret the
 187  * quoted string to mean @unquoted_string. If you pass a filename to
 188  * the shell, for example, you should first quote it with this
 189  * function.  The return value must be freed with g_free(). The
 190  * quoting style used is undefined (single or double quotes may be
 191  * used).
 192  *
 193  * Return value: quoted string
 194  **/
 195 gchar*
 196 g_shell_quote (const gchar *unquoted_string)
 197 {
 198   /* We always use single quotes, because the algorithm is cheesier.
 199    * We could use double if we felt like it, that might be more
 200    * human-readable.
 201    */
 202
 203   const gchar *p;
 204   GString *dest;
 205
 206   g_return_val_if_fail (unquoted_string != NULL, NULL);
 207
 208   dest = g_string_new ("'");
 209
 210   p = unquoted_string;
 211
 212   /* could speed this up a lot by appending chunks of text at a
 213    * time.
 214    */
 215   while (*p)
 216     {
 217       /* Replace literal ' with a close ', a \', and a open ' */
 218       if (*p == '\'')
 219         g_string_append (dest, "'\\''");
 220       else
 221         g_string_append_c (dest, *p);
 222
 223       ++p;
 224     }
 225
 226   /* close the quote */
 227   g_string_append_c (dest, '\'');
 228
 229   return g_string_free (dest, FALSE);
 230 }
 231
 232 /**
 233  * g_shell_unquote:
 234  * @quoted_string: shell-quoted string
 235  * @error: error return location or NULL
 236  *
 237  * Unquotes a string as the shell (/bin/sh) would. Only handles
 238  * quotes; if a string contains file globs, arithmetic operators,
 239  * variables, backticks, redirections, or other special-to-the-shell
 240  * features, the result will be different from the result a real shell
 241  * would produce (the variables, backticks, etc. will be passed
 242  * through literally instead of being expanded). This function is
 243  * guaranteed to succeed if applied to the result of
 244  * g_shell_quote(). If it fails, it returns %NULL and sets the
 245  * error. The @quoted_string need not actually contain quoted or
 246  * escaped text; g_shell_unquote() simply goes through the string and
 247  * unquotes/unescapes anything that the shell would. Both single and
 248  * double quotes are handled, as are escapes including escaped
 249  * newlines. The return value must be freed with g_free(). Possible
 250  * errors are in the #G_SHELL_ERROR domain.
 251  *
 252  * Shell quoting rules are a bit strange. Single quotes preserve the
 253  * literal string exactly. escape sequences are not allowed; not even
 254  * \' - if you want a ' in the quoted text, you have to do something
 255  * like 'foo'\''bar'.  Double quotes allow $, `, ", \, and newline to
 256  * be escaped with backslash. Otherwise double quotes preserve things
 257  * literally.
 258  *
 259  * Return value: an unquoted string
 260  **/
 261 gchar*
 262 g_shell_unquote (const gchar *quoted_string,
 263                  GError     **error)
 264 {
 265   gchar *unquoted;
 266   gchar *end;
 267   gchar *start;
 268   GString *retval;
 269
 270   g_return_val_if_fail (quoted_string != NULL, NULL);
 271
 272   unquoted = g_strdup (quoted_string);
 273
 274   start = unquoted;
 275   end = unquoted;
 276   retval = g_string_new (NULL);
 277
 278   /* The loop allows cases such as
 279    * "foo"blah blah'bar'woo foo"baz"la la la\'\''foo'
 280    */
 281   while (*start)
 282     {
 283       /* Append all non-quoted chars, honoring backslash escape
 284        */
 285
 286       while (*start && !(*start == '"' || *start == '\''))
 287         {
 288           if (*start == '\\')
 289             {
 290               /* all characters can get escaped by backslash,
 291                * except newline, which is removed if it follows
 292                * a backslash outside of quotes
 293                */
 294
 295               ++start;
 296               if (*start)
 297                 {
 298                   if (*start != '\n')
 299                     g_string_append_c (retval, *start);
 300                   ++start;
 301                 }
 302             }
 303           else
 304             {
 305               g_string_append_c (retval, *start);
 306               ++start;
 307             }
 308         }
 309
 310       if (*start)
 311         {
 312           if (!unquote_string_inplace (start, &end, error))
 313             {
 314               goto error;
 315             }
 316           else
 317             {
 318               g_string_append (retval, start);
 319               start = end;
 320             }
 321         }
 322     }
 323
 324   g_free (unquoted);
 325   return g_string_free (retval, FALSE);
 326
 327  error:
 328   g_assert (error == NULL || *error != NULL);
 329
 330   g_free (unquoted);
 331   g_string_free (retval, TRUE);
 332   return NULL;
 333 }
 334
/* g_shell_parse_argv() does a semi-arbitrary weird subset of the way
 336  * the shell parses a command line. We don't do variable expansion,
 337  * don't understand that operators are tokens, don't do tilde expansion,
 338  * don't do command substitution, no arithmetic expansion, IFS gets ignored,
 339  * don't do filename globs, don't remove redirection stuff, etc.
 340  *
 341  * READ THE UNIX98 SPEC on "Shell Command Language" before changing
 342  * the behavior of this code.
 343  *
 344  * Steps to parsing the argv string:
 345  *
 346  *  - tokenize the string (but since we ignore operators,
 347  *    our tokenization may diverge from what the shell would do)
 348  *    note that tokenization ignores the internals of a quoted
 349  *    word and it always splits on spaces, not on IFS even
 350  *    if we used IFS. We also ignore "end of input indicator"
 351  *    (I guess this is control-D?)
 352  *
 353  *    Tokenization steps, from UNIX98 with operator stuff removed,
 354  *    are:
 355  *
 356  *    1) "If the current character is backslash, single-quote or
 357  *        double-quote (\, ' or ") and it is not quoted, it will affect
 358  *        quoting for subsequent characters up to the end of the quoted
 359  *        text. The rules for quoting are as described in Quoting
 360  *        . During token recognition no substitutions will be actually
 361  *        performed, and the result token will contain exactly the
 362  *        characters that appear in the input (except for newline
 363  *        character joining), unmodified, including any embedded or
 364  *        enclosing quotes or substitution operators, between the quote
 365  *        mark and the end of the quoted text. The token will not be
 366  *        delimited by the end of the quoted field."
 367  *
 368  *    2) "If the current character is an unquoted newline character,
 369  *        the current token will be delimited."
 370  *
 371  *    3) "If the current character is an unquoted blank character, any
 372  *        token containing the previous character is delimited and the
 373  *        current character will be discarded."
 374  *
 375  *    4) "If the previous character was part of a word, the current
 376  *        character will be appended to that word."
 377  *
 378  *    5) "If the current character is a "#", it and all subsequent
 379  *        characters up to, but excluding, the next newline character
 380  *        will be discarded as a comment. The newline character that
 381  *        ends the line is not considered part of the comment. The
 382  *        "#" starts a comment only when it is at the beginning of a
 383  *        token. Since the search for the end-of-comment does not
 384  *        consider an escaped newline character specially, a comment
 385  *        cannot be continued to the next line."
 386  *
 387  *    6) "The current character will be used as the start of a new word."
 388  *
 389  *
 390  *  - for each token (word), perform portions of word expansion, namely
 391  *    field splitting (using default whitespace IFS) and quote
 392  *    removal.  Field splitting may increase the number of words.
 393  *    Quote removal does not increase the number of words.
 394  *
 395  *   "If the complete expansion appropriate for a word results in an
 396  *   empty field, that empty field will be deleted from the list of
 397  *   fields that form the completely expanded command, unless the
 398  *   original word contained single-quote or double-quote characters."
 399  *    - UNIX98 spec
 400  *
 401  *
 402  */
 403
 404 static inline void
 405 ensure_token (GString **token)
 406 {
 407   if (*token == NULL)
 408     *token = g_string_new (NULL);
 409 }
 410
 411 static void
 412 delimit_token (GString **token,
 413                GSList **retval)
 414 {
 415   if (*token == NULL)
 416     return;
 417
 418   *retval = g_slist_prepend (*retval, g_string_free (*token, FALSE));
 419
 420   *token = NULL;
 421 }
 422
 423 static GSList*
 424 tokenize_command_line (const gchar *command_line,
 425                        GError **error)
 426 {
 427   gchar current_quote;
 428   const gchar *p;
 429   GString *current_token = NULL;
 430   GSList *retval = NULL;
 431   gboolean quoted;
 432
 433   current_quote = '\0';
 434   quoted = FALSE;
 435   p = command_line;
 436
 437   while (*p)
 438     {
 439       if (current_quote == '\\')
 440         {
 441           if (*p == '\n')
 442             {
 443               /* we append nothing; backslash-newline become nothing */
 444             }
 445           else
 446             {
 447               /* we append the backslash and the current char,
 448                * to be interpreted later after tokenization
 449                */
 450               ensure_token (&current_token);
 451               g_string_append_c (current_token, '\\');
 452               g_string_append_c (current_token, *p);
 453             }
 454
 455           current_quote = '\0';
 456         }
 457       else if (current_quote == '#')
 458         {
 459           /* Discard up to and including next newline */
 460           while (*p && *p != '\n')
 461             ++p;
 462
 463           current_quote = '\0';
 464
 465           if (*p == '\0')
 466             break;
 467         }
 468       else if (current_quote)
 469         {
 470           if (*p == current_quote &&
 471               /* check that it isn't an escaped double quote */
 472               !(current_quote == '"' && quoted))
 473             {
 474               /* close the quote */
 475               current_quote = '\0';
 476             }
 477
 478           /* Everything inside quotes, and the close quote,
 479            * gets appended literally.
 480            */
 481
 482           ensure_token (&current_token);
 483           g_string_append_c (current_token, *p);
 484         }
 485       else
 486         {
 487           switch (*p)
 488             {
 489             case '\n':
 490               delimit_token (&current_token, &retval);
 491               break;
 492
 493             case ' ':
 494             case '\t':
 495               /* If the current token contains the previous char, delimit
 496                * the current token. A nonzero length
 497                * token should always contain the previous char.
 498                */
 499               if (current_token &&
 500                   current_token->len > 0)
 501                 {
 502                   delimit_token (&current_token, &retval);
 503                 }
 504
 505               /* discard all unquoted blanks (don't add them to a token) */
 506               break;
 507
 508
 509               /* single/double quotes are appended to the token,
 510                * escapes are maybe appended next time through the loop,
 511                * comment chars are never appended.
 512                */
 513
 514             case '\'':
 515             case '"':
 516               ensure_token (&current_token);
 517               g_string_append_c (current_token, *p);
 518
 519               /* FALL THRU */
 520
 521             case '#':
 522             case '\\':
 523               current_quote = *p;
 524               break;
 525
 526             default:
 527               /* Combines rules 4) and 6) - if we have a token, append to it,
 528                * otherwise create a new token.
 529                */
 530               ensure_token (&current_token);
 531               g_string_append_c (current_token, *p);
 532               break;
 533             }
 534         }
 535
 536       /* We need to count consecutive backslashes mod 2,
 537        * to detect escaped doublequotes.
 538        */
 539       if (*p != '\\')
 540         quoted = FALSE;
 541       else
 542         quoted = !quoted;
 543
 544       ++p;
 545     }
 546
 547   delimit_token (&current_token, &retval);
 548
 549   if (current_quote)
 550     {
 551       if (current_quote == '\\')
 552         g_set_error (error,
 553                      G_SHELL_ERROR,
 554                      G_SHELL_ERROR_BAD_QUOTING,
 555                      _("Text ended just after a '\\' character."
 556                        " (The text was '%s')"),
 557                      command_line);
 558       else
 559         g_set_error (error,
 560                      G_SHELL_ERROR,
 561                      G_SHELL_ERROR_BAD_QUOTING,
 562                      _("Text ended before matching quote was found for %c."
 563                        " (The text was '%s')"),
 564                      current_quote, command_line);
 565
 566       goto error;
 567     }
 568
 569   if (retval == NULL)
 570     {
 571       g_set_error_literal (error,
 572                            G_SHELL_ERROR,
 573                            G_SHELL_ERROR_EMPTY_STRING,
 574                            _("Text was empty (or contained only whitespace)"));
 575
 576       goto error;
 577     }
 578
 579   /* we appended backward */
 580   retval = g_slist_reverse (retval);
 581
 582   return retval;
 583
 584  error:
 585   g_assert (error == NULL || *error != NULL);
 586
 587   g_slist_free_full (retval, g_free);
 588
 589   return NULL;
 590 }
 591
 592 /**
 593  * g_shell_parse_argv:
 594  * @command_line: command line to parse
 595  * @argcp: (out): return location for number of args
 596  * @argvp: (out) (array length=argcp zero-terminated=1): return location for array of args
 597  * @error: return location for error
 598  *
 599  * Parses a command line into an argument vector, in much the same way
 600  * the shell would, but without many of the expansions the shell would
 601  * perform (variable expansion, globs, operators, filename expansion,
 602  * etc. are not supported). The results are defined to be the same as
 603  * those you would get from a UNIX98 /bin/sh, as long as the input
 604  * contains none of the unsupported shell expansions. If the input
 605  * does contain such expansions, they are passed through
 606  * literally. Possible errors are those from the #G_SHELL_ERROR
 607  * domain. Free the returned vector with g_strfreev().
 608  *
 609  * Return value: %TRUE on success, %FALSE if error set
 610  **/
 611 gboolean
 612 g_shell_parse_argv (const gchar *command_line,
 613                     gint        *argcp,
 614                     gchar     ***argvp,
 615                     GError     **error)
 616 {
 617   /* Code based on poptParseArgvString() from libpopt */
 618   gint argc = 0;
 619   gchar **argv = NULL;
 620   GSList *tokens = NULL;
 621   gint i;
 622   GSList *tmp_list;
 623
 624   g_return_val_if_fail (command_line != NULL, FALSE);
 625
 626   tokens = tokenize_command_line (command_line, error);
 627   if (tokens == NULL)
 628     return FALSE;
 629
 630   /* Because we can't have introduced any new blank space into the
 631    * tokens (we didn't do any new expansions), we don't need to
 632    * perform field splitting. If we were going to honor IFS or do any
 633    * expansions, we would have to do field splitting on each word
 634    * here. Also, if we were going to do any expansion we would need to
 635    * remove any zero-length words that didn't contain quotes
 636    * originally; but since there's no expansion we know all words have
 637    * nonzero length, unless they contain quotes.
 638    *
 639    * So, we simply remove quotes, and don't do any field splitting or
 640    * empty word removal, since we know there was no way to introduce
 641    * such things.
 642    */
 643
 644   argc = g_slist_length (tokens);
 645   argv = g_new0 (gchar*, argc + 1);
 646   i = 0;
 647   tmp_list = tokens;
 648   while (tmp_list)
 649     {
 650       argv[i] = g_shell_unquote (tmp_list->data, error);
 651
 652       /* Since we already checked that quotes matched up in the
 653        * tokenizer, this shouldn't be possible to reach I guess.
 654        */
 655       if (argv[i] == NULL)
 656         goto failed;
 657
 658       tmp_list = g_slist_next (tmp_list);
 659       ++i;
 660     }
 661
 662   g_slist_free_full (tokens, g_free);
 663
 664   if (argcp)
 665     *argcp = argc;
 666
 667   if (argvp)
 668     *argvp = argv;
 669   else
 670     g_strfreev (argv);
 671
 672   return TRUE;
 673
 674  failed:
 675
 676   g_assert (error == NULL || *error != NULL);
 677   g_strfreev (argv);
 678   g_slist_free_full (tokens, g_free);
 679
 680   return FALSE;
 681 }