glib/gshell.c

   1 /* gshell.c - Shell-related utilities
   2  *
   3  *  Copyright 2000 Red Hat, Inc.
   4  *  g_execvpe implementation based on GNU libc execvp:
   5  *   Copyright 1991, 92, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
   6  *
   7  * SPDX-License-Identifier: LGPL-2.1-or-later
   8  *
   9  * This library is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public
  11  * License as published by the Free Software Foundation; either
  12  * version 2.1 of the License, or (at your option) any later version.
  13  *
  14  * This library is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public License
  20  * along with this library; if not, see <http://www.gnu.org/licenses/>.
  21  */
  22
  23 #include "config.h"
  24
  25 #include <string.h>
  26
  27 #include "gshell.h"
  28
  29 #include "gslist.h"
  30 #include "gstrfuncs.h"
  31 #include "gstring.h"
  32 #include "gtestutils.h"
  33 #include "glibintl.h"
  34 #include "gthread.h"
  35
  36 /**
  37  * SECTION:shell
  38  * @title: Shell-related Utilities
  39  * @short_description: shell-like commandline handling
  40  *
  41  * GLib provides the functions g_shell_quote() and g_shell_unquote()
  42  * to handle shell-like quoting in strings. The function g_shell_parse_argv()
  43  * parses a string similar to the way a POSIX shell (/bin/sh) would.
  44  *
  45  * Note that string handling in shells has many obscure and historical
  46  * corner-cases which these functions do not necessarily reproduce. They
  47  * are good enough in practice, though.
  48  */
  49
  50 /**
  51  * G_SHELL_ERROR:
  52  *
  53  * Error domain for shell functions.
  54  *
  55  * Errors in this domain will be from the #GShellError enumeration.
  56  *
  57  * See #GError for information on error domains.
  58  **/
  59
  60 /**
  61  * GShellError:
  62  * @G_SHELL_ERROR_BAD_QUOTING: Mismatched or otherwise mangled quoting.
  63  * @G_SHELL_ERROR_EMPTY_STRING: String to be parsed was empty.
  64  * @G_SHELL_ERROR_FAILED: Some other error.
  65  *
  66  * Error codes returned by shell functions.
  67  **/
   /* Quark for the shell error domain.  Following GLib's G_DEFINE_QUARK
    * convention this is expected to expand to the definition of
    * g_shell_error_quark(), backed by the string "g-shell-error-quark"
    * (the macro itself is defined outside this file).
    */
   68 G_DEFINE_QUARK (g-shell-error-quark, g_shell_error)
  69
   70 /* Single quotes preserve the literal string exactly. Escape
  71  * sequences are not allowed; not even \' - if you want a '
  72  * in the quoted text, you have to do something like 'foo'\''bar'
  73  *
  74  * Double quotes allow $ ` " \ and newline to be escaped with backslash.
  75  * Otherwise double quotes preserve things literally.
  76  */
  77
  78 static gboolean
  79 unquote_string_inplace (gchar* str, gchar** end, GError** err)
  80 {
  81   gchar* dest;
  82   gchar* s;
  83   gchar quote_char;
  84
  85   g_return_val_if_fail(end != NULL, FALSE);
  86   g_return_val_if_fail(err == NULL || *err == NULL, FALSE);
  87   g_return_val_if_fail(str != NULL, FALSE);
  88
  89   dest = s = str;
  90
  91   quote_char = *s;
  92
  93   if (!(*s == '"' || *s == '\''))
  94     {
  95       g_set_error_literal (err,
  96                            G_SHELL_ERROR,
  97                            G_SHELL_ERROR_BAD_QUOTING,
  98                            _("Quoted text doesn’t begin with a quotation mark"));
  99       *end = str;
 100       return FALSE;
 101     }
 102
 103   /* Skip the initial quote mark */
 104   ++s;
 105
 106   if (quote_char == '"')
 107     {
 108       while (*s)
 109         {
 110           g_assert(s > dest); /* loop invariant */
 111
 112           switch (*s)
 113             {
 114             case '"':
 115               /* End of the string, return now */
 116               *dest = '\0';
 117               ++s;
 118               *end = s;
 119               return TRUE;
 120               break;
 121
 122             case '\\':
 123               /* Possible escaped quote or \ */
 124               ++s;
 125               switch (*s)
 126                 {
 127                 case '"':
 128                 case '\\':
 129                 case '`':
 130                 case '$':
 131                 case '\n':
 132                   *dest = *s;
 133                   ++s;
 134                   ++dest;
 135                   break;
 136
 137                 default:
 138                   /* not an escaped char */
 139                   *dest = '\\';
 140                   ++dest;
 141                   /* ++s already done. */
 142                   break;
 143                 }
 144               break;
 145
 146             default:
 147               *dest = *s;
 148               ++dest;
 149               ++s;
 150               break;
 151             }
 152
 153           g_assert(s > dest); /* loop invariant */
 154         }
 155     }
 156   else
 157     {
 158       while (*s)
 159         {
 160           g_assert(s > dest); /* loop invariant */
 161
 162           if (*s == '\'')
 163             {
 164               /* End of the string, return now */
 165               *dest = '\0';
 166               ++s;
 167               *end = s;
 168               return TRUE;
 169             }
 170           else
 171             {
 172               *dest = *s;
 173               ++dest;
 174               ++s;
 175             }
 176
 177           g_assert(s > dest); /* loop invariant */
 178         }
 179     }
 180
 181   /* If we reach here this means the close quote was never encountered */
 182
 183   *dest = '\0';
 184
 185   g_set_error_literal (err,
 186                        G_SHELL_ERROR,
 187                        G_SHELL_ERROR_BAD_QUOTING,
 188                        _("Unmatched quotation mark in command line or other shell-quoted text"));
 189   *end = s;
 190   return FALSE;
 191 }
 192
 193 /**
 194  * g_shell_quote:
 195  * @unquoted_string: (type filename): a literal string
 196  *
 197  * Quotes a string so that the shell (/bin/sh) will interpret the
 198  * quoted string to mean @unquoted_string.
 199  *
 200  * If you pass a filename to the shell, for example, you should first
 201  * quote it with this function.
 202  *
 203  * The return value must be freed with g_free().
 204  *
 205  * The quoting style used is undefined (single or double quotes may be
 206  * used).
 207  *
 208  * Returns: (type filename) (transfer full): quoted string
 209  **/
 210 gchar*
 211 g_shell_quote (const gchar *unquoted_string)
 212 {
 213   /* We always use single quotes, because the algorithm is cheesier.
 214    * We could use double if we felt like it, that might be more
 215    * human-readable.
 216    */
 217
 218   const gchar *p;
 219   GString *dest;
 220
 221   g_return_val_if_fail (unquoted_string != NULL, NULL);
 222
 223   dest = g_string_new ("'");
 224
 225   p = unquoted_string;
 226
 227   /* could speed this up a lot by appending chunks of text at a
 228    * time.
 229    */
 230   while (*p)
 231     {
 232       /* Replace literal ' with a close ', a \', and an open ' */
 233       if (*p == '\'')
 234         g_string_append (dest, "'\\''");
 235       else
 236         g_string_append_c (dest, *p);
 237
 238       ++p;
 239     }
 240
 241   /* close the quote */
 242   g_string_append_c (dest, '\'');
 243
 244   return g_string_free (dest, FALSE);
 245 }
 246
 247 /**
 248  * g_shell_unquote:
 249  * @quoted_string: (type filename): shell-quoted string
 250  * @error: error return location or NULL
 251  *
 252  * Unquotes a string as the shell (/bin/sh) would.
 253  *
 254  * This function only handles quotes; if a string contains file globs,
 255  * arithmetic operators, variables, backticks, redirections, or other
 256  * special-to-the-shell features, the result will be different from the
 257  * result a real shell would produce (the variables, backticks, etc.
 258  * will be passed through literally instead of being expanded).
 259  *
 260  * This function is guaranteed to succeed if applied to the result of
 261  * g_shell_quote(). If it fails, it returns %NULL and sets the
 262  * error.
 263  *
 264  * The @quoted_string need not actually contain quoted or escaped text;
 265  * g_shell_unquote() simply goes through the string and unquotes/unescapes
 266  * anything that the shell would. Both single and double quotes are
 267  * handled, as are escapes including escaped newlines.
 268  *
 269  * The return value must be freed with g_free().
 270  *
 271  * Possible errors are in the %G_SHELL_ERROR domain.
 272  *
 273  * Shell quoting rules are a bit strange. Single quotes preserve the
  274  * literal string exactly. Escape sequences are not allowed; not even
 275  * `\'` - if you want a `'` in the quoted text, you have to do something
 276  * like `'foo'\''bar'`. Double quotes allow `$`, ```, `"`, `\`, and
 277  * newline to be escaped with backslash. Otherwise double quotes
 278  * preserve things literally.
 279  *
 280  * Returns: (type filename): an unquoted string
 281  **/
 282 gchar*
 283 g_shell_unquote (const gchar *quoted_string,
 284                  GError     **error)
 285 {
 286   gchar *unquoted;
 287   gchar *end;
 288   gchar *start;
 289   GString *retval;
 290
 291   g_return_val_if_fail (quoted_string != NULL, NULL);
 292
 293   unquoted = g_strdup (quoted_string);
 294
 295   start = unquoted;
 296   end = unquoted;
 297   retval = g_string_new (NULL);
 298
 299   /* The loop allows cases such as
 300    * "foo"blah blah'bar'woo foo"baz"la la la\'\''foo'
 301    */
 302   while (*start)
 303     {
 304       /* Append all non-quoted chars, honoring backslash escape
 305        */
 306
 307       while (*start && !(*start == '"' || *start == '\''))
 308         {
 309           if (*start == '\\')
 310             {
 311               /* all characters can get escaped by backslash,
 312                * except newline, which is removed if it follows
 313                * a backslash outside of quotes
 314                */
 315
 316               ++start;
 317               if (*start)
 318                 {
 319                   if (*start != '\n')
 320                     g_string_append_c (retval, *start);
 321                   ++start;
 322                 }
 323             }
 324           else
 325             {
 326               g_string_append_c (retval, *start);
 327               ++start;
 328             }
 329         }
 330
 331       if (*start)
 332         {
 333           if (!unquote_string_inplace (start, &end, error))
 334             {
 335               goto error;
 336             }
 337           else
 338             {
 339               g_string_append (retval, start);
 340               start = end;
 341             }
 342         }
 343     }
 344
 345   g_free (unquoted);
 346   return g_string_free (retval, FALSE);
 347
 348  error:
 349   g_assert (error == NULL || *error != NULL);
 350
 351   g_free (unquoted);
 352   g_string_free (retval, TRUE);
 353   return NULL;
 354 }
 355
  356 /* g_shell_parse_argv() does a semi-arbitrary weird subset of the way
 357  * the shell parses a command line. We don't do variable expansion,
 358  * don't understand that operators are tokens, don't do tilde expansion,
 359  * don't do command substitution, no arithmetic expansion, IFS gets ignored,
 360  * don't do filename globs, don't remove redirection stuff, etc.
 361  *
 362  * READ THE UNIX98 SPEC on "Shell Command Language" before changing
 363  * the behavior of this code.
 364  *
 365  * Steps to parsing the argv string:
 366  *
 367  *  - tokenize the string (but since we ignore operators,
 368  *    our tokenization may diverge from what the shell would do)
 369  *    note that tokenization ignores the internals of a quoted
 370  *    word and it always splits on spaces, not on IFS even
 371  *    if we used IFS. We also ignore "end of input indicator"
 372  *    (I guess this is control-D?)
 373  *
 374  *    Tokenization steps, from UNIX98 with operator stuff removed,
 375  *    are:
 376  *
 377  *    1) "If the current character is backslash, single-quote or
 378  *        double-quote (\, ' or ") and it is not quoted, it will affect
 379  *        quoting for subsequent characters up to the end of the quoted
 380  *        text. The rules for quoting are as described in Quoting
 381  *        . During token recognition no substitutions will be actually
 382  *        performed, and the result token will contain exactly the
 383  *        characters that appear in the input (except for newline
 384  *        character joining), unmodified, including any embedded or
 385  *        enclosing quotes or substitution operators, between the quote
 386  *        mark and the end of the quoted text. The token will not be
 387  *        delimited by the end of the quoted field."
 388  *
 389  *    2) "If the current character is an unquoted newline character,
 390  *        the current token will be delimited."
 391  *
 392  *    3) "If the current character is an unquoted blank character, any
 393  *        token containing the previous character is delimited and the
 394  *        current character will be discarded."
 395  *
 396  *    4) "If the previous character was part of a word, the current
 397  *        character will be appended to that word."
 398  *
 399  *    5) "If the current character is a "#", it and all subsequent
 400  *        characters up to, but excluding, the next newline character
 401  *        will be discarded as a comment. The newline character that
 402  *        ends the line is not considered part of the comment. The
 403  *        "#" starts a comment only when it is at the beginning of a
 404  *        token. Since the search for the end-of-comment does not
 405  *        consider an escaped newline character specially, a comment
 406  *        cannot be continued to the next line."
 407  *
 408  *    6) "The current character will be used as the start of a new word."
 409  *
 410  *
 411  *  - for each token (word), perform portions of word expansion, namely
 412  *    field splitting (using default whitespace IFS) and quote
 413  *    removal.  Field splitting may increase the number of words.
 414  *    Quote removal does not increase the number of words.
 415  *
 416  *   "If the complete expansion appropriate for a word results in an
 417  *   empty field, that empty field will be deleted from the list of
 418  *   fields that form the completely expanded command, unless the
 419  *   original word contained single-quote or double-quote characters."
 420  *    - UNIX98 spec
 421  *
 422  *
 423  */
 424
 425 static inline void
 426 ensure_token (GString **token)
 427 {
 428   if (*token == NULL)
 429     *token = g_string_new (NULL);
 430 }
 431
 432 static void
 433 delimit_token (GString **token,
 434                GSList **retval)
 435 {
 436   if (*token == NULL)
 437     return;
 438
 439   *retval = g_slist_prepend (*retval, g_string_free (*token, FALSE));
 440
 441   *token = NULL;
 442 }
 443
 444 static GSList*
 445 tokenize_command_line (const gchar *command_line,
 446                        GError **error)
 447 {
 448   gchar current_quote;
 449   const gchar *p;
 450   GString *current_token = NULL;
 451   GSList *retval = NULL;
 452   gboolean quoted;
 453
 454   current_quote = '\0';
 455   quoted = FALSE;
 456   p = command_line;
 457
 458   while (*p)
 459     {
 460       if (current_quote == '\\')
 461         {
 462           if (*p == '\n')
 463             {
 464               /* we append nothing; backslash-newline become nothing */
 465             }
 466           else
 467             {
 468               /* we append the backslash and the current char,
 469                * to be interpreted later after tokenization
 470                */
 471               ensure_token (&current_token);
 472               g_string_append_c (current_token, '\\');
 473               g_string_append_c (current_token, *p);
 474             }
 475
 476           current_quote = '\0';
 477         }
 478       else if (current_quote == '#')
 479         {
 480           /* Discard up to and including next newline */
 481           while (*p && *p != '\n')
 482             ++p;
 483
 484           current_quote = '\0';
 485
 486           if (*p == '\0')
 487             break;
 488         }
 489       else if (current_quote)
 490         {
 491           if (*p == current_quote &&
 492               /* check that it isn't an escaped double quote */
 493               !(current_quote == '"' && quoted))
 494             {
 495               /* close the quote */
 496               current_quote = '\0';
 497             }
 498
 499           /* Everything inside quotes, and the close quote,
 500            * gets appended literally.
 501            */
 502
 503           ensure_token (&current_token);
 504           g_string_append_c (current_token, *p);
 505         }
 506       else
 507         {
 508           switch (*p)
 509             {
 510             case '\n':
 511               delimit_token (&current_token, &retval);
 512               break;
 513
 514             case ' ':
 515             case '\t':
 516               /* If the current token contains the previous char, delimit
 517                * the current token. A nonzero length
 518                * token should always contain the previous char.
 519                */
 520               if (current_token &&
 521                   current_token->len > 0)
 522                 {
 523                   delimit_token (&current_token, &retval);
 524                 }
 525
 526               /* discard all unquoted blanks (don't add them to a token) */
 527               break;
 528
 529
 530               /* single/double quotes are appended to the token,
 531                * escapes are maybe appended next time through the loop,
 532                * comment chars are never appended.
 533                */
 534
 535             case '\'':
 536             case '"':
 537               ensure_token (&current_token);
 538               g_string_append_c (current_token, *p);
 539
 540               G_GNUC_FALLTHROUGH;
 541             case '\\':
 542               current_quote = *p;
 543               break;
 544
 545             case '#':
 546               if (p == command_line)
 547                 { /* '#' was the first char */
 548                   current_quote = *p;
 549                   break;
 550                 }
 551               switch(*(p-1))
 552                 {
 553                   case ' ':
 554                   case '\n':
 555                   case '\0':
 556                     current_quote = *p;
 557                     break;
 558                   default:
 559                     ensure_token (&current_token);
 560                     g_string_append_c (current_token, *p);
 561                     break;
 562                 }
 563               break;
 564
 565             default:
 566               /* Combines rules 4) and 6) - if we have a token, append to it,
 567                * otherwise create a new token.
 568                */
 569               ensure_token (&current_token);
 570               g_string_append_c (current_token, *p);
 571               break;
 572             }
 573         }
 574
 575       /* We need to count consecutive backslashes mod 2,
 576        * to detect escaped doublequotes.
 577        */
 578       if (*p != '\\')
 579         quoted = FALSE;
 580       else
 581         quoted = !quoted;
 582
 583       ++p;
 584     }
 585
 586   delimit_token (&current_token, &retval);
 587
 588   if (current_quote)
 589     {
 590       if (current_quote == '\\')
 591         g_set_error (error,
 592                      G_SHELL_ERROR,
 593                      G_SHELL_ERROR_BAD_QUOTING,
 594                      _("Text ended just after a “\\” character."
 595                        " (The text was “%s”)"),
 596                      command_line);
 597       else
 598         g_set_error (error,
 599                      G_SHELL_ERROR,
 600                      G_SHELL_ERROR_BAD_QUOTING,
 601                      _("Text ended before matching quote was found for %c."
 602                        " (The text was “%s”)"),
 603                      current_quote, command_line);
 604
 605       goto error;
 606     }
 607
 608   if (retval == NULL)
 609     {
 610       g_set_error_literal (error,
 611                            G_SHELL_ERROR,
 612                            G_SHELL_ERROR_EMPTY_STRING,
 613                            _("Text was empty (or contained only whitespace)"));
 614
 615       goto error;
 616     }
 617
 618   /* we appended backward */
 619   retval = g_slist_reverse (retval);
 620
 621   return retval;
 622
 623  error:
 624   g_assert (error == NULL || *error != NULL);
 625
 626   g_slist_free_full (retval, g_free);
 627
 628   return NULL;
 629 }
 630
 631 /**
 632  * g_shell_parse_argv:
 633  * @command_line: (type filename): command line to parse
 634  * @argcp: (out) (optional): return location for number of args
 635  * @argvp: (out) (optional) (array length=argcp zero-terminated=1) (element-type filename):
 636  *   return location for array of args
 637  * @error: (optional): return location for error
 638  *
 639  * Parses a command line into an argument vector, in much the same way
 640  * the shell would, but without many of the expansions the shell would
 641  * perform (variable expansion, globs, operators, filename expansion,
 642  * etc. are not supported).
 643  *
 644  * The results are defined to be the same as those you would get from
 645  * a UNIX98 `/bin/sh`, as long as the input contains none of the
 646  * unsupported shell expansions. If the input does contain such expansions,
 647  * they are passed through literally.
 648  *
 649  * Possible errors are those from the %G_SHELL_ERROR domain.
 650  *
 651  * In particular, if @command_line is an empty string (or a string containing
 652  * only whitespace), %G_SHELL_ERROR_EMPTY_STRING will be returned. It’s
 653  * guaranteed that @argvp will be a non-empty array if this function returns
 654  * successfully.
 655  *
 656  * Free the returned vector with g_strfreev().
 657  *
 658  * Returns: %TRUE on success, %FALSE if error set
 659  **/
 660 gboolean
 661 g_shell_parse_argv (const gchar *command_line,
 662                     gint        *argcp,
 663                     gchar     ***argvp,
 664                     GError     **error)
 665 {
 666   /* Code based on poptParseArgvString() from libpopt */
 667   gint argc = 0;
 668   gchar **argv = NULL;
 669   GSList *tokens = NULL;
 670   gint i;
 671   GSList *tmp_list;
 672
 673   g_return_val_if_fail (command_line != NULL, FALSE);
 674
 675   tokens = tokenize_command_line (command_line, error);
 676   if (tokens == NULL)
 677     return FALSE;
 678
 679   /* Because we can't have introduced any new blank space into the
 680    * tokens (we didn't do any new expansions), we don't need to
 681    * perform field splitting. If we were going to honor IFS or do any
 682    * expansions, we would have to do field splitting on each word
 683    * here. Also, if we were going to do any expansion we would need to
 684    * remove any zero-length words that didn't contain quotes
 685    * originally; but since there's no expansion we know all words have
 686    * nonzero length, unless they contain quotes.
 687    *
 688    * So, we simply remove quotes, and don't do any field splitting or
 689    * empty word removal, since we know there was no way to introduce
 690    * such things.
 691    */
 692
 693   argc = g_slist_length (tokens);
 694   argv = g_new0 (gchar*, argc + 1);
 695   i = 0;
 696   tmp_list = tokens;
 697   while (tmp_list)
 698     {
 699       argv[i] = g_shell_unquote (tmp_list->data, error);
 700
 701       /* Since we already checked that quotes matched up in the
 702        * tokenizer, this shouldn't be possible to reach I guess.
 703        */
 704       if (argv[i] == NULL)
 705         goto failed;
 706
 707       tmp_list = g_slist_next (tmp_list);
 708       ++i;
 709     }
 710
 711   g_slist_free_full (tokens, g_free);
 712
 713   g_assert (argc > 0);
 714   g_assert (argv != NULL && argv[0] != NULL);
 715
 716   if (argcp)
 717     *argcp = argc;
 718
 719   if (argvp)
 720     *argvp = argv;
 721   else
 722     g_strfreev (argv);
 723
 724   return TRUE;
 725
 726  failed:
 727
 728   g_assert (error == NULL || *error != NULL);
 729   g_strfreev (argv);
 730   g_slist_free_full (tokens, g_free);
 731
 732   return FALSE;
 733 }