scripts/glibcpp.py

   1 #! /usr/bin/python3
   2 # Approximation to C preprocessing.
   3 # Copyright (C) 2019-2024 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <https://www.gnu.org/licenses/>.
  19
  20 """
  21 Simplified lexical analyzer for C preprocessing tokens.
  22
  23 Does not implement trigraphs.
  24
  25 Does not implement backslash-newline in the middle of any lexical
  26 item other than a string literal.
  27
  28 Does not implement universal-character-names in identifiers.
  29
  30 Treats prefixed strings (e.g. L"...") as two tokens (L and "...").
  31
  32 Accepts non-ASCII characters only within comments and strings.
  33 """
  34
  35 import collections
  36 import operator
  37 import re
  38 import sys
  39
  40 # Caution: The order of the outermost alternation matters.
  41 # STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
  42 # BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
  43 # be last.
  44 # Caution: There should be no capturing groups other than the named
  45 # captures in the outermost alternation.
  46
  47 # For reference, these are all of the C punctuators as of C11:
  48 #   [ ] ( ) { } , ; ? ~
  49 #   ! != * *= / /= ^ ^= = ==
  50 #   # ##
  51 #   % %= %> %: %:%:
  52 #   & &= &&
  53 #   | |= ||
  54 #   + += ++
  55 #   - -= -- ->
  56 #   . ...
  57 #   : :>
  58 #   < <% <: << <<= <=
  59 #   > >= >> >>=
  60
  61 # The BAD_* tokens are not part of the official definition of pp-tokens;
  62 # they match unclosed strings, character constants, and block comments,
  63 # so that the regex engine doesn't have to backtrack all the way to the
  64 # beginning of a broken construct and then emit dozens of junk tokens.
  65
  66 PP_TOKEN_RE_ = re.compile(r"""
  67     (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
  68    |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
  69    |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
  70    |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
  71    |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
  72    |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
  73    |(?P<LINE_COMMENT>  //[^\r\n]*)
  74    |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
  75    |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
  76    |(?P<PUNCTUATOR>
  77        [,;?~(){}\[\]]
  78      | [!*/^=]=?
  79      | \#\#?
  80      | %(?:[=>]|:(?:%:)?)?
  81      | &[=&]?
  82      |\|[=|]?
  83      |\+[=+]?
  84      | -[=->]?
  85      |\.(?:\.\.)?
  86      | :>?
  87      | <(?:[%:]|<(?:=|<=?)?)?
  88      | >(?:=|>=?)?)
  89    |(?P<ESCNL>         \\(?:\r|\n|\r\n))
  90    |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
  91    |(?P<OTHER>         .)
  92 """, re.DOTALL | re.VERBOSE)
  93
  94 HEADER_NAME_RE_ = re.compile(r"""
  95     < [^>\r\n]+ >
  96   | " [^"\r\n]+ "
  97 """, re.DOTALL | re.VERBOSE)
  98
  99 ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")
 100
 101 # based on the sample code in the Python re documentation
 102 Token_ = collections.namedtuple("Token", (
 103     "kind", "text", "line", "column", "context"))
 104 Token_.__doc__ = """
 105    One C preprocessing token, comment, or chunk of whitespace.
 106    'kind' identifies the token type, which will be one of:
 107        STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
 108        PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
 109        or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
 110        handled within tokenize_c, below.
 111
 112    'text' is the sequence of source characters making up the token;
 113        no decoding whatsoever is performed.
 114
 115    'line' and 'column' give the position of the first character of the
 116       token within the source file.  They are both 1-based.
 117
 118    'context' indicates whether or not this token occurred within a
 119       preprocessing directive; it will be None for running text,
 120       '<null>' for the leading '#' of a directive line (because '#'
 121       all by itself on a line is a "null directive"), or the name of
 122       the directive for tokens within a directive line, starting with
 123       the IDENT for the name itself.
 124 """
 125
 126 def tokenize_c(file_contents, reporter):
 127     """Yield a series of Token objects, one for each preprocessing
 128        token, comment, or chunk of whitespace within FILE_CONTENTS.
 129        The REPORTER object is expected to have one method,
 130        reporter.error(token, message), which will be called to
 131        indicate a lexical error at the position of TOKEN.
 132        If MESSAGE contains the four-character sequence '{!r}', that
 133        is expected to be replaced by repr(token.text).
 134     """
 135
 136     Token = Token_
 137     PP_TOKEN_RE = PP_TOKEN_RE_
 138     ENDLINE_RE = ENDLINE_RE_
 139     HEADER_NAME_RE = HEADER_NAME_RE_
 140
 141     line_num = 1
 142     line_start = 0
 143     pos = 0
 144     limit = len(file_contents)
 145     directive = None
 146     at_bol = True
 147     while pos < limit:
 148         if directive == "include":
 149             mo = HEADER_NAME_RE.match(file_contents, pos)
 150             if mo:
 151                 kind = "HEADER_NAME"
 152                 directive = "after_include"
 153             else:
 154                 mo = PP_TOKEN_RE.match(file_contents, pos)
 155                 kind = mo.lastgroup
 156                 if kind != "WHITESPACE":
 157                     directive = "after_include"
 158         else:
 159             mo = PP_TOKEN_RE.match(file_contents, pos)
 160             kind = mo.lastgroup
 161
 162         text = mo.group()
 163         line = line_num
 164         column = mo.start() - line_start
 165         adj_line_start = 0
 166         # only these kinds can contain a newline
 167         if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
 168                     "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
 169             for tmo in ENDLINE_RE.finditer(text):
 170                 line_num += 1
 171                 adj_line_start = tmo.end()
 172             if adj_line_start:
 173                 line_start = mo.start() + adj_line_start
 174
 175         # Track whether or not we are scanning a preprocessing directive.
 176         if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
 177             at_bol = True
 178             directive = None
 179         else:
 180             if kind == "PUNCTUATOR" and text == "#" and at_bol:
 181                 directive = "<null>"
 182             elif kind == "IDENT" and directive == "<null>":
 183                 directive = text
 184             at_bol = False
 185
 186         # Report ill-formed tokens and rewrite them as their well-formed
 187         # equivalents, so downstream processing doesn't have to know about them.
 188         # (Rewriting instead of discarding provides better error recovery.)
 189         if kind == "BAD_BLOCK_COM":
 190             reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
 191                            "unclosed block comment")
 192             text += "*/"
 193             kind = "BLOCK_COMMENT"
 194         elif kind == "BAD_STRING":
 195             reporter.error(Token("BAD_STRING", "", line, column+1, ""),
 196                            "unclosed string")
 197             text += "\""
 198             kind = "STRING"
 199         elif kind == "BAD_CHARCONST":
 200             reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
 201                            "unclosed char constant")
 202             text += "'"
 203             kind = "CHARCONST"
 204
 205         tok = Token(kind, text, line, column+1,
 206                     "include" if directive == "after_include" else directive)
 207         # Do not complain about OTHER tokens inside macro definitions.
 208         # $ and @ appear in macros defined by headers intended to be
 209         # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
 210         if kind == "OTHER" and directive != "define":
 211             self.error(tok, "stray {!r} in program")
 212
 213         yield tok
 214         pos = mo.end()
 215
 216 class MacroDefinition(collections.namedtuple('MacroDefinition',
 217                                              'name_token args body error')):
 218     """A preprocessor macro definition.
 219
 220     name_token is the Token_ for the name.
 221
 222     args is None for a macro that is not function-like.  Otherwise, it
 223     is a tuple that contains the macro argument name tokens.
 224
 225     body is a tuple that contains the tokens that constitute the body
 226     of the macro definition (excluding whitespace).
 227
 228     error is None if no error was detected, or otherwise a problem
 229     description associated with this macro definition.
 230
 231     """
 232
 233     @property
 234     def function(self):
 235         """Return true if the macro is function-like."""
 236         return self.args is not None
 237
 238     @property
 239     def name(self):
 240         """Return the name of the macro being defined."""
 241         return self.name_token.text
 242
 243     @property
 244     def line(self):
 245         """Return the line number of the macro definition."""
 246         return self.name_token.line
 247
 248     @property
 249     def args_lowered(self):
 250         """Return the macro argument list as a list of strings"""
 251         if self.function:
 252             return [token.text for token in self.args]
 253         else:
 254             return None
 255
 256     @property
 257     def body_lowered(self):
 258         """Return the macro body as a list of strings."""
 259         return [token.text for token in self.body]
 260
 261 def macro_definitions(tokens):
 262     """A generator for C macro definitions among tokens.
 263
 264     The generator yields MacroDefinition objects.
 265
 266     tokens must be iterable, yielding Token_ objects.
 267
 268     """
 269
 270     macro_name = None
 271     macro_start = False # Set to false after macro name and one otken.
 272     macro_args = None # Set to a list during the macro argument sequence.
 273     in_macro_args = False # True while processing macro identifier-list.
 274     error = None
 275     body = []
 276
 277     for token in tokens:
 278         if token.context == 'define' and macro_name is None \
 279            and token.kind == 'IDENT':
 280             # Starting up macro processing.
 281             if macro_start:
 282                 # First identifier is the macro name.
 283                 macro_name = token
 284             else:
 285                 # Next token is the name.
 286                 macro_start = True
 287             continue
 288
 289         if macro_name is None:
 290             # Drop tokens not in macro definitions.
 291             continue
 292
 293         if token.context != 'define':
 294             # End of the macro definition.
 295             if in_macro_args and error is None:
 296                 error = 'macro definition ends in macro argument list'
 297             yield MacroDefinition(macro_name, macro_args, tuple(body), error)
 298             # No longer in a macro definition.
 299             macro_name = None
 300             macro_start = False
 301             macro_args = None
 302             in_macro_args = False
 303             error = None
 304             body.clear()
 305             continue
 306
 307         if macro_start:
 308             # First token after the macro name.
 309             macro_start = False
 310             if token.kind == 'PUNCTUATOR' and token.text == '(':
 311                 macro_args = []
 312                 in_macro_args = True
 313             continue
 314
 315         if in_macro_args:
 316             if token.kind == 'IDENT' \
 317                or (token.kind == 'PUNCTUATOR' and token.text == '...'):
 318                 # Macro argument or ... placeholder.
 319                 macro_args.append(token)
 320             if token.kind == 'PUNCTUATOR':
 321                 if token.text == ')':
 322                     macro_args = tuple(macro_args)
 323                     in_macro_args = False
 324                 elif token.text == ',':
 325                     pass # Skip.  Not a full syntax check.
 326                 elif error is None:
 327                     error = 'invalid punctuator in macro argument list: ' \
 328                         + repr(token.text)
 329             elif error is None:
 330                 error = 'invalid {} token in macro argument list'.format(
 331                     token.kind)
 332             continue
 333
 334         if token.kind not in ('WHITESPACE', 'BLOCK_COMMENT'):
 335             body.append(token)
 336
 337     # Emit the macro in case the last line does not end with a newline.
 338     if macro_name is not None:
 339         if in_macro_args and error is None:
 340             error = 'macro definition ends in macro argument list'
 341         yield MacroDefinition(macro_name, macro_args, tuple(body), error)
 342
 343 # Used to split UL etc. suffixes from numbers such as 123UL.
 344 RE_SPLIT_INTEGER_SUFFIX = re.compile(r'([^ullULL]+)([ullULL]*)')
 345
 346 BINARY_OPERATORS = {
 347     '+': operator.add,
 348     '<<': operator.lshift,
 349     '|': operator.or_,
 350 }
 351
 352 # Use the general-purpose dict type if it is order-preserving.
 353 if (sys.version_info[0], sys.version_info[1]) <= (3, 6):
 354     OrderedDict = collections.OrderedDict
 355 else:
 356     OrderedDict = dict
 357
 358 def macro_eval(macro_defs, reporter):
 359     """Compute macro values
 360
 361     macro_defs is the output from macro_definitions.  reporter is an
 362     object that accepts reporter.error(line_number, message) and
 363     reporter.note(line_number, message) calls to report errors
 364     and error context invocations.
 365
 366     The returned dict contains the values of macros which are not
 367     function-like, pairing their names with their computed values.
 368
 369     The current implementation is incomplete.  It is deliberately not
 370     entirely faithful to C, even in the implemented parts.  It checks
 371     that macro replacements follow certain syntactic rules even if
 372     they are never evaluated.
 373
 374     """
 375
 376     # Unevaluated macro definitions by name.
 377     definitions = OrderedDict()
 378     for md in macro_defs:
 379         if md.name in definitions:
 380             reporter.error(md.line, 'macro {} redefined'.format(md.name))
 381             reporter.note(definitions[md.name].line,
 382                           'location of previous definition')
 383         else:
 384             definitions[md.name] = md
 385
 386     # String to value mappings for fully evaluated macros.
 387     evaluated = OrderedDict()
 388
 389     # String to macro definitions during evaluation.  Nice error
 390     # reporting relies on deterministic iteration order.
 391     stack = OrderedDict()
 392
 393     def eval_token(current, token):
 394         """Evaluate one macro token.
 395
 396         Integers and strings are returned as such (the latter still
 397         quoted).  Identifiers are expanded.
 398
 399         None indicates an empty expansion or an error.
 400
 401         """
 402
 403         if token.kind == 'PP_NUMBER':
 404             value = None
 405             m = RE_SPLIT_INTEGER_SUFFIX.match(token.text)
 406             if m:
 407                 try:
 408                     value = int(m.group(1), 0)
 409                 except ValueError:
 410                     pass
 411             if value is None:
 412                 reporter.error(token.line,
 413                     'invalid number {!r} in definition of {}'.format(
 414                         token.text, current.name))
 415             return value
 416
 417         if token.kind == 'STRING':
 418             return token.text
 419
 420         if token.kind == 'CHARCONST' and len(token.text) == 3:
 421             return ord(token.text[1])
 422
 423         if token.kind == 'IDENT':
 424             name = token.text
 425             result = eval1(current, name)
 426             if name not in evaluated:
 427                 evaluated[name] = result
 428             return result
 429
 430         reporter.error(token.line,
 431             'unrecognized {!r} in definition of {}'.format(
 432                 token.text, current.name))
 433         return None
 434
 435
 436     def eval1(current, name):
 437         """Evaluate one name.
 438
 439         The name is looked up and the macro definition evaluated
 440         recursively if necessary.  The current argument is the macro
 441         definition being evaluated.
 442
 443         None as a return value indicates an error.
 444
 445         """
 446
 447         # Fast path if the value has already been evaluated.
 448         if name in evaluated:
 449             return evaluated[name]
 450
 451         try:
 452             md = definitions[name]
 453         except KeyError:
 454             reporter.error(current.line,
 455                 'reference to undefined identifier {} in definition of {}'
 456                            .format(name, current.name))
 457             return None
 458
 459         if md.name in stack:
 460             # Recursive macro definition.
 461             md = stack[name]
 462             reporter.error(md.line,
 463                 'macro definition {} refers to itself'.format(md.name))
 464             for md1 in reversed(list(stack.values())):
 465                 if md1 is md:
 466                     break
 467                 reporter.note(md1.line,
 468                               'evaluated from {}'.format(md1.name))
 469             return None
 470
 471         stack[md.name] = md
 472         if md.function:
 473             reporter.error(current.line,
 474                 'attempt to evaluate function-like macro {}'.format(name))
 475             reporter.note(md.line, 'definition of {}'.format(md.name))
 476             return None
 477
 478         try:
 479             body = md.body
 480             if len(body) == 0:
 481                 # Empty expansion.
 482                 return None
 483
 484             # Remove surrounding ().
 485             if body[0].text == '(' and body[-1].text == ')':
 486                 body = body[1:-1]
 487                 had_parens = True
 488             else:
 489                 had_parens = False
 490
 491             if len(body) == 1:
 492                 return eval_token(md, body[0])
 493
 494             # Minimal expression evaluator for binary operators.
 495             op = body[1].text
 496             if len(body) == 3 and op in BINARY_OPERATORS:
 497                 if not had_parens:
 498                     reporter.error(body[1].line,
 499                         'missing parentheses around {} expression'.format(op))
 500                     reporter.note(md.line,
 501                                   'in definition of macro {}'.format(md.name))
 502
 503                 left = eval_token(md, body[0])
 504                 right = eval_token(md, body[2])
 505
 506                 if type(left) != type(1):
 507                     reporter.error(left.line,
 508                         'left operand of {} is not an integer'.format(op))
 509                     reporter.note(md.line,
 510                                   'in definition of macro {}'.format(md.name))
 511                 if type(right) != type(1):
 512                     reporter.error(left.line,
 513                         'right operand of {} is not an integer'.format(op))
 514                     reporter.note(md.line,
 515                                   'in definition of macro {}'.format(md.name))
 516                 return BINARY_OPERATORS[op](left, right)
 517
 518             reporter.error(md.line,
 519                 'uninterpretable macro token sequence: {}'.format(
 520                     ' '.join(md.body_lowered)))
 521             return None
 522         finally:
 523             del stack[md.name]
 524
 525     # Start of main body of macro_eval.
 526     for md in definitions.values():
 527         name = md.name
 528         if name not in evaluated and not md.function:
 529             evaluated[name] = eval1(md, name)
 530     return evaluated