1  # Regular expression patterns for C syntax.
       2  #
       3  # None of these patterns has any capturing.  However, a number of them
       4  # have capturing markers compatible with utils.set_capture_groups().
       5  
       6  import textwrap
       7  
       8  
       9  def _ind(text, level=1, edges='both'):
      10      indent = '    ' * level
      11      text = textwrap.indent(text, indent)
      12      if edges == 'pre' or edges == 'both':
      13          text = '\n' + indent + text.lstrip()
      14      if edges == 'post' or edges == 'both':
      15          text = text.rstrip() + '\n' + '    ' * (level - 1)
      16      return text
      17  
      18  
      19  #######################################
      20  # general
      21  
      22  HEX = r'(?: [0-9a-zA-Z] )'
      23  
# Matches either a C character literal or a double-quoted C string literal.
#
# Character literal alternatives, as written below: a single non-quote
# character; a backslash followed by any one character; "\x" plus two HEX
# digits; "\0" plus two digits; and "\o" plus three digits limited to
# 000-255.  A string literal is any run of non-quote, non-backslash
# characters and backslash-escaped characters between double quotes.
#
# The pattern contains insignificant whitespace and inline comments, so
# presumably it must be compiled with re.VERBOSE -- confirm against the
# caller.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')
      52  
# _KEYWORD matches any one of the listed C keywords as a whole word
# (the alternation is wrapped in \b ... \b).
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# KEYWORD embeds the multi-line form of _KEYWORD between "# keyword" and
# "# end keyword" marker comments.  It has to be built here, before
# _KEYWORD is collapsed on the next statement.
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# From here on _KEYWORD has all whitespace removed (presumably to keep the
# single-line identifier patterns that embed it compact).
_KEYWORD = ''.join(_KEYWORD.split())
     104  
     105  IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
     106  # We use a negative lookahead to filter out keywords.
     107  STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
     108  ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
     109  
     110  
     111  #######################################
     112  # types
     113  
# Matches a builtin (non-compound) C type as a whole word: "void", a bare
# "signed"/"unsigned", or an optional signedness, an optional "long"/"short"
# prefix, and one of char/short/int/long/float/double.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')
     133  
# Matches one of the keywords that introduce a compound type
# (struct, union or enum), as a whole word.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
     135  
     136  
     137  #######################################
     138  # variable declarations
     139  
     140  _STORAGE = 'auto register static extern'.split()
     141  STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
     142  TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
     143  PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
     144  
# Matches a type specifier: a SIMPLE_TYPE, a typeof(...) expression around
# a single (optionally */&-prefixed) identifier, a compound type keyword
# with an optional name, or a bare non-keyword identifier (treated as a
# reference to a typedef).
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')
     169  
# Matches a declarator: any number of pointer qualifiers followed by one of
#  * a plain identifier with optional array suffixes,
#  * the same wrapped in parentheses, or
#  * a function pointer: "(" optional "*" name optional-arrays ")" followed
#    by a parenthesized parameter list (one level of nested parens allowed).
#
# The "# <NAME>" comments inside the pattern look like the capturing markers
# mentioned in the module header (for utils.set_capture_groups()).
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')
     209  
# Matches a variable declaration without its initializer or terminator:
# an optional storage class, an optional type qualifier, a TYPE_SPEC and
# a DECLARATOR, in that order.  Per the inline comment it is also used for
# typedefs and function return types.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
         )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
         )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
         )
     )
    # end var decl
    ''')
     239  
# Matches the right-hand side of an initialization: an optional leading
# parenthesized group (no nested parens), followed by one of
#  * one or more adjacent string/character literals,
#  * a "simple" initializer containing no comma, semicolon or open brace
#    outside of literals, or
#  * a brace-enclosed struct/array literal that must be followed by ";"
#    (checked with a lookahead, so the ";" itself is not consumed).
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')
     285  
     286  
     287  #######################################
     288  # compound type declarations
     289  
# Matches one item inside a struct/union body:
#  * the start of an inline compound type declaration (kind, optional name,
#    opening brace),
#  * a member: optional qualifier + TYPE_SPEC, optional DECLARATOR, optional
#    ": <size>" bit-field width, ended by "," or ";", or
#  * the closing brace of the body.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
     )
    ''')
     348  
# Matches one item inside an enum body: either the closing brace, or an
# enumerator name with an optional "= <value>" (a literal or any run of
# characters other than quotes, comma and closing brace), ended by "," or
# by the closing brace.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                , | }}
             )
         )
     )
    ''')
     379  
     380  
     381  #######################################
     382  # statements
     383  
# Matches the text of a statement up to (not including) its terminator:
# any run of characters other than quotes, braces and semicolons,
# interleaved with complete string/character literals.  The trailing
# lookahead line inside the pattern is commented out, so it is not applied.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')
     396  SIMPLE_STMT = textwrap.dedent(rf'''
     397      # simple statement
     398      (?:
     399          (?:  # <SIMPLE_STMT>
     400              # stmt-inline "initializer"
     401              (?:
     402                  return \b
     403                  (?:
     404                      \s*
     405                      {_ind(INITIALIZER, 5)}
     406                  )?
     407               )
     408              |
     409              # variable assignment
     410              (?:
     411                  (?: [*] \s* )?
     412                  (?:
     413                      {STRICT_IDENTIFIER} \s*
     414                      (?: . | -> ) \s*
     415                   )*
     416                  {STRICT_IDENTIFIER}
     417                  (?: \s* \[ \s* \d+ \s* \] )?
     418                  \s* = \s*
     419                  {_ind(INITIALIZER, 4)}
     420               )
     421              |
     422              # catchall return statement
     423              (?:
     424                  return \b
     425                  (?:
     426                      (?:
     427                          [^'";]*
     428                          {_ind(STRING_LITERAL, 6)}
     429                       )*
     430                      \s* [^'";]*
     431                   )?
     432               )
     433              |
     434              # simple statement
     435              (?:
     436                  {_ind(SIMPLE_STMT_BODY, 4)}
     437               )
     438           )
     439          \s*
     440          (?:  # <SIMPLE_ENDING>
     441              ;
     442           )
     443       )
     444      # end simple statement
     445      ''')
# Matches the head of a compound statement (plus trailing whitespace):
#  * a bare "else" or "do",
#  * a label followed by ":" -- "case <expr>", "default", or an identifier,
#  * "for", "while", "if" or "switch" when followed by "(" (the paren is
#    only checked with a lookahead and is not consumed).
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')
     487  
     488  
     489  #######################################
     490  # function bodies
     491  
# Matches one item inside a function body.  The alternatives are tried in
# this order:
#  * an empty statement (";"),
#  * the start of an inline compound type declaration (up to "{"),
#  * a variable declaration with optional initializer, ended by "," or ";",
#  * the head of a compound statement (COMPOUND_STMT),
#  * the start of a block: optional leading text followed by "{",
#  * a simple statement (SIMPLE_STMT),
#  * the end of a block ("}").
#
# The "# <NAME>" / "# </NAME>" comments inside the pattern look like the
# capturing markers mentioned in the module header.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
             )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
         )
     )
    ''')
     579  
# A reduced variant of LOCAL that only singles out "static" variable
# declarations.  The alternatives are:
#  * the start of an inline compound type declaration (up to "{"),
#  * a "static" declaration followed either by "= <initializer>" and one of
#    "," ";" "{", or directly by "," or ";",
#  * everything else: any leading text (literals respected) up to the next
#    "{", "}" or ";".
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                 )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                 )
                |
                (?:  # <STMT_END>
                    ;
                 )
             )
         )
     )
    ''')
     655  
     656  
     657  #######################################
     658  # global declarations
     659  
# Matches one top-level (file scope) item.  The alternatives are tried in
# this order:
#  * an empty statement (";"),
#  * the start of a compound type declaration: optional leading text, the
#    kind keyword, an optional name and the opening "{",
#  * a compound type kind and name followed by text up to one of
#    "=" "," ";" "(" "{" (labelled "bogus inline decl artifact" below),
#  * a typedef, optionally with a parenthesized parameter list, ended by ";",
#  * a function declaration/definition (modern or old-style parameter
#    syntax) or a variable declaration with optional initializer.
#
# The "# <NAME>" comments inside the pattern look like the capturing markers
# mentioned in the module header (for utils.set_capture_groups()).
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                 )
                 \s*
             )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
             )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                 )
                \s*
             )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
     )
    ''')