#!/usr/bin/perl
#===============================================================================
#
#         FILE:  csourceparser.pl
#
#        USAGE:  ./csourceparser.pl [Option] ... File ...
#
#  DESCRIPTION:  Parse and extract specified elements from source-code
#                written in the C language
#
#      OPTIONS:  see the POD at the end of this file (or run with --help)
# REQUIREMENTS:  Perl Version >= 5.8.0, Parse::RecDescent, Getopt::Long, Pod::Usage
#         BUGS:  ---
#        NOTES:  The input runs through three stages:
#                  1. comments are replaced by blanks      (decomment grammar)
#                  2. preprocessor directives are removed  (line filter)
#                  3. the remaining text is parsed as C    (C grammar)
#       AUTHOR:  Hendrik Sirges
#      COMPANY:  FH-SWF
#      VERSION:  0.1.0
#      CREATED:  07/10/05 12:34:53 CEST
#     REVISION:  ---
#===============================================================================

require 5.008000;
use strict;
use warnings;
use Parse::RecDescent;
use Getopt::Long;
use Pod::Usage;

# use Data::Dumper 'Dumper';

#$::RD_HINT     = 1; # Print hints on errors
#$::RD_ERRORS   = 1; # Print errors
#$::RD_WARN     = 1;
#$::RD_TRACE    = 1; # Print tracecode to STDERR
#$::RD_AUTOSTUB = 1;

###############################################################
# Grammar used to find and remove comments from C source code #
###############################################################
#
# The parser accumulates its output in $thisparser->{code}: ordinary code and
# string literals are copied through, every comment becomes a single blank.
my $decomment_grammar = <<'END_OF_DECOMMENT';

# <skip:''> switches off the automatic skipping of whitespace in front of
# every terminal. Without it the whitespace (including newlines) between two
# parts is dropped, which joins the line after a comment onto the line before.
program : <skip:''>
          { $thisparser->{code} = ''; 1 }
          part(s)
          { $thisparser->{code} }

part    : comment
             { $thisparser->{code} .= " "; }
        | C_code
             { $thisparser->{code} .= $item[1]; }
        | string
             { $thisparser->{code} .= qq("$item[1]"); }

C_code  : m{(
              [^"/]+        # one or more non-delimiters
              (             # then (optionally)...
               /            # a potential comment delimiter
               [^*/]        # which is not an actual delimiter
              )?            #
            )+              # all repeated once or more
           }x

string  : m{"               # a leading delimiter
            ((              # zero or more...
              \\.           # escaped anything
              |             # or
              [^"]          # anything but a delimiter
             )*
            )
            "}x
            { $return = $1 }

# A line comment stops in front of its newline. The newline stays in the code
# so that a preprocessor directive on the next line still starts its own line.
comment : m{\s*             # optional whitespace
            //              # comment delimiter
            [^\n]*          # anything except a newline
           }x
        | m{\s*                 # optional whitespace
            /\*                 # comment opener
            (?:[^*]+|\*(?!/))*  # anything except the comment closer
            \*/                 # comment closer
            ([ \t]*)?           # trailing blanks or tabs
           }x

END_OF_DECOMMENT

######################################################################################
# Grammar used to parse C source code (without comments and preprocessor directives) #
######################################################################################
#
# Productions without an explicit action get the auto-action configured by the
# driver, i.e. they return an array reference holding all matched items. The
# actions below hand these nested lists to ::flatten_list() and append the
# result to the $::..._output variables of the main program.
my $Cgrammar = <<'END_OF_C_GRAMMAR';

# NOTE(review): the trailing empty alternative is kept as found. It presumably
# lets translation_unit match even if nothing could be parsed, so the check
# after the translation_unit call in the driver may never fire. Confirm
# whether an error directive was intended here.
translation_unit:
      external_declaration(s)
    |

# The last alternative reports the line and then consumes it up to the next
# newline. Without consuming text the repetition in translation_unit would end
# at the first construct that cannot be parsed.
external_declaration:
      function_definition
    | declaration
    | {
          if ($::opt_SKIPPEDLINES || (defined $::opt_VERBOSE and $::opt_VERBOSE >= 1 )) {
              print "Skipping line $thisline\n";
          }
          1;
      }
      <resync>

function_definition:
      declaration_specifiers(?) declarator declaration_list(?) compound_statement
      {
          if($::opt_FUNCTIONS) {
              $::functions_output .= ::flatten_list($item[1]);
              $::functions_output .= ::flatten_list($item[2]);
              $::functions_output .= ::flatten_list($item[3]) . ";\n";
          }
      }

declaration:
      declaration_specifiers init_declarator_list(?) ';'
      {
          if($::opt_DECLARATIONS) {
              $::declarations_output .= ::flatten_list($item[1]);
              $::declarations_output .= ::flatten_list($item[2]);
              $::declarations_output .= ::flatten_list($item[3]) . "\n";
          }
      }

declaration_list:
      declaration(s)

declaration_specifiers:
      type_qualifier declaration_specifiers(?)
    | storage_class_specifier declaration_specifiers(?)
    | type_specifier declaration_specifiers(?)

storage_class_specifier:
      'auto' | 'register' | 'static' | 'extern' | 'typedef'

type_specifier:
      'int' | 'double' | 'void' | 'char' | 'long' | 'float' | 'signed' | 'unsigned' | 'short'
    | struct_or_union_specifier
    | enum_specifier
    | typedef_name ...typedef_name_lookahead { [$item[1] ] }

typedef_name_lookahead:
      declarator
#    | pointer
#    | ',' ...parameter_type_list
#    | ')'

type_qualifier:
      'const' | 'volatile'

# The action returns the matched items themselves. Otherwise the enclosing
# declaration would receive the value of the if statement instead of the
# tokens of the struct.
struct_or_union_specifier:
      struct_or_union IDENTIFIER(?) '{' struct_declaration_list(?) '}'
      {
          if($::opt_STRUCTS){
              $::structs_output .= ::flatten_list($item[1]) . " ";
              $::structs_output .= ::flatten_list($item[2]);
              $::structs_output .= ::flatten_list($item[3]) . "\n";
              $::structs_output .= ::flatten_list_beautified($item[4]);
              $::structs_output .= ::flatten_list($item[5]) . ";\n\n";
          }
          $return = [ @item[ 1 .. 5 ] ];
      }
    | struct_or_union IDENTIFIER

struct_or_union:
      'struct' | 'union'

struct_declaration_list:
      struct_declaration(s)

init_declarator_list:
      init_declarator(s /(,)/)

init_declarator:
      declarator '=' initializer
    | declarator

struct_declaration:
      specifier_qualifier_list struct_declarator_list ';'

specifier_qualifier_list:
      type_specifier specifier_qualifier_list(?)
    | type_qualifier specifier_qualifier_list(?)

struct_declarator_list:
      struct_declarator(s /(,)/)

struct_declarator:
      declarator(?) ':' constant_expression
    | declarator

# Returns its items for the same reason as struct_or_union_specifier.
enum_specifier:
      'enum' IDENTIFIER(?) '{' enumerator_list '}'
      {
          if($::opt_STRUCTS){
              $::structs_output .= ::flatten_list($item[1]) . " ";
              $::structs_output .= ::flatten_list($item[2]);
              $::structs_output .= ::flatten_list($item[3]) . "\n";
              $::structs_output .= ::flatten_list_beautified($item[4]);
              $::structs_output .= ::flatten_list($item[5]) . ";\n\n";
          }
          $return = [ @item[ 1 .. 5 ] ];
      }
    | 'enum' IDENTIFIER

enumerator_list:
      enumerator(s /(,)/)

enumerator:
      IDENTIFIER ('=' constant_expression)(?)

declarator:
      pointer(?) direct_declarator

function_signature:
      '[' constant_expression(?) ']'
    | '(' parameter_type_list ')'
    | '(' identifier_list(?) ')'

direct_declarator:
      IDENTIFIER function_signature(s?)
    | '(' declarator ')' function_signature(s?)

pointer:
      '*' type_qualifier_list(?) pointer(?)

type_qualifier_list:
      type_qualifier(s)

parameter_type_list:
      parameter_list (',' '...')(?)

parameter_list:
      parameter_declaration(s /(,)/)

parameter_declaration:
      declaration_specifiers declarator
    | declaration_specifiers abstract_declarator(?)

identifier_list:
      IDENTIFIER(s /(,)/)

initializer:
      assignment_expression
    | '{' initializer_list (',')(?) '}'

initializer_list:
      initializer(s /(,)/)

type_name:
      specifier_qualifier_list abstract_declarator(?)

abstract_declarator:
      pointer(?) direct_abstract_declarator
    | pointer

abstract_type:
      '[' constant_expression(?) ']'
    | '(' parameter_type_list(?) ')'

direct_abstract_declarator:
      '(' abstract_declarator ')' abstract_type(s?)
    | abstract_type(s)

typedef_name:
      IDENTIFIER

statement:
      selection_statement
    | expression_statement
    | iteration_statement
    | compound_statement
    | jump_statement
    | labeled_statement

labeled_statement:
      'case' constant_expression ':' statement
    | IDENTIFIER ':' statement
    | 'default' ':' statement

expression_statement:
      expression(?) ';'

compound_statement:
      '{' declaration_list(?) statement_list(?) '}'

statement_list:
      statement(s)

selection_statement:
      'if' '(' expression ')' statement ('else' statement)(?)
    | 'switch' '(' expression ')' statement

iteration_statement:
      'for' '(' expression(?) ';' expression(?) ';' expression(?) ')' statement
    | 'while' '(' expression ')' statement
    | 'do' statement 'while' '(' expression ')'

jump_statement:
      'return' expression(?) ';'
    | 'break' ';'
    | 'continue' ';'
    | 'goto' IDENTIFIER ';'

expression:
      assignment_expression(s /(,)/)

assignment_expression:
      unary_expression ASSIGNMENT_OPERATOR assignment_expression
    | conditional_expression

conditional_expression:
      logical_OR_expression ('?' expression ':' conditional_expression)(?)

constant_expression:
      conditional_expression

logical_OR_expression:
      logical_AND_expression(s /(\|\|)/)

logical_AND_expression:
      inclusive_OR_expression(s /(&&)/)

inclusive_OR_expression:
      exclusive_OR_expression(s /(\|)/)

exclusive_OR_expression:
      AND_expression(s /(\^)/)

AND_expression:
      equality_expression(s /(&)/)

equality_expression:
      relational_expression(s /(==|!=)/)

relational_expression:
      shift_expression(s /(<=|>=|<|>)/)

shift_expression:
      additive_expression(s /(<<|>>)/)

additive_expression:
      multiplicative_expression(s /(\+|-)/)

multiplicative_expression:
      cast_expression(s /(\*|\/|%)/)

cast_expression:
      unary_expression
    | '(' type_name ')' cast_expression

unary_expression:
      postfix_expression
    | '++' unary_expression
    | '--' unary_expression
    | 'sizeof' '(' type_name ')'
    | UNARY_OPERATOR cast_expression
    | 'sizeof' unary_expression

postfix_expression:
      primary_expression postfix_expression_token(s?)

postfix_expression_token:
      '[' expression ']'
    | '(' argument_expression_list(?) ')'
    | '.' IDENTIFIER
    | '->' IDENTIFIER
    | '++'
    | '--'

primary_expression:
      IDENTIFIER
    | constant
    | STRING
    | '(' expression ')'

argument_expression_list:
      assignment_expression(s /(,)/)

constant:
      CHARACTER_CONSTANT
    | FLOATING_CONSTANT
    | INTEGER_CONSTANT
    | ENUMERATION_CONSTANT

### TERMINALS

INTEGER_CONSTANT:
    /(?:0[xX][\da-fA-F]+                 # Hexadecimal
       |0[0-7]*                          # Octal or Zero
       |[1-9]\d*                         # Decimal
     )
     (?:[uU][lL]{0,2}|[lL]{1,2}[uU]?)?   # Suffix, accepted after any base
    /x

CHARACTER_CONSTANT:
    /'(?:[^\\']                 # any character except backslash and single quote
       |\\['"?\\ntvbrfa]        # or a simple escape sequence
       |\\[0-7]{1,3}            # or an octal escape
       |\\x[\da-fA-F]+          # or a hexadecimal escape
      )'
    /x

FLOATING_CONSTANT:
    /(?:\d+|(?=\.\d+))          # No leading digits only if '.moreDigits' follows
     (?:\.|(?=[eE]))            # There may be no floating point only if an exponent is present
     \d*                        # Zero or more floating digits
     ([eE][+-]?\d+)?            # exponent
     [lLfF]?                    # Suffix
    /x

ENUMERATION_CONSTANT:
      INTEGER_CONSTANT

STRING:
    /"(?:[^\\"]                 # any character except backslash and double quote
       |\\['"?\\ntvbrfa]        # or a simple escape sequence
       |\\[0-7]{1,3}            # or an octal escape
       |\\x[\da-fA-F]+          # or a hexadecimal escape
      )*"
    /x

IDENTIFIER:
    /(?!(?:auto|break|case|char|const|continue|default|do|double|else|enum|extern|float|for|goto
         |if|int|long|register|return|signed|sizeof|short|static|struct|switch|typedef
         |union|unsigned|void|volatile|while)\b)   # reject reserved words, but only whole words
     [a-zA-Z_]\w*                                  # letter or underscore, then word characters
    /x

ASSIGNMENT_OPERATOR:
      '=' | '*=' | '/=' | '%=' | '+=' | '-=' | '<<=' | '>>=' | '&=' | '^=' | '|='

UNARY_OPERATOR:
      '&' | '*' | '+' | '-' | '~' | '!'

END_OF_C_GRAMMAR

#===  FUNCTION  ================================================================
#         NAME:  flatten_list
#  DESCRIPTION:  Joins the values of a (possibly nested) list with single
#                blanks and reduces every run of whitespace to one blank.
#                Empty sub-lists contribute an empty string, so they still
#                produce a separating blank; the grammar actions rely on this
#                to keep tokens apart when they concatenate results.
#   PARAMETERS:  list of scalars and/or array references
#      RETURNS:  the flattened string
#===============================================================================
sub flatten_list {
    my $tokens = join ' ',
        map { ref $_ ? flatten_list(@$_) : $_ }
        grep { defined } @_;    # undefined items would only trigger warnings
    $tokens =~ s/\s+/ /g;
    return $tokens;
}

#===  FUNCTION  ================================================================
#         NAME:  flatten_list_beautified
#  DESCRIPTION:  Like flatten_list, but inserts a newline after each semicolon
#                and indents every line with a tab
#   PARAMETERS:  list of scalars and/or array references
#      RETURNS:  the flattened, indented string
#===============================================================================
sub flatten_list_beautified {
    my $tokens = flatten_list(@_);
    $tokens =~ s/;/;\n/g;
    $tokens =~ s/^\s*/\t/mg;
    return $tokens;
}

#===  FUNCTION  ================================================================
#         NAME:  precompiled_module_file
#  DESCRIPTION:  Name of the file in the current working directory that holds
#                the precompiled parser of a class (last part of the class
#                name plus ".pm")
#  PARAMETER 1:  class name of the precompiled parser
#      RETURNS:  path starting with "./"
#===============================================================================
sub precompiled_module_file {
    my ($class) = @_;
    ( my $basename = $class ) =~ s/.*:://;
    return "./$basename.pm";
}

#===  FUNCTION  ================================================================
#         NAME:  load_parser
#  DESCRIPTION:  Returns a parser object. A precompiled parser found in the
#                current working directory is preferred; otherwise the parser
#                is generated from the grammar. Dies if no parser is created.
#  PARAMETER 1:  grammar text
#  PARAMETER 2:  class name of the precompiled parser
#  PARAMETER 3:  grammar name used in the error message
#      RETURNS:  parser object
#===============================================================================
sub load_parser {
    my ( $grammar, $class, $label ) = @_;

    my $module_file = precompiled_module_file($class);
    my $parser;
    if ( -e $module_file ) {

        # Load by explicit path: "." is not part of @INC in current perls.
        require $module_file;
        $parser = $class->new;
    }
    else {
        $parser = Parse::RecDescent->new($grammar);
    }
    defined $parser or die "Malformed $label grammar!\n";
    return $parser;
}

#===  FUNCTION  ================================================================
#         NAME:  strip_preprocessor_directives
#  DESCRIPTION:  Replaces every line holding a preprocessor directive by a
#                single blank. Directives are removed, not evaluated. A line
#                following a directive line that ends with a backslash is
#                removed as well.
#  PARAMETER 1:  C source code without comments
#      RETURNS:  source code without preprocessor directives
#===============================================================================
sub strip_preprocessor_directives {
    my ($source) = @_;

    my $directive = qr{
        \A \s*      # a directive is the first thing on its line
        \# \s*      # preprocessor opener, optional whitespace
        (?:define|include|undef|ifdef|ifndef|if|endif|else|elif|line|error|pragma)
        \b          # whole keyword; also matches "#include<x.h>" or a bare "#endif"
    }x;

    my $stripped        = '';
    my $in_continuation = 0;
    for my $line ( split /^/m, $source ) {
        if ( $in_continuation || $line =~ $directive ) {

            # A trailing backslash continues the directive on the next line.
            $in_continuation = $line =~ /\\\n\z/;
            $stripped .= " ";
        }
        else {
            $stripped .= $line;
        }
    }
    return $stripped;
}

#--------------------------------------------------------------------------#
#  Parsing variables                                                       #
#--------------------------------------------------------------------------#
my $decommentParser;          # Parser using decomment grammar
my $Cparser;                  # Parser using C grammar
my $C_source;                 # C source code
my $decommented_C_source;     # C source code without comments
my $preprocessed_C_source;    # C source code without preprocessor directives
                              # preprocessor directives are just removed, not evaluated

# Class names of the precompiled parsers. --precompile and load_parser must
# agree on them, otherwise the generated modules are never found.
my $DECOMMENT_PARSER_CLASS = 'CSourceParser::DecommentGrammar';
my $C_PARSER_CLASS         = 'CSourceParser::Cgrammar';

# Default action of the C grammar: return all items of a production as a list
my $C_AUTOACTION = q { [ @item[1..$#item] ] };

#--------------------------------------------------------------------------#
#  Command line options                                                    #
#--------------------------------------------------------------------------#
# These are package variables because the grammar actions read them as $::opt_...
our $opt_HELP         = '';
our $opt_SKIPPEDLINES = '';
our $opt_ERRORS       = '';
our $opt_TRACE        = '';
our $opt_CODE         = '';
our $opt_VERBOSE      = 0;
our $opt_PRECOMPILE   = '';
our $opt_FUNCTIONS    = '';
our $opt_DECLARATIONS = '';
our $opt_STRUCTS      = '';

Getopt::Long::Configure("bundling");    # Enables option bundling

GetOptions(                             # Parse command line options
    'help|h'         => \$opt_HELP,            # --help          -h
    'skippedlines|s' => \$opt_SKIPPEDLINES,    # --skippedlines  -s
    'errors|e'       => \$opt_ERRORS,          # --errors        -e
    'trace|t'        => \$opt_TRACE,           # --trace         -t
    'code|c'         => \$opt_CODE,            # --code          -c
    'verbose|v+'     => \$opt_VERBOSE,         # --verbose       -v
    'functions|f'    => \$opt_FUNCTIONS,       # --functions     -f
    'declarations|d' => \$opt_DECLARATIONS,    # --declarations  -d
    'precompile|p'   => \$opt_PRECOMPILE,      # --precompile    -p
    'structs|u'      => \$opt_STRUCTS          # --structs       -u
) or pod2usage( -verbose => 0 );        # unknown option: show usage and exit

# variables for parser output (appended to by the grammar actions)
our $functions_output    = '';
our $declarations_output = '';
our $structs_output      = '';

$opt_HELP and pod2usage( -verbose => 2 );

# Set error reporting
if ($opt_ERRORS) {
    $::RD_HINT   = 1;    # Print hints on errors
    $::RD_ERRORS = 1;    # Print errors
    open( Parse::RecDescent::ERROR, '>', 'errfile' )
      or die "Can't open errfile: $!";
}

# Die if no input files present
@ARGV or pod2usage( -message => "Error: More arguments required.", -verbose => 0 );

{
    # In slurp mode <> returns one element per file in list context, so all
    # files named on the command line are read, not only the first one.
    local $/;
    $C_source = join "\n", <>;
}

# Set trace level
if ( $opt_TRACE || ( $opt_VERBOSE >= 3 ) ) {
    $::RD_TRACE = 1;
}

# Generate precompiled parser modules
if ($opt_PRECOMPILE) {
    print("\nCreating precompiled parsers... \n");
    Parse::RecDescent->Precompile( $decomment_grammar, $DECOMMENT_PARSER_CLASS );
    {
        # The auto-action is added while the grammar is compiled, so it has to
        # be in effect here as well - and only for the C grammar.
        local $::RD_AUTOACTION = $C_AUTOACTION;
        Parse::RecDescent->Precompile( $Cgrammar, $C_PARSER_CLASS );
    }
    print("Done\n\n");
}

#--------------------------------------------------------------------------#
#  Parse Level 1 (removes comments)                                        #
#--------------------------------------------------------------------------#
$decommentParser =
  load_parser( $decomment_grammar, $DECOMMENT_PARSER_CLASS, 'Decomment' );

defined( $decommented_C_source = $decommentParser->program($C_source) )
  or die "Malformed C code found at parse level 1!\n";

#--------------------------------------------------------------------------#
#  Parse Level 2 (Removes preprocessor directives)                         #
#--------------------------------------------------------------------------#
$preprocessed_C_source = strip_preprocessor_directives($decommented_C_source);

if ( $opt_CODE || ( $opt_VERBOSE >= 1 ) ) {

    # print sourcecode with linenumbers
    my $line_number = 1;
    for my $line ( split( /\n/, $preprocessed_C_source ) ) {
        print "$line_number\t$line\n";
        $line_number++;
    }
}

if ( $opt_VERBOSE >= 2 ) {
    $::RD_TRACE = 1;
}

#--------------------------------------------------------------------------#
#  Parse Level 3 (parses C code)                                           #
#--------------------------------------------------------------------------#
$::RD_AUTOACTION = $C_AUTOACTION;    # set default auto-action for grammar rules

$Cparser = load_parser( $Cgrammar, $C_PARSER_CLASS, 'C' );

defined( $Cparser->translation_unit($preprocessed_C_source) )
  or die "Malformed C code found at parse level 3!\n";

print "\nDefined Functions:\n\n$functions_output\n\n"
  if defined $functions_output and $opt_FUNCTIONS;

print "\nDeclarations:\n\n$declarations_output\n\n"
  if defined $declarations_output and $opt_DECLARATIONS;

print "\nStructures:\n\n$structs_output\n\n"
  if defined $structs_output and $opt_STRUCTS;

__END__

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#  Application Documentation
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

=head1 NAME

csourceparser.pl - extract components from sourcecode written in the C programming language

=head1 VERSION

This documentation refers to csourceparser.pl version 0.1.0

=head1 SYNOPSIS

B<./csourceparser.pl [OPTION] ...
FILE ...>

B<Examples:>

Print signatures of functions defined in a C-file:

    ./csourceparser.pl -f myprog.c

Print all declarations in a C-file:

    ./csourceparser.pl -d myprog.c

Print all structures in a C-file:

    ./csourceparser.pl -u myheader.h

=head1 REQUIRED ARGUMENTS

One or more C-Sourcefiles to parse.

=head1 OPTIONS

=over 4

=item B<-c, --code>

Show parsed source code with line numbers

=item B<-d, --declarations>

Prints (global) declarations in the sourcefile to stdout

=item B<-e, --errors>

Print error messages generated by Parse::RecDescent to the file 'errfile'

=item B<-f, --functions>

Prints the signatures of functions defined in the source file to stdout
(the function bodies are omitted)

=item B<-h, --help>

Print this help

=item B<-p, --precompile>

Generate precompiled parsers Cgrammar.pm and DecommentGrammar.pm in the
current working directory. Precompiled parsers will speed up parsing.
If these files are available in the current working directory they will
be used automatically. Every time the --precompile option is set the
precompiled parsers are generated anew, so this option should be used
only once. Also, don't forget to recreate the precompiled parsers if you
modify the grammar.

=item B<-s, --skippedlines>

Show which lines had been skipped during parse due to parser errors
or unrecognized tokens in the C source code

=item B<-t, --trace>

Print full tracecode generated by Parse::RecDescent.
Note: this can be B<very> verbose.

=item B<-u, --structs>

Print all structs and unions defined in the C sourcefile

=item B<-v, --verbose>

Each use increases the verbosity level by one.

=over 4

=item Level 1:

Print parsed sourcecode and skipped lines (same as I<-sc>)

=item Level 2:

Print tracecode of parse level 3 (C code without preprocessor directives
and comments)

=item Level 3:

Print full tracecode.

=back

=back

=head1 DIAGNOSTICS

If you don't get the output you expect, try setting the -sc options to see
what happens.
When the parser can't handle a piece of the input, that input is silently
skipped (for empty lines this is normal behaviour).

If the parser doesn't behave as you expect, take a look at the tracecode, e.g.

    ./csourceparser.pl -t file.c 2> trace

for full tracecode or

    ./csourceparser.pl -vv file.c 2> trace

to see only the trace code of parse level 3 (C-parser).
Depending on the size of the input file(s) this could take some time and may
occupy some hd-space.

Please refer to the Parse::RecDescent documentation if you get errors after
modifying the grammar.

=head1 DEPENDENCIES

=over 4

=item * Perl >= 5.8.0

=item * Parse::RecDescent

=item * Getopt::Long

=item * Pod::Usage

=back

=head1 BUGS AND LIMITATIONS

Macros in C-Files cannot be parsed at this time. Any declarations and
definitions containing macros may cause errors. Perhaps this feature could be
implemented in the future by using a "real" preprocessor like m4.

Please report problems to Hendrik Sirges (hendrik.sirges at fh-swf.de)

Patches are welcome.

=head1 AUTHOR

Hendrik Sirges

=head1 LICENCE AND COPYRIGHT

This program is Copyright 2005 by Hendrik Sirges.

This program is free software; you can redistribute it and/or modify it under
the terms of the Perl Artistic License or the GNU General Public License as
published by the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

If you do not have a copy of the GNU General Public License write to
the Free Software Foundation, Inc., 675 Mass Ave, Cambridge,
MA 02139, USA.

=cut