package Inline::CPP::grammar; use strict; use vars qw($TYPEMAP_KIND $VERSION $class_part $class_decl $star); $VERSION = '0.25'; # Parse::RecDescent 1.90 and later have an incompatible change # 'The key of an %item entry for a repeated subrule now includes # the repetition specifier.' # Hence various hash keys may or may not need trailing '(s?)' depending on # the version of Parse::RecDescent we are using. require Parse::RecDescent; ($class_part, $class_decl, $star) = map {($Parse::RecDescent::VERSION > 1.89) ? "$_(s?)" : $_} qw (class_part class_decl star); #============================================================================ # Regular expressions to match code blocks, numbers, strings, parenthesized # expressions, function calls, and macros. The more complex regexes are only # implemented in 5.6.0 and above, so they're in eval-blocks. # # These are all adapted from the output of Damian Conway's excellent # Regexp::Common module. In future, Inline::CPP may depend directly on it, # but for now I'll just duplicate the code. use vars qw($code_block $string $number $parens $funccall); #============================================================================ eval <<'END'; # $RE{balanced}{-parens=>q|{}()[]"'|} $code_block = qr'(?-xism:(?-xism:(?:[{](?:(?>[^][)(}{]+)|(??{$Inline::CPP::grammar::code_block}))*[}]))|(?-xism:(?-xism:(?:[(](?:(?>[^][)(}{]+)|(??{$Inline::CPP::grammar::code_block}))*[)]))|(?-xism:(?-xism:(?:[[](?:(?>[^][)(}{]+)|(??{$Inline::CPP::grammar::code_block}))*[]]))|(?-xism:(?!)))))'; END $code_block = qr'{[^}]*}' if $@; # For the stragglers: here's a lame regexp. eval <<'END'; # $RE{balanced}{-parens=>q|()"'|} $parens = qr'(?-xism:(?-xism:(?:[(](?:(?>[^)(]+)|(??{$Inline::CPP::grammar::parens}))*[)]))|(?-xism:(?!)))'; END $parens = qr'\([^)]*\)' if $@; # For the stragglers: here's another # $RE{quoted} $string = qr'(?:(?:\")(?:[^\\\"]*(?:\\.[^\\\"]*)*)(?:\")|(?:\')(?:[^\\\']*(?:\\.[^\\\']*)*)(?:\')|(?:\`)(?:[^\\\`]*(?:\\.[^\\\`]*)*)(?:\`))'; # $RE{num}{real}|$RE{num}{real}{-base=>16}|$RE{num}{int} $number = qr'(?:(?i)(?:[+-]?)(?:(?=[0123456789]|[.])(?:[0123456789]*)(?:(?:[.])(?:[0123456789]{0,}))?)(?:(?:[E])(?:(?:[+-]?)(?:[0123456789]+))|))|(?:(?i)(?:[+-]?)(?:(?=[0123456789ABCDEF]|[.])(?:[0123456789ABCDEF]*)(?:(?:[.])(?:[0123456789ABCDEF]{0,}))?)(?:(?:[G])(?:(?:[+-]?)(?:[0123456789ABCDEF]+))|))|(?:(?:[+-]?)(?:\d+))'; $funccall = qr/(?:[_a-zA-Z][_a-zA-Z0-9]*::)*[_a-zA-Z][_a-zA-Z0-9]*(?:$Inline::CPP::grammar::parens)?/; #============================================================================ # Inline::CPP's grammar #============================================================================ sub grammar { <<'END'; { use Data::Dumper; } { sub handle_class_def { my ($thisparser, $def) = @_; # print "Found a class: $def->[0]\n"; my $class = $def->[0]; my @parts; for my $part (@{$def->[1]}) { push @parts, @$_ for @$part } push @{$thisparser->{data}{classes}}, $class unless defined $thisparser->{data}{class}{$class}; $thisparser->{data}{class}{$class} = \@parts; # print "Class $class:\n", Dumper \@parts; Inline::CPP::grammar::typemap($thisparser, $class); [$class, \@parts]; } sub handle_typedef { my ($thisparser, $t) = @_; my ($name, $type) = @{$t}{qw(name type)}; # print "found a typedef: $name => $type\n"; # XXX: this doesn't handle non-class typedefs that we could handle, # e.g. "typedef int my_int_t" if ($thisparser->{data}{class}{$type} && !exists($thisparser->{data}{class}{$name})) { push @{$thisparser->{data}{classes}}, $name; $thisparser->{data}{class}{$name} = $thisparser->{data}{class}{$type}; Inline::CPP::grammar::typemap($thisparser, $name); } $t; } sub handle_enum { my ($thisparser, $t) = @_; $t; } } code: part(s) {1} part: comment | typedef { handle_typedef($thisparser, $item[1]); 1; } | enum { my $t = handle_enum($thisparser, $item[1]); push @{$thisparser->{data}{enums}}, $t; 1; } | class_def { handle_class_def($thisparser, $item[1]); 1; } | function_def { # print "found a function: $item[1]->{name}\n"; my $name = $item[1]->{name}; my $i=0; for my $arg (@{$item[1]->{args}}) { $arg->{name} = 'dummy' . ++$i unless defined $arg->{name}; } Inline::CPP::grammar::strip_ellipsis($thisparser, $item[1]->{args}); push @{$thisparser->{data}{functions}}, $name unless defined $thisparser->{data}{function}{$name}; $thisparser->{data}{function}{$name} = $item[1]; # print Dumper $item[1]; 1; } | all typedef: 'typedef' class IDENTIFIER(?) '{' class_part(s?) '}' IDENTIFIER ';' { my ($class, $parts); $class = $item[3][0] || 'anon_class'.($thisparser->{data}{anonclass}++); ($class, $parts)= handle_class_def($thisparser, [$class, $item{$Inline::CPP::grammar::class_part}]); { thing => 'typedef', name => $item[8], type => $class, body => $parts } } | 'typedef' IDENTIFIER IDENTIFIER ';' { { thing => 'typedef', name => $item[3], type => $item[2] } } | 'typedef' /[^;]*/ ';' { # dprint "Typedef $item{__DIRECTIVE1__} is too heinous\n"; { thing => 'comment'} } enum: 'enum' IDENTIFIER(?) '{' '}' ';' { { thing => 'enum', name => $item{IDENTIFIER}[0], body => $item{__DIRECTIVE1__} } } enum_item: IDENTIFIER '=' /[0-9]+/ { [$item{IDENTIFIER}, $item{__PATTERN1__}] } | IDENTIFIER { [$item{IDENTIFIER}, undef] } class_def: class IDENTIFIER '{' class_part(s?) '}' ';' { [@item{'IDENTIFIER',$Inline::CPP::grammar::class_part}] } | class IDENTIFIER ':' '{' class_part(s?) '}' ';' { push @{$item{$Inline::CPP::grammar::class_part}}, [$item{__DIRECTIVE2__}]; [@item{'IDENTIFIER',$Inline::CPP::grammar::class_part}] } inherit: scope IDENTIFIER { {thing => 'inherits', name => $item[2], scope => $item[1]} } class_part: comment { [ {thing => 'comment'} ] } | scope ':' class_decl(s?) { for my $part (@{$item{$Inline::CPP::grammar::class_decl}}) { $_->{scope} = $item[1] for @$part; } $item{$Inline::CPP::grammar::class_decl} } | class_decl(s) { for my $part (@{$item[1]}) { $_->{scope} = $thisparser->{data}{defaultscope} for @$part; } $item[1] } class_decl: comment { [{thing => 'comment'}] } | typedef { [ handle_typedef($thisparser, $item[1]) ] } | enum { [ handle_enum($thisparser, $item[1]) ] } | class_def { my ($class, $parts) = handle_class_def($thisparser, $item[1]); [{ thing => 'class', name => $class, body => $parts }]; } | method_def { $item[1]->{thing} = 'method'; # print "class_decl found a method: $item[1]->{name}\n"; my $i=0; for my $arg (@{$item[1]->{args}}) { $arg->{name} = 'dummy' . ++$i unless defined $arg->{name}; } Inline::CPP::grammar::strip_ellipsis($thisparser, $item[1]->{args}); [$item[1]]; } | member_def { # print "class_decl found one or more members:\n", Dumper(\@item); $_->{thing} = 'member' for @{$item[1]}; $item[1]; } function_def: operator ';' { $item[1] } | operator smod(?) code_block { $item[1] } | IDENTIFIER '(' (s?) ')' smod(?) code_block { {name => $item{IDENTIFIER}, args => $item{__DIRECTIVE2__}, rtype => '' } } | rtype IDENTIFIER '(' (s?) ')' ';' { {rtype => $item[1], name => $item[2], args => $item{__DIRECTIVE1__} } } | rtype IDENTIFIER '(' (s?) ')' smod(?) code_block { {rtype => $item{rtype}, name => $item[2], args => $item{__DIRECTIVE1__} } } method_def: operator method_imp { # print "method operator:\n", Dumper $item[1]; $item[1]; } | IDENTIFIER '(' (s?) ')' method_imp { # print "con-/de-structor found: $item[1]\n"; {name => $item[1], args => $item{__DIRECTIVE2__}, abstract => ${$item{method_imp}} }; } | rtype IDENTIFIER '(' (s?) ')' method_imp { # print "method found: $item[2]\n"; $return = {name => $item[2], rtype => $item[1], args => $item[4], abstract => ${$item[6]}, rconst => $thisparser->{data}{smod}{const}, }; $thisparser->{data}{smod}{const} = 0; } operator: rtype(?) 'operator' /\(\)|[^()]+/ '(' (s?) ')' { # print "Found operator: $item[1][0] operator $item[3]\n"; {name=> "operator $item[3]", args => $item[5], ret => $item[1][0]} } # By adding smod, we allow 'const' member functions. This would also bind to # incorrect C++ with the word 'static' after the argument list, but we don't # care at all because such code would never be compiled successfully. # By adding init, we allow constructors to initialize references. Again, we'll # allow them anywhere, but our goal is not to enforce c++ standards -- that's # the compiler's job. method_imp: smod(?) ';' { \0 } | smod(?) '=' '0' ';' { \1 } | smod(?) initlist(?) code_block { \0 } | smod(?) '=' '0' code_block { \0 } initlist: ':' member_def: anytype ';' { my @retval; for my $def (@{$item[2]}) { my $type = join '', $item[1], @{$def->[0]}; my $name = $def->[1]; # print "member found: type=$type, name=$name\n"; push @retval, { name => $name, type => $type }; } \@retval; } var: star(s?) IDENTIFIER '=' expr { [@item[1,2]] } | star(s?) IDENTIFIER { [@item[1,2]] } arg: type IDENTIFIER '=' expr { # print "argument $item{IDENTIFIER} found\n"; # print "expression: $item{expr}\n"; {type => $item[1], name => $item{IDENTIFIER}, optional => 1, offset => $thisoffset} } | type IDENTIFIER { # print "argument $item{IDENTIFIER} found\n"; {type => $item[1], name => $item{IDENTIFIER}, offset => $thisoffset} } | type { {type => $item[1]} } | '...' { {name => '...', type => '...', offset => $thisoffset} } ident_part: /[~_a-z]\w*/i '<' (s?) '>' { $item[1].'<'.join('', @{$item[4]}).'>' } | /[~_a-z]\w*/i { $item[1] } IDENTIFIER: { my $x = join '::', @{$item[1]}; # print "IDENTIFIER: $x\n"; $x } # Parse::RecDescent is retarded in this one case: if a subrule fails, it # gives up the entire rule. This is a stupid way to get around that. rtype: rtype2 | rtype1 rtype1: TYPE star(s?) { $return = $item[1]; $return .= join '',' ',@{$item[2]} if @{$item[2]}; # print "rtype1: $return\n"; # return undef # unless(defined$thisparser->{data}{typeconv}{valid_rtypes}{$return}); } rtype2: modifier(s) TYPE star(s?) { $return = $item[2]; $return = join ' ',grep{$_}@{$item[1]},$return if @{$item[1]}; $return .= join '',' ',@{$item[3]} if @{$item[3]}; # print "rtype2: $return\n"; # return undef # unless(defined$thisparser->{data}{typeconv}{valid_rtypes}{$return}); $return = 'static ' . $return if $thisparser->{data}{smod}{static}; $thisparser->{data}{smod}{static} = 0; } type: type2 | type1 type1: TYPE star(s?) { $return = $item[1]; $return .= join '',' ',@{$item{$Inline::CPP::grammar::star}} if @{$item{$Inline::CPP::grammar::star}}; # print "type1: $return\n"; # return undef # unless(defined$thisparser->{data}{typeconv}{valid_types}{$return}); } type2: modifier(s) TYPE star(s?) { $return = $item{TYPE}; $return = join ' ',grep{$_}@{$item[1]},$return if @{$item[1]}; $return .= join '',' ',@{$item{$Inline::CPP::grammar::star}} if @{$item{$Inline::CPP::grammar::star}}; # print "type2: $return\n"; # return undef # unless(defined$thisparser->{data}{typeconv}{valid_types}{$return}); } anytype: anytype2 | anytype1 anytype1: TYPE star(s?) { $return = $item[1]; $return .= join '',' ',@{$item[2]} if @{$item[2]}; } anytype2: modifier(s) TYPE star(s?) { $return = $item[2]; $return = join ' ',grep{$_}@{$item[1]},$return if @{$item[1]}; $return .= join '',' ',@{$item[3]} if @{$item[3]}; } comment: m{\s* // [^\n]* \n }x | m{\s* /\* (?:[^*]+|\*(?!/))* \*/ ([ \t]*)? }x # long and short aren't recognized as modifiers because they break when used # as regular types. Another Parse::RecDescent problem is greedy matching; I # need tmodifier to "give back" long or short in cases where keeping them would # cause the modifier rule to fail. One side-effect is 'long long' can never # be parsed correctly here. modifier: tmod | smod { ++$thisparser->{data}{smod}{$item[1]}; ''} | nmod { '' } tmod: 'unsigned' # | 'long' | 'short' smod: 'const' | 'static' nmod: 'extern' | 'virtual' | 'mutable' | 'volatile' | 'inline' scope: 'public' | 'private' | 'protected' class: 'class' { $thisparser->{data}{defaultscope} = 'private'; $item[1] } | 'struct' { $thisparser->{data}{defaultscope} = 'public'; $item[1] } star: '*' | '&' code_block: /$Inline::CPP::grammar::code_block/ # Consume expressions expr: { my $o = join '', @{$item[1]}; # print "expr: $o\n"; $o; } subexpr: /$Inline::CPP::grammar::funccall/ # Matches a macro, too | /$Inline::CPP::grammar::string/ | /$Inline::CPP::grammar::number/ | UOP subexpr OP: '+' | '-' | '*' | '/' | '^' | '&' | '|' | '%' | '||' | '&&' UOP: '~' | '!' | '-' | '*' | '&' TYPE: IDENTIFIER all: /.*/ END } #============================================================================ # Generate typemap code for the classes and structs we bind to. This allows # functions declared after a class to return or accept class objects as # parameters. #============================================================================ $TYPEMAP_KIND = 'O_Inline_CPP_Class'; sub typemap { my $parser = shift; my $typename = shift; # print "Inline::CPP::grammar::typemap(): typename=$typename\n"; my ($TYPEMAP, $INPUT, $OUTPUT); $TYPEMAP = "$typename *\t\t$TYPEMAP_KIND\n"; $INPUT = <{data}{typeconv}{input_expr}{$TYPEMAP_KIND} ||= $INPUT; $parser->{data}{typeconv}{output_expr}{$TYPEMAP_KIND} ||= $OUTPUT; $parser->{data}{typeconv}{type_kind}{$ctypename} = $TYPEMAP_KIND; $parser->{data}{typeconv}{valid_types}{$ctypename}++; $parser->{data}{typeconv}{valid_rtypes}{$ctypename}++; } #============================================================================ # Default action is to strip ellipses from the C++ code. This allows having # _only_ a '...' in the code, just like XS. It is the default. #============================================================================ sub strip_ellipsis { my $parser = shift; my $args = shift; return if $parser->{ILSM}{PRESERVE_ELLIPSIS}; for (my $i=0; $i<@$args; $i++) { next unless $args->[$i]{name} eq '...'; # if it's the first one, just strip it if ($i==0) { substr($parser->{ILSM}{code}, $args->[$i]{offset} - 3, 3) = " "; } else { my $prev = $i - 1; my $prev_offset = $args->[$prev]{offset}; my $length = $args->[$i]{offset} - $prev_offset; substr($parser->{ILSM}{code}, $prev_offset, $length) =~ s/\S/ /g; } } }