#!/usr/bin/perl # $Id: genxref,v 1.7 2006-12-07 04:59:38 reed%reedloden.com Exp $ # genxref.pl -- Finds identifiers in a set of C files using an # extremely fuzzy algorithm. It sort of works. # # Arne Georg Gleditsch # Per Kristian Gjermshus # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # TODO: ns/cmd/xfe/src/MozillaApp.h, XFE_MozillaApp ###################################################################### use lib 'lib/'; use integer; use DB_File; use strict; my %fileidx; my %itype = (('macro', 'M'), ('typedef', 'T'), ('struct', 'S'), ('enum', 'E'), ('union', 'U'), ('function', 'F'), ('funcprot', 'f'), ('class', 'C'), # (C++) ('classforw', 'c'), # (C++) ('var', 'V'), ('interface', 'I'), ('reference', 'R'), ); my @reserved = ('auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile', 'while', 'fortran', 'asm', 'inline', 'operator', 'class', # (C++) # Her bør vi ha flere av disse: '__asm__','__inline__'); my @reservedJS = ( 'abstract', 'as', 'break', 'case', 'catch', 'class', 'const', 'continue', 'default', 'delete', 'do', 'else', 'export', 'extends', 'false', 'final', 'finally', 'for', 'function', 'if', 'import', 'in', 'instanceof', 'is', 'namespace', 'new', 'null', 'package', 'private', 'public', 'return', 'static', 'super', 'switch', 'this ', 'throw', 'true', 'try', 'typeof', 'use', 'var', 'void', 'while', 'with'); my @reservedXUL = (''); #nothing yet my @ft; my %xref; my @f; my @jsfiles; my $ident = '\~?_*[a-zA-Z][a-zA-Z0-9_]*'; my $fnum = 1; my $realpath = $ARGV[0]; $realpath ||= '.'; $realpath .= '/'; my $totaldefs = 0; my $totalrefs = 0; sub wash { my $towash = $_[0]; $towash =~ s/[^\n]+//gs; return($towash); } sub stripodd { my $tostrip = $_[0]; while ($tostrip =~ s/\{([^\{\}]*)\}/ "\05".&wash($1)/ges) {} $tostrip =~ s/\05/\{\}/gs; $tostrip =~ s/[\{\}]//gs; return($tostrip); } sub classes { my @c = (shift =~ /($ident)\s*(?:$|,)/gm); if (@c) { return(join(":", @c)."::"); } else { return(''); } } sub findidentJS { print(STDERR "Starting pass 1 for Javascript: Collect identifier definitions.\n"); my $start = time; my $defs = 0; my $f = ""; my $contents = ""; my @contents; foreach $f (@jsfiles) { $f =~ s/^$realpath//o; $fileidx{++$fnum} = $f; open(SRCFILE, $realpath.$f); { local $/ = undef; $contents = ; } close(SRCFILE); print(STDERR "(Pass 1 JS) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments. $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # C++ # Unwrap continuation lines. $contents =~ s/\\\s*\n/$1\05/gs; while ($contents =~ s/\05([^\n\05]+)\05/$1\05\05/gs) {} $contents =~ s/(\05+)([^\n]*)/"$2"."\n" x length($1)/gse; # Remove nested parentheses. while ($contents =~ s/\(([^\)]*)\(/\($1\05/g || $contents =~ s/\05([^\(\)]*)\)/ $1 /g) {} # Some heuristics here: Try to recognize "code" and delete # everything up to the next block delimiter. # $contents =~ s/([\;\}\{])(\s*$ident\s*\([^\)]*\)[^\{\}]*)/ # "$1".&wash($2)/goes; # $contents =~ s/([\;\{])(\s*\**$ident\s*\=[^\{\}]*)/ # "$1".&wash($2)/goes; # Parentheses containing commas are probably not interesting. $contents =~ s/\(([^\)]*\,[^\)]*)\)/ "()".&wash($1)/ges; # This operator-stuff messes things up. (C++) $contents =~ s/operator[\<\>\=\!\+\-\*\%\/]{1,2}/operator/g; # Ranges are uninteresting (and confusing). $contents =~ s/\[.*?\]//gs; # From here on, \01 and \02 are used to encapsulate found # identifiers, =pod # Find class definitions. (C++) $contents =~ s/((class)\s+($ident)\s*(:[^;\{]*|)({}|(;)))/ "$2 "."\01".$itype{$2.($6 ? 'forw' : '')}. &classes($4).$3."\02 ".$6.&wash($1)/goes; =cut my $ct = $contents; my $ct2 = ($ct =~ s/\n//g) || 0; my ($ids, $junk); @contents = split(/[;}]/, $contents); $contents = ''; my $l = 1; foreach (@contents) { my $line = $ct = $_; $ct2 = ($ct =~ s/\n//g) || 0; my $bl = $l; if ($line =~ /^(\s*(?:var|const)\s+)($ident(?:\s*,\s*$ident)*)(.*?)$/m) { ($ct, $ids, $junk) = ($1, $2, $3); $l += ($ct =~ s/\n//g) || 0; my @idds = split /\b/, $ids; while (@idds) { my $lid = shift @idds; $lid =~ /($ident)/; $contents .= "\04$l\01".$itype{'var'}."$1\02"; my $spc; do { $spc = shift @idds; $l += $spc =~ s/[\r\n]//g; } while ($spc =~ /[,=]/); } } $l = $bl; if ($line =~ /^(.*?\s*)((?:$ident\s*[:=]*\s*)*)function(\s+)($ident|)(\s*.*)$/sm) { my ($decl, $impl, $ws1, $ws2, $ws3) = ($2, $4, $1, $3, $5); $l += $ws1 =~ s/\n//g; if ($decl) { my @idds = split /\b/, $decl; while (@idds) { my $lid = shift @idds; if ($lid =~ /($ident)/) { $contents .= "\04$l\01".$itype{'function'}."$1\02"; } else { $l += $lid =~ s/\n//g; } } } $l += $ws2 =~ s/\n//g; $contents .= "\04$l\01".$itype{'function'}."$impl\02" if $impl; $l += $ws3 =~ s/\n//g; } $l = $bl; $l += $ct2; } foreach ($contents =~ /^(.*)/gm) { while (/\04(\d+)\01(.)($ident)\02/go) { $xref{$3} .= "$2$fnum:$1\t"; $defs++; } } } # Cleanup. foreach (@reservedJS) { delete($xref{$_}); } $totaldefs = $totaldefs + $defs; print(STDERR "Completed pass 1 Javascript (",(time-$start),"s):", " $defs definitions found (total found so far: $totaldefs).\n\n"); } sub c_clean { my $contents = $_[0]; # Find macro (un)definitions. my $l = 0; my $defs; foreach ($contents =~ /^(.*)/gm) { $l++; if (/^[ \t]*\#\s*(define|undef)\s+($ident)/o) { $xref{$2} .= "$itype{'macro'}$fnum:$l\t"; $defs++; } } # We want to do some funky heuristics with preprocessor blocks # later, so mark them. (FIXME: #elif) $contents =~ s/^[ \t]*\#\s*if.*/\01/gm; $contents =~ s/^[ \t]*\#\s*else.*/\02/gm; $contents =~ s/^[ \t]*\#\s*endif.*/\03/gm; # Strip all preprocessor directives. $contents =~ s/^[ \t]*\#(.*)//gm; # Now, remove all odd block markers ({,}) we find inside # #else..#endif blocks. (And pray they matched one in the # preceding #if..#else block.) while ($contents =~ s/\02([^\01\02\03]*\03)/&stripodd($1)/ges || $contents =~ s/\01([^\01\02\03]*)\03/$1/gs) {} while ($contents =~ /([\01\02\03\04\05])/gs) { print(STDERR "\t ** stray ".($1 eq "\01" ? "#if" : ($1 eq "\02" ? "#else" : ($1 eq "\03" ? "#endif" : "control sequence" ) ) )." found.\n"); } $contents =~ s/[\01\02\03\04\05]//gs; # Remove all but outermost blocks. (No local variables.) while ($contents =~ s/\{([^\{\}]*)\}/ "\05".&wash($1)/ges) {} $contents =~ s/\05/\{\}/gs; # This operator-stuff messes things up. (C++) $contents =~ s/operator[\<\>\=\!\+\-\*\%\/]{1,2}/operator/g; # Ranges are uninteresting (and confusing). $contents =~ s/\[.*?\]//gs; # And so are assignments. $contents =~ s/\=(.*?);/";".&wash($1)/ges; return $contents; } sub java_clean { my $contents = $_[0]; while ($contents =~ s/(\{[^\{]*)\{([^\{\}]*)\}/ $1."\05".&wash($2)/ges) {} $contents =~ s/\05/\{\}/gs; # Remove imports $contents =~ s/^\s*import.*;//gm; # Remove packages $contents =~ s/^\s*package.*;//gm; return $contents; } sub c_classes { my $contents = $_[0]; # Find struct, enum and union definitions. $contents =~ s/((struct|enum|union)\s+($ident|)\s*({}|(;)))/ "$2 ".($3 ? "\01".$itype{$2}.$3."\02 " : "").$5.&wash($1)/goes; # Find class definitions. (C++) $contents =~ s/((class)\s+($ident)\s*(:[^;\{]*|)({}|(;)))/ "$2 "."\01".$itype{$2.($6 ? 'forw' : '')}. &classes($4).$3."\02 ".$6.&wash($1)/goes; return $contents; } sub java_classes { my $contents = $_[0]; # Find Java classes $contents =~ s/((class)\s+($ident)\s*(extends\s+([\.\w]+)\s*|)(implements\s+([\.\w]+)|))/ "$2 "."\01".$itype{$2}.&classes($5.", ".$7).$3."\02 ". &wash($1)/goes; # Find Java interfaces $contents =~ s/((interface)\s+($ident)\s*(extends\s+([\.\w]+)|))/ "$2 "."\01".$itype{$2}.&classes($5).$3."\02 ".&wash($1)/goes; return $contents; } sub idl_interfaces { my $contents = $_[0]; # Find IDL interfaces $contents =~ s/((interface)\s+($ident)\s*(:[^;\{]*|)({}|(;)))/ "$2 "."\01".$itype{$2.($6 ? 'forw' : '')}. &classes($4).$3."\02 ".$6.&wash($1)/goes; return $contents; } sub findident { print(STDERR "Starting pass 1 for C/C++: Collect identifier definitions.\n"); my $start = time; my $defs = 0; my $f = ""; my $contents = ""; my @contents; foreach $f (@f) { $f =~ s/^$realpath//o; my ($java, $idl) = ($ft[$fnum] == 1, $ft[$fnum] == 2); $fileidx{++$fnum} = $f; open(SRCFILE, $realpath.$f); $_ = $/; undef($/); $contents = ; $/ = $_; close(SRCFILE); print(STDERR "(Pass 1 C/C++) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments. $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # C++ # Unwrap continuation lines. $contents =~ s/\\\s*\n/$1\05/gs; while ($contents =~ s/\05([^\n\05]+)\05/$1\05\05/gs) {} $contents =~ s/(\05+)([^\n]*)/"$2"."\n" x length($1)/gse; if ($java) { $contents = java_clean($contents); } else { $contents = c_clean($contents); } # Remove nested parentheses. while ($contents =~ s/\(([^\)]*)\(/\($1\05/g || $contents =~ s/\05([^\(\)]*)\)/ $1 /g) {} # Some heuristics here: Try to recognize "code" and delete # everything up to the next block delimiter. # $contents =~ s/([\;\}\{])(\s*$ident\s*\([^\)]*\)[^\{\}]*)/ # "$1".&wash($2)/goes; # $contents =~ s/([\;\{])(\s*\**$ident\s*\=[^\{\}]*)/ # "$1".&wash($2)/goes; # Parentheses containing commas are probably not interesting. $contents =~ s/\(([^\)]*\,[^\)]*)\)/ "()".&wash($1)/ges; # From here on, \01 and \02 are used to encapsulate found # identifiers, if ($java) { $contents = java_classes($contents); } elsif ($idl) { $contents = idl_interfaces($contents); } else { $contents = c_classes($contents); } @contents = split(/[;\}]/, $contents); $contents = ''; foreach (@contents) { if (!$java) { s/^(\s*)(struct|enum|union|inline)/$1/; if (/$ident[^a-zA-Z0-9_]+$ident/) { # It takes two, baby. my $t = /^\s*typedef/s; # Is this a type definition? s/($ident(?:\s*::\s*$ident|)) # ($1) Match the identifier ([\s\)]* # ($2) Tokens allowed after identifier (\([^\)]*\) # ($3) Function parameters? (?:\s*:[^\{]*|) # inheritage specification (C++) |) # No function parameters \s*($|,|\{))/ # ($4) Allowed termination chars. "\01". # identifier marker ($t # if type definition... ? $itype{'typedef'} # ..mark as such : ($3 # $3 is empty unless function definition. ? ($4 eq '{' # Terminating token indicates ? $itype{'function'} # function or : $itype{'funcprot'}) # function prototype. : $itype{'var'}) # Variable. )."$1\02 ".&wash($2)/goesx; } } else { s/($ident)\s*\([^\)]*\)[^\{]*($|\{)/ "\01".($2 eq '{' ? $itype{'function'} : $itype{'funcprot'})."$1\02 ". &wash($2)/goesx; s/($ident)\s*(=.*)$/ "\01".$itype{'var'}."$1\02 ".&wash($2)/goesx; } $contents .= $_; } my $l = 0; foreach ($contents =~ /^(.*)/gm) { $l++; while (/\01(.)(?:(.+?)\s*::\s*|)($ident)\02/go) { $xref{$3} .= "$1$fnum:$l".($2 ? ":$2" : "")."\t"; $defs++; } } } # Remove reserved from xref foreach (@reserved) { delete($xref{$_}); } $totaldefs = $totaldefs + $defs; print(STDERR "Completed pass 1 C/C++ (",(time-$start),"s):", " $defs definitions found (total found so far: $totaldefs).\n\n"); } sub findusageJS { print(STDERR "Starting pass 2 Javascript: Generate reference statistics.\n"); my $start = time; my $refs = 0; my $f; foreach $f (@jsfiles) { $f =~ s/^$realpath//o; $fnum++; my $lcount = 0; my %tref = (); open(SRCFILE, $realpath.$f); $_ = $/; undef($/); my $contents = ; $/ = $_; close(SRCFILE); print(STDERR "(Pass 2 JS) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # FIXME: "var" my @lines = split(/\n/, $contents); my $line; foreach $line (@lines) { $lcount++; foreach ($line =~ /(?:^|[^a-zA-Z_\#])($ident)\b/og) { $tref{$_} .= "$lcount," if $xref{$_}; } } while (($a, $b) = each(%tref)) { chop($b); $xref{$a} .= "R$fnum:$b\t"; $refs++; } } $totalrefs = $totalrefs + $refs; print(STDERR "Completed pass 2 (",(time-$start),"s):", "$refs references to known identifiers found (total: $totalrefs).\n\n"); } sub findusage { print(STDERR "Starting pass 2 C/C++: Generate reference statistics.\n"); my $start = time; my $refs = 0; my $f; foreach $f (@f) { $f =~ s/^$realpath//o; $fnum++; my $lcount = 0; my %tref = (); open(SRCFILE, $realpath.$f); $_ = $/; undef($/); my $contents = ; $/ = $_; close(SRCFILE); print(STDERR "(Pass 2 C/C++) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # Remove include statements $contents =~ s/^[ \t]*\#include[ \t]+[^\n]*//gm; # FIXME: "var" my @lines = split(/\n/, $contents); my $line; foreach $line (@lines) { $lcount++; foreach ($line =~ /(?:^|[^a-zA-Z_\#])($ident)\b/og) { $tref{$_} .= "$lcount," if $xref{$_}; } } while (($a, $b) = each(%tref)) { chop($b); $xref{$a} .= "R$fnum:$b\t"; $refs++; } } $totalrefs = $totalrefs + $refs; print(STDERR "Completed pass 2 C/C++ (",(time-$start),"s):", "$refs references to known identifiers found (total: $totalrefs).\n\n"); } sub dumpdb { print(STDERR "Starting pass 3: Dump database to disk.\n"); my $start = time; my %xrefdb; tie (%xrefdb, "DB_File" , "xref.out.$$", O_RDWR|O_CREAT, 0664, $DB_HASH) || die("Could not open \"xref\" for writing"); my $i = 0; my $k; my $v; while (($k, $v) = each(%xref)) { $i++; delete($xref{$k}); $xrefdb{$k} = $v; unless ($i % 100) { print(STDERR "(Pass 3) identifier $i of maximum $totaldefs...\n"); } } untie(%xrefdb); rename("xref.out.$$", "xref") || die "Couldn't rename xref.out.$$ to xref"; print(STDERR "Completed pass 3 (",(time-$start),"s):", "Information on $i identifiers dumped to disk.\n\n"); } tie (%fileidx, "DB_File", "fileidx.out.$$", O_RDWR|O_CREAT, 0660, $DB_HASH) || die("Could not open \"fileidx.out.$$\" for writing"); open(FILES, "find $realpath -print |"); print(STDERR "Starting pass 0: Checking for files to index.\n"); print(STDERR "looking in $realpath.\n"); while () { chop; if (/\.([ch]|cpp?|idl|cc|java)$/i) { push(@ft, ($1 eq 'java')?1:(($1 eq 'idl')?2:0)); push(@f, $_); } if (/\.(js)$/i) { push(@jsfiles, $_) } # push(@f, $_) if /\.([ch]|cpp?|idl|cc|java)$/i; # Duplicated in lib/LXR/Common.pm } close(FILES); print "Stage 0 C/C++ file count is : " . scalar(@f) . "\n"; print "Stage 0 JS file count is : " . scalar(@jsfiles) . "\n"; $fnum = 0; &findident; print "Stage 1 C/C++ XREF keycount is : " . scalar(keys %xref) . "\n"; &findidentJS; print "Stage 1 C/C++/JS XREF keycount is : " . scalar(keys %xref) . "\n"; $fnum = 0; &findusage; print "Stage 2 C/C++ XREF keycount is : " . scalar(keys %xref) . "\n"; &findusageJS; print "Stage 2 C/C++/JS XREF keycount is : " . scalar(keys %xref) . "\n"; &dumpdb; dbmclose(%fileidx); rename("fileidx.out.$$", "fileidx") || die "Couldn't rename fileidx.out.$$ to fileidx";