#!/usr/bonsaitools/bin/perl # $Id: genxref,v 1.5 1999-07-22 19:58:11 terry%mozilla.org Exp $ # genxref.pl -- Finds identifiers in a set of C files using an # extremely fuzzy algorithm. It sort of works. # # Arne Georg Gleditsch # Per Kristian Gjermshus # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # TODO: ns/cmd/xfe/src/MozillaApp.h, XFE_MozillaApp ###################################################################### use lib 'lib/'; use integer; use DB_File; %itype = (('macro', 'M'), ('typedef', 'T'), ('struct', 'S'), ('enum', 'E'), ('union', 'U'), ('function', 'F'), ('funcprot', 'f'), ('class', 'C'), # (C++) ('classforw', 'c'), # (C++) ('var', 'V'), ('interface', 'I')); # ('reference', 'R') @reserved = ('auto', 'break', 'case', 'char', 'const', 'continue', 'default', 'do', 'double', 'else', 'enum', 'extern', 'float', 'for', 'goto', 'if', 'int', 'long', 'register', 'return', 'short', 'signed', 'sizeof', 'static', 'struct', 'switch', 'typedef', 'union', 'unsigned', 'void', 'volatile', 'while', 'fortran', 'asm', 'inline', 'operator', 'class', # (C++) # Her bør vi ha flere av disse: '__asm__','__inline__'); $ident = '\~?_*[a-zA-Z][a-zA-Z0-9_]*'; $realpath = $ARGV[0]; $realpath ||= '.'; $realpath .= '/'; sub wash { my $towash = $_[0]; $towash =~ s/[^\n]+//gs; return($towash); } sub stripodd { my $tostrip = $_[0]; while ($tostrip =~ s/\{([^\{\}]*)\}/ "\05".&wash($1)/ges) {} $tostrip =~ s/\05/\{\}/gs; $tostrip =~ s/[\{\}]//gs; return($tostrip); } sub classes { my @c = (shift =~ /($ident)\s*(?:$|,)/gm); if (@c) { return(join(":", @c)."::"); } else { return(''); } } sub c_clean { my $contents = $_[0]; # Find macro (un)definitions. $l = 0; foreach ($contents =~ /^(.*)/gm) { $l++; if (/^[ \t]*\#\s*(define|undef)\s+($ident)/o) { $xref{$2} .= "$itype{'macro'}$fnum:$l\t"; $defs++; } } # We want to do some funky heuristics with preprocessor blocks # later, so mark them. (FIXME: #elif) $contents =~ s/^[ \t]*\#\s*if.*/\01/gm; $contents =~ s/^[ \t]*\#\s*else.*/\02/gm; $contents =~ s/^[ \t]*\#\s*endif.*/\03/gm; # Strip all preprocessor directives. $contents =~ s/^[ \t]*\#(.*)//gm; # Now, remove all odd block markers ({,}) we find inside # #else..#endif blocks. (And pray they matched one in the # preceding #if..#else block.) while ($contents =~ s/\02([^\01\02\03]*\03)/&stripodd($1)/ges || $contents =~ s/\01([^\01\02\03]*)\03/$1/gs) {} while ($contents =~ /([\01\02\03\04\05])/gs) { print(STDERR "\t ** stray ".($1 eq "\01" ? "#if" : ($1 eq "\02" ? "#else" : ($1 eq "\03" ? "#endif" : "control sequence" ) ) )." found.\n"); } $contents =~ s/[\01\02\03\04\05]//gs; # Remove all but outermost blocks. (No local variables.) while ($contents =~ s/\{([^\{\}]*)\}/ "\05".&wash($1)/ges) {} $contents =~ s/\05/\{\}/gs; # This operator-stuff messes things up. (C++) $contents =~ s/operator[\<\>\=\!\+\-\*\%\/]{1,2}/operator/g; # Ranges are uninteresting (and confusing). $contents =~ s/\[.*?\]//gs; # And so are assignments. $contents =~ s/\=(.*?);/";".&wash($1)/ges; return $contents; } sub java_clean { my $contents = $_[0]; while ($contents =~ s/(\{[^\{]*)\{([^\{\}]*)\}/ $1."\05".&wash($2)/ges) {} $contents =~ s/\05/\{\}/gs; # Remove imports $contents =~ s/^\s*import.*;//gm; # Remove packages $contents =~ s/^\s*package.*;//gm; return $contents; } sub c_classes { my $contents = $_[0]; # Find struct, enum and union definitions. $contents =~ s/((struct|enum|union)\s+($ident|)\s*({}|(;)))/ "$2 ".($3 ? "\01".$itype{$2}.$3."\02 " : "").$5.&wash($1)/goes; # Find class definitions. (C++) $contents =~ s/((class)\s+($ident)\s*(:[^;\{]*|)({}|(;)))/ "$2 "."\01".$itype{$2.($6 ? 'forw' : '')}. &classes($4).$3."\02 ".$6.&wash($1)/goes; return $contents; } sub java_classes { my $contents = $_[0]; # Find Java classes $contents =~ s/((class)\s+($ident)\s*(extends\s+([\.\w]+)\s*|)(implements\s+([\.\w]+)|))/ "$2 "."\01".$itype{$2}.&classes($5.", ".$7).$3."\02 ". &wash($1)/goes; # Find Java interfaces $contents =~ s/((interface)\s+($ident)\s*(extends\s+([\.\w]+)|))/ "$2 "."\01".$itype{$2}.&classes($5).$3."\02 ".&wash($1)/goes; return $contents; } sub findident { print(STDERR "Starting pass 1: Collect identifier definitions.\n"); $start = time; $fnum = 0; $defs = 0; foreach $f (@f) { $f =~ s/^$realpath//o; $java = $ft[$fnum]; $fileidx{++$fnum} = $f; open(SRCFILE, $realpath.$f); $_ = $/; undef($/); $contents = ; $/ = $_; close(SRCFILE); print(STDERR "(Pass 1) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments. $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # C++ # Unwrap continunation lines. $contents =~ s/\\\s*\n/$1\05/gs; while ($contents =~ s/\05([^\n\05]+)\05/$1\05\05/gs) {} $contents =~ s/(\05+)([^\n]*)/"$2"."\n" x length($1)/gse; if ($java) { $contents = java_clean($contents); } else { $contents = c_clean($contents); } # Remove nested parentheses. while ($contents =~ s/\(([^\)]*)\(/\($1\05/g || $contents =~ s/\05([^\(\)]*)\)/ $1 /g) {} # Some heuristics here: Try to recognize "code" and delete # everything up to the next block delimiter. # $contents =~ s/([\;\}\{])(\s*$ident\s*\([^\)]*\)[^\{\}]*)/ # "$1".&wash($2)/goes; # $contents =~ s/([\;\{])(\s*\**$ident\s*\=[^\{\}]*)/ # "$1".&wash($2)/goes; # Parentheses containing commas are probably not interesting. $contents =~ s/\(([^\)]*\,[^\)]*)\)/ "()".&wash($1)/ges; # From here on, \01 and \02 are used to encapsulate found # identifiers, if ($java) { $contents = java_classes($contents); } else { $contents = c_classes($contents); } @contents = split(/[;\}]/, $contents); $contents = ''; foreach (@contents) { if (!$java) { s/^(\s*)(struct|enum|union|inline)/$1/; if (/$ident[^a-zA-Z0-9_]+$ident/) { # It takes two, baby. $t = /^\s*typedef/s; # Is this a type definition? s/($ident(?:\s*::\s*$ident|)) # ($1) Match the identifier ([\s\)]* # ($2) Tokens allowed after identifier (\([^\)]*\) # ($3) Function parameters? (?:\s*:[^\{]*|) # inheritage specification (C++) |) # No function parameters \s*($|,|\{))/ # ($4) Allowed termination chars. "\01". # identifier marker ($t # if type definition... ? $itype{'typedef'} # ..mark as such : ($3 # $3 is empty unless function definition. ? ($4 eq '{' # Terminating token indicates ? $itype{'function'} # function or : $itype{'funcprot'}) # function prototype. : $itype{'var'}) # Variable. )."$1\02 ".&wash($2)/goesx; } } else { s/($ident)\s*\([^\)]*\)[^\{]*($|\{)/ "\01".($2 eq '{' ? $itype{'function'} : $itype{'funcprot'})."$1\02 ". &wash($2)/goesx; s/($ident)\s*(=.*)$/ "\01".$itype{'var'}."$1\02 ".&wash($2)/goesx; } $contents .= $_; } $l = 0; foreach ($contents =~ /^(.*)/gm) { $l++; while (/\01(.)(?:(.+?)\s*::\s*|)($ident)\02/go) { $xref{$3} .= "$1$fnum:$l".($2 ? ":$2" : "")."\t"; $defs++; } } } # Så juksar me litt. foreach (@reserved) { delete($xref{$_}); } print(STDERR "Completed pass 1 (",(time-$start),"s):", " $defs definitions found.\n\n"); } sub findusage { print(STDERR "Starting pass 2: Generate reference statistics.\n"); $start = time; $fnum = 0; $refs = 0; foreach $f (@f) { $f =~ s/^$realpath//o; $fnum++; $lcount = 0; %tref = (); open(SRCFILE, $realpath.$f); $_ = $/; undef($/); $contents = ; $/ = $_; close(SRCFILE); print(STDERR "(Pass 2) $f (",length($contents), "), file $fnum of ",$#f+1,"...\n"); # Remove comments $contents =~ s/\/\*(.*?)\*\//&wash($1)/ges; $contents =~ s/\/\/[^\n]*//g; # Remove include statements $contents =~ s/^[ \t]*\#include[ \t]+[^\n]*//gm; # FIXME: "var" @lines = split(/\n/, $contents); foreach $line (@lines) { $lcount++; foreach ($line =~ /(?:^|[^a-zA-Z_\#])($ident)\b/og) { $tref{$_} .= "$lcount," if $xref{$_}; } } while (($a, $b) = each(%tref)) { chop($b); $xref{$a} .= "R$fnum:$b\t"; $refs++; } } print(STDERR "Completed pass 2 (",(time-$start),"s):", "$refs references to known identifiers found.\n\n"); } sub dumpdb { print(STDERR "Starting pass 3: Dump database to disk.\n"); $start = time; tie (%xrefdb, "DB_File" , "xref.out.$$", O_RDWR|O_CREAT, 0664, $DB_HASH) || die("Could not open \"xref\" for writing"); $i = 0; while (($k, $v) = each(%xref)) { $i++; delete($xref{$k}); $xrefdb{$k} = $v; unless ($i % 100) { print(STDERR "(Pass 3) identifier $i of maximum $defs...\n"); } } untie(%xrefdb); rename("xref.out.$$", "xref") || die "Couldn't rename xref.out.$$ to xref"; print(STDERR "Completed pass 3 (",(time-$start),"s):", "Information on $i identifiers dumped to disk.\n\n"); } tie (%fileidx, "DB_File", "fileidx.out.$$", O_RDWR|O_CREAT, 0660, $DB_HASH) || die("Could not open \"fileidx.out.$$\" for writing"); open(FILES, "find $realpath -print |"); while () { chop; if (/\.([ch]|cpp?|idl|cc)$/i) { push(@f, $_); push(@t, 0); } if (/\.(java)$/i) { push(@f, $_); push(@ft, 1); } # push(@f, $_) if /\.([ch]|cpp?|idl|cc|java)$/i; # Duplicated in lib/LXR/Common.pm } close(FILES); &findident; &findusage; &dumpdb; dbmclose(%fileidx); rename("fileidx.out.$$", "fileidx") || die "Couldn't rename fileidx.out.$$ to fileidx";