# -*- Mode: perl; tab-width: 4; indent-tabs-mode: nil -*-
#
# The contents of this file are subject to the Netscape Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/NPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is the Bonsai CVS tool.
#
# The Initial Developer of the Original Code is Netscape Communications
# Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#
# Contributor(s):
package SourceChecker;

# SourceChecker - maintains a dictionary classifying tokens as known-good
# English words, "bad" (flagged) words, or names, and marks up lines of
# text with HTML <FONT> coloring according to that classification.
# Part of the Bonsai CVS tool.
#
# NOTE(review): this file had interleaved blame-annotation junk lines
# ("|" and timestamps) stripped out; the code tokens are unchanged.

require Exporter;

@ISA = qw(Exporter);

# Public interface: the shared dictionary plus the functions that feed
# it and use it.
@EXPORT = qw(%token_dictionary add_good_english add_good_words add_bad_words add_names tokenize_line markup_line);

# Internals exported on request (for callers that need finer control).
@EXPORT_OK = qw($GOOD_TOKEN $UNKNOWN_TOKEN $BAD_TOKEN $NAME_TOKEN add_token canonical_token @markup_prefix @markup_suffix);

# Token-kind constants.  Each is a reference to an integer; dereference
# with $$ to get the kind value stored in %token_dictionary.  Higher
# values take precedence when the same token is added more than once
# (see add_token).
$GOOD_TOKEN = \-1;
$UNKNOWN_TOKEN = \0;
$NAME_TOKEN = \1;
$BAD_TOKEN = \2;

# HTML wrapped around a phrase by markup_line, indexed by non-negative
# token kind: 0 (unknown) => green, 1 (name) => red, 2 (bad) => blue.
# Kind -1 (good) gets no markup.
@markup_prefix = ('<FONT COLOR="green">', '<FONT COLOR="red">', '<FONT COLOR="blue">');
@markup_suffix = ('</FONT>', '</FONT>', '</FONT>');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Reduce a raw token to its canonical dictionary form: strip apostrophes
# and ampersands, then lowercase it.  Tokens of 2 characters or fewer
# carry no signal and yield undef; undef input yields undef.
#
# NOTE(review): the character class below contains a mangled byte rendered
# as "<27>" (possibly a curly quote in the original encoding) -- it is
# preserved verbatim here; confirm against a pristine copy of the source.
sub canonical_token($)
{
    my ($raw) = @_;

    return undef unless defined $raw;

    # Drop quote-like and ampersand characters entirely ("don't" => "dont").
    $raw =~ s/[\'<27>\&]+//g;

    # Explicit "return undef" (not bare "return"): callers push the result
    # into a list, and a bare return would push nothing, desynchronizing
    # the parallel phrase/token arrays.
    return undef if length($raw) <= 2;

    return lc $raw;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Append one phrase/token pair to the parallel result arrays, running the
# token through canonical_token first (separators pass undef).
# Note: operates on |@exploded_phrases| and |@exploded_tokens| that are
# dynamically scoped (local) in the caller(s).
sub _push_tokens($$)
{
    my ($phrase, $token) = @_;

    push @exploded_phrases, $phrase;
    push @exploded_tokens, canonical_token($token);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Split |$line| into an alternating sequence of separator phrases and word
# phrases, pushing each phrase -- paired with its candidate token, or undef
# for separators -- onto the parallel arrays via _push_tokens.
# Word phrases are additionally divided at CamelCase boundaries, so
# "FooBar" yields the tokens "Foo" and "Bar".
# Returns the last index of @exploded_phrases.
#
# NOTE(review): the regexes below contain a mangled byte rendered as "<27>"
# (likely a curly quote in the original encoding) -- preserved verbatim;
# confirm against a pristine copy of the source.
sub _explode_line($)
{
    # Note: inherits (and returns results into) |@exploded_phrases| and |@exploded_tokens| from caller(s)

    my $line = shift;

    # split with a capturing group keeps the word runs as well as the
    # separators, alternating separator, word, separator, word, ...
    # (starting with a possibly-empty separator).  $between_tokens toggles
    # each iteration to track which kind the current phrase is.
    my $between_tokens = 0;
    foreach $phrase ( split /([A-Za-z\'<27>\&]+)/, $line )
    {
        if ( $between_tokens = !$between_tokens )
        {
            # Separator phrase: keep it for reassembly, but it has no token.
            _push_tokens($phrase, undef);
            next;
        }

        # Peel tokens off the front of the word phrase until it is consumed.
        for ( $_ = $phrase; $_; )
        {
            # Grab a leading uppercase run followed by a lowercase run,
            # e.g. "Foo" from "FooBar", or all of "FOOBar".
            m/^[A-Z\'<27>\&]*[a-z\'<27>\&]*/;
            $token = $&;
            $_ = $';   # remainder of the phrase for the next iteration

            # If the chunk contains an interior Capitalized word with text
            # before it (e.g. "FOOBar" = "FOO" + "Bar"), emit the leading
            # part ($`) as its own phrase/token first.
            if ( ($token =~ m/[A-Z][a-z\'<27>]+/) && $` )
            {
                $token = $&;
                _push_tokens($`, $`);
            }
            _push_tokens($token, $token);
        }
    }

    # Return the final index of the phrase array (dynamically scoped).
    $#exploded_phrases;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Split a line of text into its canonical tokens (see canonical_token),
# discarding separators and tokens too short to classify.
# Returns the list of defined canonical tokens, in order; in scalar
# context, the number of such tokens.
sub tokenize_line($)
{
    my $line = shift;

    # Both arrays are filled by _explode_line via dynamic scoping.
    # Localize BOTH so repeated calls don't accumulate state in the
    # package globals.  (Previously only @exploded_tokens was localized,
    # so @exploded_phrases grew without bound across calls.)
    local @exploded_phrases;
    local @exploded_tokens;

    _explode_line($line);

    # Separator phrases and too-short words canonicalize to undef; drop
    # them.  (Replaces the original compact-in-place-while-iterating loop.)
    return grep { defined $_ } @exploded_tokens;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HTML-escape a line of text and wrap each phrase whose token has a
# non-negative dictionary kind in the corresponding @markup_prefix /
# @markup_suffix <FONT> markup.  Tokens absent from the dictionary are
# treated as $$UNKNOWN_TOKEN (kind 0, green); known-good tokens (kind -1)
# are left unmarked.  Returns the marked-up line as a single string.
sub markup_line($)
{
    my $line = shift;

    local @exploded_phrases;
    local @exploded_tokens;

    _explode_line($line);

    my $i = 0;
    foreach $phrase ( @exploded_phrases )
    {
        # Escape HTML metacharacters.  '&' must be escaped first so the
        # entities introduced for '<' and '>' aren't double-escaped.
        # (These substitutions had been corrupted into no-ops such as
        # "s/&/&/g" -- restored to proper entity escaping.)
        $phrase =~ s/&/&amp;/g;
        $phrase =~ s/</&lt;/g;
        $phrase =~ s/>/&gt;/g;

        # @exploded_tokens parallels @exploded_phrases (see _push_tokens).
        my $token = $exploded_tokens[$i];
        if ( defined $token )
        {
            # Absent dictionary entries behave as kind 0 (unknown), which
            # the original achieved implicitly by comparing undef >= 0.
            my $token_kind = $token_dictionary{$token};
            $token_kind = $$UNKNOWN_TOKEN unless defined $token_kind;

            if ( $token_kind >= 0 )
            {
                $phrase = $markup_prefix[$token_kind] . $phrase . $markup_suffix[$token_kind];
            }
        }
        ++$i;
    }

    join '', @exploded_phrases;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Record |$token| in the shared %token_dictionary with |$token_kind|.
# A token already present keeps its entry unless the new kind is strictly
# higher -- higher kinds (e.g. $$BAD_TOKEN) take precedence.
sub add_token($$)
{
    my ($token, $kind) = @_;

    my $existing = $token_dictionary{$token};
    return if defined $existing && $existing >= $kind;

    $token_dictionary{$token} = $kind;
}
|
|
|
|
|
|
|
|
|
|
# Register every token on |$line| as known-good English.  For each word,
# also register its "disemvoweled" abbreviation -- first character kept,
# vowels and 'y' stripped from the rest -- when that abbreviation differs
# from the word and is longer than 2 characters (e.g. "source" => "src").
sub add_good_english($)
{
    my $line = shift;

    for my $word ( tokenize_line($line) )
    {
        add_token($word, $$GOOD_TOKEN);

        # Build the vowel-stripped form, keeping the first character verbatim.
        my $head = substr($word, 0, 1);
        my $tail = substr($word, 1);
        $tail =~ s/[aeiouy]+//g;

        my $abbrev = $head . $tail;
        next if length($abbrev) == length($word);   # nothing was stripped
        next if length($abbrev) <= 2;               # too short to be useful

        add_token($abbrev, $$GOOD_TOKEN);
    }
}
|
|
|
|
|
|
|
|
|
|
# Register every token on |$line| in the dictionary with |$token_kind|.
sub _add_tokens($$)
{
    my ($line, $token_kind) = @_;

    add_token($_, $token_kind) for tokenize_line($line);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as known-good (kind $$GOOD_TOKEN).
sub add_good_words($)
{
    my $line = shift;
    _add_tokens($line, $$GOOD_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as bad/flagged (kind $$BAD_TOKEN).
sub add_bad_words($)
{
    my $line = shift;
    _add_tokens($line, $$BAD_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as names (kind $$NAME_TOKEN).
sub add_names($)
{
    my $line = shift;
    _add_tokens($line, $$NAME_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
1;
|