# -*- Mode: perl; tab-width: 4; indent-tabs-mode: nil -*-
#
# The contents of this file are subject to the Netscape Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/NPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is the Bonsai CVS tool.
#
# The Initial Developer of the Original Code is Netscape Communications
# Corporation. Portions created by Netscape are
# Copyright (C) 1998 Netscape Communications Corporation. All
# Rights Reserved.
#
# Contributor(s):
package SourceChecker;

# SourceChecker - maintains a dictionary classifying tokens as known-good
# English words, "bad" (flagged) words, or names, and marks up lines of
# text with HTML <FONT> coloring according to that classification.
# Part of the Bonsai CVS tool.
#
# NOTE(review): this file had interleaved blame-annotation junk lines
# ("|" and timestamps) stripped out; the code tokens are unchanged.

require Exporter;

@ISA = qw(Exporter);

# Public interface: the shared dictionary plus the functions that feed
# it and use it.
@EXPORT = qw(%token_dictionary add_good_english add_good_words add_bad_words add_names tokenize_line markup_line);

# Internals exported on request (for callers that need finer control).
@EXPORT_OK = qw($GOOD_TOKEN $UNKNOWN_TOKEN $BAD_TOKEN $NAME_TOKEN add_token canonical_token @markup_prefix @markup_suffix);

# Token-kind constants.  Each is a reference to an integer; dereference
# with $$ to get the kind value stored in %token_dictionary.  Higher
# values take precedence when the same token is added more than once
# (see add_token).
$GOOD_TOKEN = \-1;
$UNKNOWN_TOKEN = \0;
$NAME_TOKEN = \1;
$BAD_TOKEN = \2;

# HTML wrapped around a phrase by markup_line, indexed by non-negative
# token kind: 0 (unknown) => green, 1 (name) => red, 2 (bad) => blue.
# Kind -1 (good) gets no markup.
@markup_prefix = ('<FONT COLOR="green">', '<FONT COLOR="red">', '<FONT COLOR="blue">');
@markup_suffix = ('</FONT>', '</FONT>', '</FONT>');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Reduce a raw token to its canonical dictionary form: strip apostrophes
# and ampersands, then lowercase it.  Tokens of 2 characters or fewer
# carry no signal and yield undef; undef input yields undef.
#
# NOTE(review): the character class below contains a mangled byte rendered
# as "<27>" (possibly a curly quote in the original encoding) -- it is
# preserved verbatim here; confirm against a pristine copy of the source.
sub canonical_token($)
{
    my ($raw) = @_;

    return undef unless defined $raw;

    # Drop quote-like and ampersand characters entirely ("don't" => "dont").
    $raw =~ s/[\'<27>\&]+//g;

    # Explicit "return undef" (not bare "return"): callers push the result
    # into a list, and a bare return would push nothing, desynchronizing
    # the parallel phrase/token arrays.
    return undef if length($raw) <= 2;

    return lc $raw;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Append one phrase/token pair to the parallel result arrays, running the
# token through canonical_token first (separators pass undef).
# Note: operates on |@exploded_phrases| and |@exploded_tokens| that are
# dynamically scoped (local) in the caller(s).
sub _push_tokens($$)
{
    my ($phrase, $token) = @_;

    push @exploded_phrases, $phrase;
    push @exploded_tokens, canonical_token($token);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Split |$line| into an alternating sequence of separator phrases and word
# phrases, pushing each phrase -- paired with its candidate token, or undef
# for separators -- onto the parallel arrays via _push_tokens.
# Word phrases are additionally divided at CamelCase boundaries, so
# "FooBar" yields the tokens "Foo" and "Bar".
# Returns the last index of @exploded_phrases.
#
# NOTE(review): the regexes below contain a mangled byte rendered as "<27>"
# (likely a curly quote in the original encoding) -- preserved verbatim;
# confirm against a pristine copy of the source.
sub _explode_line($)
{
    # Note: inherits (and returns results into) |@exploded_phrases| and |@exploded_tokens| from caller(s)

    my $line = shift;

    # split with a capturing group keeps the word runs as well as the
    # separators, alternating separator, word, separator, word, ...
    # (starting with a possibly-empty separator).  $between_tokens toggles
    # each iteration to track which kind the current phrase is.
    my $between_tokens = 0;
    foreach $phrase ( split /([A-Za-z\'<27>\&]+)/, $line )
    {
        if ( $between_tokens = !$between_tokens )
        {
            # Separator phrase: keep it for reassembly, but it has no token.
            _push_tokens($phrase, undef);
            next;
        }

        # Peel tokens off the front of the word phrase until it is consumed.
        for ( $_ = $phrase; $_; )
        {
            # Grab a leading uppercase run followed by a lowercase run,
            # e.g. "Foo" from "FooBar", or all of "FOOBar".
            m/^[A-Z\'<27>\&]*[a-z\'<27>\&]*/;
            $token = $&;
            $_ = $';   # remainder of the phrase for the next iteration

            # If the chunk contains an interior Capitalized word with text
            # before it (e.g. "FOOBar" = "FOO" + "Bar"), emit the leading
            # part ($`) as its own phrase/token first.
            if ( ($token =~ m/[A-Z][a-z\'<27>]+/) && $` )
            {
                $token = $&;
                _push_tokens($`, $`);
            }
            _push_tokens($token, $token);
        }
    }

    # Return the final index of the phrase array (dynamically scoped).
    $#exploded_phrases;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Split a line of text into its canonical tokens (see canonical_token),
# discarding separators and tokens too short to classify.
# Returns the list of defined canonical tokens, in order; in scalar
# context, the number of such tokens.
sub tokenize_line($)
{
    my $line = shift;

    # Both arrays are filled by _explode_line via dynamic scoping.
    # Localize BOTH so repeated calls don't accumulate state in the
    # package globals.  (Previously only @exploded_tokens was localized,
    # so @exploded_phrases grew without bound across calls.)
    local @exploded_phrases;
    local @exploded_tokens;

    _explode_line($line);

    # Separator phrases and too-short words canonicalize to undef; drop
    # them.  (Replaces the original compact-in-place-while-iterating loop.)
    return grep { defined $_ } @exploded_tokens;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# HTML-escape a line of text and wrap each phrase whose token has a
# non-negative dictionary kind in the corresponding @markup_prefix /
# @markup_suffix <FONT> markup.  Tokens absent from the dictionary are
# treated as $$UNKNOWN_TOKEN (kind 0, green); known-good tokens (kind -1)
# are left unmarked.  Returns the marked-up line as a single string.
sub markup_line($)
{
    my $line = shift;

    local @exploded_phrases;
    local @exploded_tokens;

    _explode_line($line);

    my $i = 0;
    foreach $phrase ( @exploded_phrases )
    {
        # Escape HTML metacharacters.  '&' must be escaped first so the
        # entities introduced for '<' and '>' aren't double-escaped.
        # (These substitutions had been corrupted into no-ops such as
        # "s/&/&/g" -- restored to proper entity escaping.)
        $phrase =~ s/&/&amp;/g;
        $phrase =~ s/</&lt;/g;
        $phrase =~ s/>/&gt;/g;

        # @exploded_tokens parallels @exploded_phrases (see _push_tokens).
        my $token = $exploded_tokens[$i];
        if ( defined $token )
        {
            # Absent dictionary entries behave as kind 0 (unknown), which
            # the original achieved implicitly by comparing undef >= 0.
            my $token_kind = $token_dictionary{$token};
            $token_kind = $$UNKNOWN_TOKEN unless defined $token_kind;

            if ( $token_kind >= 0 )
            {
                $phrase = $markup_prefix[$token_kind] . $phrase . $markup_suffix[$token_kind];
            }
        }
        ++$i;
    }

    join '', @exploded_phrases;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Record |$token| in the shared %token_dictionary with |$token_kind|.
# A token already present keeps its entry unless the new kind is strictly
# higher -- higher kinds (e.g. $$BAD_TOKEN) take precedence.
sub add_token($$)
{
    my ($token, $kind) = @_;

    my $existing = $token_dictionary{$token};
    return if defined $existing && $existing >= $kind;

    $token_dictionary{$token} = $kind;
}
|
|
|
|
|
|
|
|
|
|
# Register every token on |$line| as known-good English.  For each word,
# also register its "disemvoweled" abbreviation -- first character kept,
# vowels and 'y' stripped from the rest -- when that abbreviation differs
# from the word and is longer than 2 characters (e.g. "source" => "src").
sub add_good_english($)
{
    my $line = shift;

    for my $word ( tokenize_line($line) )
    {
        add_token($word, $$GOOD_TOKEN);

        # Build the vowel-stripped form, keeping the first character verbatim.
        my $head = substr($word, 0, 1);
        my $tail = substr($word, 1);
        $tail =~ s/[aeiouy]+//g;

        my $abbrev = $head . $tail;
        next if length($abbrev) == length($word);   # nothing was stripped
        next if length($abbrev) <= 2;               # too short to be useful

        add_token($abbrev, $$GOOD_TOKEN);
    }
}
|
|
|
|
|
|
|
|
|
|
# Register every token on |$line| in the dictionary with |$token_kind|.
sub _add_tokens($$)
{
    my ($line, $token_kind) = @_;

    add_token($_, $token_kind) for tokenize_line($line);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as known-good (kind $$GOOD_TOKEN).
sub add_good_words($)
{
    my $line = shift;
    _add_tokens($line, $$GOOD_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as bad/flagged (kind $$BAD_TOKEN).
sub add_bad_words($)
{
    my $line = shift;
    _add_tokens($line, $$BAD_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
# Register the words on |$line| as names (kind $$NAME_TOKEN).
sub add_names($)
{
    my $line = shift;
    _add_tokens($line, $$NAME_TOKEN);
}
|
|
|
|
|
|
|
|
|
|
1;
|