# ruby/test/racc/assets/namae.y
#
# 303 lines
# 8.0 KiB
# Plaintext

# -*- ruby -*-
# vi: set ft=ruby :
# Copyright (C) 2012 President and Fellows of Harvard College
# Copyright (C) 2013-2014 Sylvester Keil
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of the copyright holder.
# Racc grammar for Namae::Parser. It reduces the token stream produced by
# next_token (defined in the inner section) to an Array of Name objects.
class Namae::Parser
# Terminal symbols; each of them is emitted by next_token in the inner section.
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX
# Declare that zero shift/reduce conflicts are expected in this grammar.
expect 0
rule
# names: a possibly empty list of names separated by AND tokens.
# The result is an Array; every further name is appended to it.
names : { result = [] }
| name { result = [val[0]] }
| names AND name { result = val[0] << val[2] }
# name: a single name. A lone word becomes the given name; a lone word that
# follows an honorific becomes the family name instead. An honorific that
# precedes a display-order name is merged into that name.
name : word { result = Name.new(:given => val[0]) }
| display_order
| honorific word { result = val[0].merge(:family => val[1]) }
| honorific display_order { result = val[1].merge(val[0]) }
| sort_order
# honorific: an appellation or a title that precedes the name.
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
| TITLE { result = Name.new(:title => val[0]) }
# display_order: names written given-name first, optionally with a nickname
# and/or a particle (von) before the family name. Only the alternatives
# without a bare particle accept trailing suffices and titles.
display_order : u_words word opt_suffices opt_titles
{
result = Name.new(:given => val[0], :family => val[1],
:suffix => val[2], :title => val[3])
}
| u_words NICK last opt_suffices opt_titles
{
result = Name.new(:given => val[0], :nick => val[1],
:family => val[2], :suffix => val[3], :title => val[4])
}
| u_words NICK von last opt_suffices opt_titles
{
result = Name.new(:given => val[0], :nick => val[1],
:particle => val[2], :family => val[3],
:suffix => val[4], :title => val[5])
}
| u_words von last
{
result = Name.new(:given => val[0], :particle => val[1],
:family => val[2])
}
| von last
{
result = Name.new(:particle => val[0], :family => val[1])
}
# sort_order: names written family-name first, followed by a COMMA and the
# `first` part. `first` yields a [suffix, given] pair (see below), which is
# why the actions index into it with [0] and [1]. In the third alternative the
# leading u_words and the von part are joined into a single particle string.
# The second argument passed to Name.new is true when a suffix was parsed.
# NOTE(review): the meaning of that flag is defined by Name.new, which is not
# visible in this file - confirm there.
sort_order : last COMMA first
{
result = Name.new({ :family => val[0], :suffix => val[2][0],
:given => val[2][1] }, !!val[2][0])
}
| von last COMMA first
{
result = Name.new({ :particle => val[0], :family => val[1],
:suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
}
| u_words von last COMMA first
{
result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],
:suffix => val[4][0], :given => val[4][1] }, !!val[4][0])
}
;
# von: one or more LWORD tokens, possibly with u_words in between; the
# multi-token forms are joined with single spaces.
von : LWORD
| von LWORD { result = val.join(' ') }
| von u_words LWORD { result = val.join(' ') }
# last: the family name, either a single LWORD or a run of u_words.
last : LWORD | u_words
# first: the part after the COMMA of a sort-order name. Always yields a
# two-element Array [suffix, given]; either element may be nil.
first : opt_words { result = [nil,val[0]] }
| words opt_comma suffices { result = [val[2],val[0]] }
| suffices { result = [val[0],nil] }
| suffices COMMA words { result = [val[0],val[2]] }
# u_words: one or more UWORD/PWORD tokens joined with single spaces.
u_words : u_word
| u_words u_word { result = val.join(' ') }
u_word : UWORD | PWORD
# words: one or more words of any kind joined with single spaces.
words : word
| words word { result = val.join(' ') }
opt_comma : /* empty */ | COMMA
opt_words : /* empty */ | words
word : LWORD | UWORD | PWORD
# suffices: one or more SUFFIX tokens joined with single spaces.
opt_suffices : /* empty */ | suffices
suffices : SUFFIX
| suffices SUFFIX { result = val.join(' ') }
# titles: one or more TITLE tokens joined with single spaces.
opt_titles : /* empty */ | titles
titles : TITLE
| titles TITLE { result = val.join(' ') }
---- header
require 'singleton'
require 'strscan'
---- inner
# The parser is a singleton: use Namae::Parser.instance instead of .new.
include Singleton
# options: the configuration Hash built in initialize.
# input:   the StringScanner that holds the string currently being parsed.
attr_reader :options, :input
# Sets up an empty scanner and the default parser options.
#
# The option values are read by the accessor methods of the same name and
# drive the tokenizer (next_token).
def initialize
  @input = StringScanner.new('')
  @options = {
    :debug => false,
    :prefer_comma_as_separator => false,
    :comma => ',',
    :stops => ',;',
    :separator => /\s*(\band\b|\&|;)\s*/i,
    :title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
    :suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
    :appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
  }
end
# Whether debug output is enabled: the :debug option wins when it is truthy,
# otherwise the value of the DEBUG environment variable (or nil) is returned.
def debug?
  flag = options[:debug]
  return flag if flag
  ENV['DEBUG']
end
# The :separator option: the pattern next_token scans for to detect the
# boundary between two names.
def separator
options[:separator]
end
# The :comma option. next_token interpolates it into the pattern it uses to
# scan for a comma, so it is treated as regular expression source there.
def comma
options[:comma]
end
# The :stops option. next_token interpolates it into the negated character
# classes of the word patterns, i.e. these characters end a word.
def stops
options[:stops]
end
# The :title option: the pattern next_token scans for to emit TITLE tokens.
def title
options[:title]
end
# The :suffix option: the pattern used to emit SUFFIX tokens and to look
# ahead for an upcoming suffix (will_see_suffix?).
def suffix
options[:suffix]
end
# The :appellation option: the pattern next_token scans for to emit
# APPELLATION tokens.
def appellation
options[:appellation]
end
# The :prefer_comma_as_separator option. seen_full_name? can only be true when
# this is set, which in turn lets next_token treat a first comma as a
# separator between names rather than as a comma inside one name.
def prefer_comma_as_separator?
options[:prefer_comma_as_separator]
end
# Lenient variant of parse!: parses the passed-in string and returns the
# result, but returns an empty Array instead of raising when parsing fails
# with a StandardError. The error message is printed via warn in debug mode.
def parse(input)
  parse!(input)
rescue StandardError => error
  warn(error.message) if debug?
  []
end
# Parses the passed-in string and returns whatever do_parse produces.
# The string is normalized, loaded into the scanner, and the per-parse
# counters are reset before the generated parser runs. Errors propagate.
def parse!(string)
  cleaned = normalize(string)
  input.string = cleaned
  reset
  do_parse
end
# Returns a copy of the string with leading and trailing whitespace removed;
# the argument itself is not modified.
def normalize(string)
  string.strip
end
# Clears the per-parse counters, refreshes @yydebug from debug?, and
# returns self.
def reset
  @yydebug = debug?
  @commas = @words = @initials = @suffices = 0
  self
end
private
# The parser's value stack: @vstack when set, else @racc_vstack, else an
# empty Array.
def stack
  return @vstack if @vstack
  return @racc_vstack if @racc_vstack
  []
end
# The value on top of the parser stack, or nil when the stack is empty.
def last_token
  stack.last
end
# Emits an AND token and clears the per-name counters. When the previous
# token was already a separator, no second AND is produced: the next token
# is scanned and returned instead, and the counters are left untouched.
def consume_separator
  if seen_separator?
    next_token
  else
    @commas = @words = @initials = @suffices = 0
    [:AND, :AND]
  end
end
# Counts one more comma for the current name and emits a COMMA token.
def consume_comma
  @commas = @commas.succ
  [:COMMA, :COMMA]
end
# Counts a word token and returns it as a [type, word] pair.
#
# Every call bumps @words. A UWORD that starts with a run of uppercase
# letters ending at a word boundary (e.g. "J." or "JR") also counts as an
# initial; a SUFFIX also bumps @suffices.
def consume_word(type, word)
  @words += 1
  if type == :UWORD
    @initials += 1 if word =~ /^[[:upper:]]+\b/
  elsif type == :SUFFIX
    @suffices += 1
  end
  [type, word]
end
# True when the parser stack is non-empty and its top value is :AND.
def seen_separator?
  return false if stack.empty?
  last_token == :AND
end
# Truthy when a suffix has already been consumed for the current name or
# when the upcoming input starts with one.
def suffix?
  return true unless @suffices.zero?
  will_see_suffix?
end
# Looks ahead (without consuming) at the next 8 characters of input and
# matches their first whitespace-delimited word against the suffix pattern.
# Returns the match position, or nil when there is no match.
def will_see_suffix?
  upcoming = input.peek(8).to_s.strip
  first_word = upcoming.split(/\s+/)[0]
  first_word =~ suffix
end
# Looks ahead (without consuming) at the next 6 characters of input and
# checks whether their first whitespace-delimited word begins with a run of
# uppercase letters that ends at a word boundary. Returns the match position,
# or nil when there is no match.
def will_see_initial?
  upcoming = input.peek(6).to_s.strip
  first_word = upcoming.split(/\s+/)[0]
  first_word =~ /^[[:upper:]]+\b/
end
# Heuristic used when a comma is scanned: truthy when commas are preferred
# as name separators, more than one word has been consumed, either an
# initial was consumed or none is coming up, and no suffix is coming up.
def seen_full_name?
  preferred = prefer_comma_as_separator?
  # Hand back the falsy option value itself (nil or false) unchanged.
  return preferred unless preferred
  return false unless @words > 1
  return false unless @initials > 0 || !will_see_initial?
  !will_see_suffix?
end
# Tokenizer for the generated parser: scans the next token from input.
#
# Returns a two-element Array of token type and value, or nil when there is no
# scanner or the input is exhausted. Raises ArgumentError when nothing matches
# at the current position. The branches are tried in order, so earlier
# patterns take precedence over later ones.
def next_token
case
when input.nil?, input.eos?
nil
# An explicit separator between names.
when input.scan(separator)
consume_separator
# A comma, with surrounding whitespace. It stays part of the current name
# (COMMA) when it is the first comma and no full name has been seen yet, or
# when it is the second comma and a suffix was seen or is coming up;
# otherwise it is treated as a separator between names.
when input.scan(/\s*#{comma}\s*/)
if @commas.zero? && !seen_full_name? || @commas == 1 && suffix?
consume_comma
else
consume_separator
end
# Plain whitespace is skipped.
when input.scan(/\s+/)
next_token
when input.scan(title)
consume_word(:TITLE, input.matched.strip)
when input.scan(suffix)
consume_word(:SUFFIX, input.matched.strip)
# Appellations are returned directly and therefore not counted as words.
when input.scan(appellation)
[:APPELLATION, input.matched.strip]
# Optional brace groups (each optionally prefixed by a backslash command),
# then an uppercase letter, then everything up to whitespace or a stop.
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
consume_word(:UWORD, input.matched)
# Same shape as above, but the first letter outside the braces is lowercase.
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
consume_word(:LWORD, input.matched)
# A word that begins with a brace group and matched neither pattern above.
when input.scan(/(\\\w+)?\{[^\}]*\}[^\s#{stops}]*/)
consume_word(:PWORD, input.matched)
# A nickname in single or double quotes; the quotes are stripped.
when input.scan(/('[^'\n]+')|("[^"\n]+")/)
consume_word(:NICK, input.matched[1...-1])
else
raise ArgumentError,
"Failed to parse name #{input.string.inspect}: unmatched data at offset #{input.pos}"
end
end
# Error hook for the generated parser: always raises ArgumentError, naming
# the unexpected token value and the contents of the value stack.
# The token id (tid) is not used.
def on_error(tid, value, stack)
  message = "Failed to parse name: unexpected '#{value}' at #{stack.inspect}"
  raise ArgumentError, message
end
# -*- racc -*-