[Bug #19728] Auto-generate unicode property docs

https://bugs.ruby-lang.org/issues/19728
This commit is contained in:
Janosch Müller 2023-07-01 16:22:17 +02:00 коммит произвёл GitHub
Родитель 3fd1968d6f
Коммит 08b3fb1152
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 728 добавлений и 861 удалений

Просмотреть файл

@ -430,7 +430,7 @@ ruby.imp: $(COMMONOBJS)
sort -u -o $@
install: install-$(INSTALLDOC)
docs: $(DOCTARGETS)
docs: srcs-doc $(DOCTARGETS)
pkgconfig-data: $(ruby_pc)
$(ruby_pc): $(srcdir)/template/ruby.pc.in config.status
@ -624,15 +624,15 @@ do-install-dbg: $(PROGRAM) pre-install-dbg
post-install-dbg::
@$(NULLCMD)
rdoc: PHONY main
rdoc: PHONY main srcs-doc
@echo Generating RDoc documentation
$(Q) $(RDOC) --ri --op "$(RDOCOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"
html: PHONY main
html: PHONY main srcs-doc
@echo Generating RDoc HTML files
$(Q) $(RDOC) --op "$(HTMLOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"
rdoc-coverage: PHONY main
rdoc-coverage: PHONY main srcs-doc
@echo Generating RDoc coverage report
$(Q) $(RDOC) --quiet -C $(RDOCFLAGS) "$(srcdir)"
@ -1142,7 +1142,7 @@ common-srcs: $(srcs_vpath)parse.c $(srcs_vpath)lex.c $(srcs_vpath)enc/trans/newl
missing-srcs: $(srcdir)/missing/des_tables.c
srcs: common-srcs missing-srcs srcs-enc
srcs: common-srcs missing-srcs srcs-enc srcs-doc
RIPPER_SRCS = $(srcdir)/ext/ripper/ripper.c \
$(srcdir)/ext/ripper/ripper_init.c \
@ -1730,6 +1730,12 @@ $(UNICODE_HDR_DIR)/name2ctype.h:
$(UNICODE_SRC_DATA_DIR) $(UNICODE_SRC_EMOJI_DATA_DIR) > $@.new
$(MV) $@.new $@
# Generated documentation sources. The Unicode property reference is rendered
# from an ERB template using the Unicode data directory and name2ctype.h.
srcs-doc: $(srcdir)/doc/regexp/unicode_properties.rdoc
# The generator script and the template are listed as prerequisites so that
# editing either of them regenerates the document, not only a change to the
# Unicode data or the header.
$(srcdir)/doc/regexp/unicode_properties.rdoc: $(UNICODE_HDR_DIR)/name2ctype.h $(UNICODE_PROPERTY_FILES) \
		$(tooldir)/generic_erb.rb $(srcdir)/template/unicode_properties.rdoc.tmpl
	$(Q) $(BOOTSTRAPRUBY) $(tooldir)/generic_erb.rb -c -o $@ \
		$(srcdir)/template/unicode_properties.rdoc.tmpl \
		$(UNICODE_SRC_DATA_DIR) $(UNICODE_HDR_DIR)/name2ctype.h
# the next non-comment line was:
# $(UNICODE_HDR_DIR)/casefold.h: $(tooldir)/enc-case-folding.rb \
# but was changed to make sure CI works on systems that don't have gperf

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -5402,7 +5402,7 @@ static const OnigCodePoint CR_ASCII[] = {
0x0000, 0x007f,
}; /* CR_ASCII */
/* 'Punct' */
/* 'Punct': [[:Punct:]] */
static const OnigCodePoint CR_Punct[] = {
191,
0x0021, 0x0023,

Просмотреть файл

@ -0,0 +1,59 @@
== \Regexps Based on Unicode Properties
The properties shown here are those currently supported in Ruby.
Older versions may not support all of these.
<%
# Generate a documentation file for the unicode properties.
#
# Usage:
#
# Get PropertyAliases.txt, PropertyValueAliases.txt from unicode.org
# (http://unicode.org/Public/UNIDATA/) and run
# ```
# ruby tool/generic_erb.rb template/unicode_properties.rdoc.tmpl data_dir name2ctype.h
# ```

# The first argument is the directory holding the Unicode data files; it is
# removed from ARGV so that the remaining arguments are left for ARGF.
# Abort with the usage message when it is missing or is not an existing
# directory, instead of failing later while reading the alias files.
data_dir = ARGV.shift
unless data_dir && Dir.exist?(data_dir)
  abort("Usage: #{$0} data_directory [name2ctype.h]")
end
# Map group names, given as last argument to #make_const in enc-unicode.rb,
# to sections in the doc. The order in this hash controls the order in the doc.
#
# Keys are compared against a group name with `===`, so a Regexp key matches
# by pattern while a String key must equal the group name exactly. The first
# matching entry wins, which makes the order significant for lookup as well.
map = {
/\[\[:/ => 'POSIX brackets',
'-' => 'Special',
/.+ Category/ => 'Major and General Categories',
'Binary Property' => 'Prop List',
/Derived Property/ => 'Derived Core Properties',
'Script' => 'Scripts',
'Block' => 'Blocks',
'Emoji' => 'Emoji',
/Grapheme/ => 'Graphemes',
/Derived Age/ => 'Derived Ages',
}
# Alias table in the form { short => long },
# e.g. { 'Hex' => 'Hex_Digit', 'L' => 'Letter' }.
# Built from the property aliases and from the gc/sc value aliases; when a
# short name occurs more than once, the last occurrence wins.
alias_sources = [
  ['PropertyAliases.txt', /^(\w+)\s*; (\w+)/],
  ['PropertyValueAliases.txt', /^(?:gc|sc)\s*; (\w+)\s*; (\w+)/],
]
aliases = alias_sources.each_with_object({}) do |(file_name, pattern), table|
  File.binread(File.join(data_dir, file_name)).scan(pattern) do |short, long|
    table[short] = long
  end
end
# Collect property names per doc section by scanning the remaining input
# (ARGF) for comments of the form `'<prop>': <group name> *`.
props_by_section = {}
ARGF.each_line do |line|
  found = line.match(/'(?<prop>[^']+)': (?<name>.+) \*/)
  next if found.nil?
  # ignore custom internal prop
  next if found[:prop] == 'NEWLINE'

  group = found[:name]
  hit = map.find { |pattern, _title| pattern === group }
  # warn returns nil, so properties of an unmapped group are filed under nil
  section = (hit && hit.last) || warn("no doc section for #{group}")

  # normalize prop names - the header file uses a mix of short and long names
  long_prop_name = aliases.fetch(found[:prop]) { found[:prop] }
  props_by_section[section] ||= []
  props_by_section[section] << long_prop_name
end
# Emit one heading per doc section, in the order of `map`, followed by the
# sorted properties of that section; each entry lists the long name and,
# when one exists, its short alias.
map.each_value do |section| -%>
=== <%=section%>
% # A section for which no property was collected still gets its heading;
% # fetch with a default avoids calling sort on nil in that case.
% props_by_section.fetch(section, []).sort.each do |prop|
- <%= [prop, aliases.key(prop)].compact.uniq.map { |v| "<tt>\\p{#{v}}</tt>" }.join(', ') %>
% end
% end

Просмотреть файл

@ -269,23 +269,12 @@ def parse_block(data)
blocks << constname
end
# shim for Ruby 1.8
unless {}.respond_to?(:key)
class Hash
alias key index
end
end
$const_cache = {}
# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
def make_const(prop, data, name)
if name.empty?
puts "\n/* '#{prop}' */"
else
puts "\n/* '#{prop}': #{name} */"
end
puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation
if origprop = $const_cache.key(data)
puts "#define CR_#{prop} CR_#{origprop}"
else
@ -437,8 +426,6 @@ define_posix_props(data)
POSIX_NAMES.each do |name|
if name == 'XPosixPunct'
make_const(name, data[name], "[[:Punct:]]")
elsif name == 'Punct'
make_const(name, data[name], "")
else
make_const(name, data[name], "[[:#{name}:]]")
end