[Bug #19728] Auto-generate unicode property docs

https://bugs.ruby-lang.org/issues/19728
2023-07-01 16:22:17 +02:00 · 2023-07-01 16:22:17 +02:00 · 08b3fb1152
--- a/common.mk
+++ b/common.mk
@ -430,7 +430,7 @@ ruby.imp: $(COMMONOBJS)
 	sort -u -o $@

 install: install-$(INSTALLDOC)
-docs: $(DOCTARGETS)
+docs: srcs-doc $(DOCTARGETS)
 pkgconfig-data: $(ruby_pc)
 $(ruby_pc): $(srcdir)/template/ruby.pc.in config.status

@ -624,15 +624,15 @@ do-install-dbg: $(PROGRAM) pre-install-dbg
 post-install-dbg::
 	@$(NULLCMD)

-rdoc: PHONY main
+rdoc: PHONY main srcs-doc
 	@echo Generating RDoc documentation
 	$(Q) $(RDOC) --ri --op "$(RDOCOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"

-html: PHONY main
+html: PHONY main srcs-doc
 	@echo Generating RDoc HTML files
 	$(Q) $(RDOC) --op "$(HTMLOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"

-rdoc-coverage: PHONY main
+rdoc-coverage: PHONY main srcs-doc
 	@echo Generating RDoc coverage report
 	$(Q) $(RDOC) --quiet -C $(RDOCFLAGS) "$(srcdir)"

@ -1142,7 +1142,7 @@ common-srcs: $(srcs_vpath)parse.c $(srcs_vpath)lex.c $(srcs_vpath)enc/trans/newl

 missing-srcs: $(srcdir)/missing/des_tables.c

-srcs: common-srcs missing-srcs srcs-enc
+srcs: common-srcs missing-srcs srcs-enc srcs-doc

 RIPPER_SRCS = $(srcdir)/ext/ripper/ripper.c \
 	      $(srcdir)/ext/ripper/ripper_init.c \
@ -1730,6 +1730,12 @@ $(UNICODE_HDR_DIR)/name2ctype.h:
 		$(UNICODE_SRC_DATA_DIR) $(UNICODE_SRC_EMOJI_DATA_DIR) > $@.new
 	$(MV) $@.new $@

+# Generated doc listing the supported Unicode properties.  The template and
+# generic_erb.rb are prerequisites too, so editing either one regenerates it.
+srcs-doc: $(srcdir)/doc/regexp/unicode_properties.rdoc
+$(srcdir)/doc/regexp/unicode_properties.rdoc: $(UNICODE_HDR_DIR)/name2ctype.h $(UNICODE_PROPERTY_FILES) \
+		$(tooldir)/generic_erb.rb $(srcdir)/template/unicode_properties.rdoc.tmpl
+	$(Q) $(BOOTSTRAPRUBY) $(tooldir)/generic_erb.rb -c -o $@ \
+		$(srcdir)/template/unicode_properties.rdoc.tmpl \
+		$(UNICODE_SRC_DATA_DIR) $(UNICODE_HDR_DIR)/name2ctype.h
+
 # the next non-comment line was:
 # $(UNICODE_HDR_DIR)/casefold.h: $(tooldir)/enc-case-folding.rb \
 # but was changed to make sure CI works on systems that don't have gperf
--- a/doc/regexp/unicode_properties.rdoc
+++ b/doc/regexp/unicode_properties.rdoc
--- a/enc/unicode/15.0.0/name2ctype.h
+++ b/enc/unicode/15.0.0/name2ctype.h
@ -5402,7 +5402,7 @@ static const OnigCodePoint CR_ASCII[] = {
 	0x0000, 0x007f,
 }; /* CR_ASCII */

-/* 'Punct' */
+/* 'Punct': [[:Punct:]] */
 static const OnigCodePoint CR_Punct[] = {
 	191,
 	0x0021, 0x0023,
--- a/template/unicode_properties.rdoc.tmpl
+++ b/template/unicode_properties.rdoc.tmpl
@ -0,0 +1,59 @@
+== \Regexps Based on Unicode Properties
+
+The properties shown here are those currently supported in Ruby.
+Older versions may not support all of these.
+<%
+# Generate a documentation file for the Unicode properties.
+#
+# Usage:
+#
+# Get PropertyAliases.txt and PropertyValueAliases.txt from unicode.org
+# (http://unicode.org/Public/UNIDATA/), then run
+# ```
+# ruby tool/generic_erb.rb template/unicode_properties.rdoc.tmpl data_dir name2ctype.h
+# ```
+
+data_dir = ARGV.shift # directory holding PropertyAliases.txt and PropertyValueAliases.txt
+abort("Usage: #{$0} data_directory [name2ctype.h]") unless data_dir && Dir.exist?(data_dir)
+
+# Map group names, given as the last argument to #make_const in enc-unicode.rb,
+# to doc sections; keys are tested with ===, first match wins, order = doc order.
+map = {
+  /\[\[:/            => 'POSIX brackets',
+  '-'                => 'Special',
+  /.+ Category/      => 'Major and General Categories',
+  'Binary Property'  => 'Prop List',
+  /Derived Property/ => 'Derived Core Properties',
+  'Script'           => 'Scripts',
+  'Block'            => 'Blocks',
+  'Emoji'            => 'Emoji',
+  /Grapheme/         => 'Graphemes',
+  /Derived Age/      => 'Derived Ages',
+}
+
+# Alias table { short => long }, e.g. { 'Hex' => 'Hex_Digit', 'L' => 'Letter' },
+# built from the property aliases plus the gc/sc property value aliases.
+prop_aliases  = File.binread(File.join(data_dir, 'PropertyAliases.txt')).scan(/^(\w+)\s*; (\w+)/)
+value_aliases = File.binread(File.join(data_dir, 'PropertyValueAliases.txt')).scan(/^(?:gc|sc)\s*; (\w+)\s*; (\w+)/)
+aliases = (prop_aliases + value_aliases).to_h
+
+props_by_section = {}
+ARGF.each_line do |line|
+  # pick up the "/* 'Prop': group */" comments emitted by make_const in enc-unicode.rb
+  found = /'(?<prop>[^']+)': (?<name>.+) \*/.match(line) or next
+  prop, name = found[:prop], found[:name]
+  next if prop == 'NEWLINE' # custom internal prop, not documented
+  _, section = map.find { |pattern, _| pattern === name }
+  warn("no doc section for #{name}") unless section
+  # the header file mixes short and long names; always record the long one
+  (props_by_section[section] ||= []) << (aliases[prop] || prop)
+end
+
+# One "=== Section" heading per mapped section, in map order, each followed by
+# its sorted properties. Sections with no properties in the header are skipped
+# instead of failing on nil.sort.
+map.values.select { |s| props_by_section.key?(s) }.each do |section| -%>
+
+=== <%=section%>
+
+%   # each bullet shows the long name, then its short alias when one exists
+%   props_by_section[section].sort.each do |prop|
+- <%= [prop, aliases.key(prop)].compact.uniq.map { |v| "<tt>\\p{#{v}}</tt>" }.join(', ') %>
+%   end
+% end
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@ -269,23 +269,12 @@ def parse_block(data)
  blocks << constname
 end

-# shim for Ruby 1.8
-unless {}.respond_to?(:key)
-  class Hash
-    alias key index
-  end
-end
-
 $const_cache = {}
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
 def make_const(prop, data, name)
-  if name.empty?
-    puts "\n/* '#{prop}' */"
-  else
-    puts "\n/* '#{prop}': #{name} */"
-  end
+  puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation
  if origprop = $const_cache.key(data)
    puts "#define CR_#{prop} CR_#{origprop}"
  else
@ -437,8 +426,6 @@ define_posix_props(data)
 POSIX_NAMES.each do |name|
  if name == 'XPosixPunct'
    make_const(name, data[name], "[[:Punct:]]")
-  elsif name == 'Punct'
-    make_const(name, data[name], "")
  else
    make_const(name, data[name], "[[:#{name}:]]")
  end