From 08b3fb11524e6cde453476f24ac80fd60457dfef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Janosch=20M=C3=BCller?=
Date: Sat, 1 Jul 2023 16:22:17 +0200
Subject: [PATCH] [Bug #19728] Auto-generate unicode property docs

https://bugs.ruby-lang.org/issues/19728
---
 common.mk                             |   18 +-
 doc/regexp/unicode_properties.rdoc    | 1497 +++++++++++--------------
 enc/unicode/15.0.0/name2ctype.h       |    2 +-
 template/unicode_properties.rdoc.tmpl |   59 +
 tool/enc-unicode.rb                   |   15 +-
 5 files changed, 730 insertions(+), 861 deletions(-)
 create mode 100755 template/unicode_properties.rdoc.tmpl

diff --git a/common.mk b/common.mk
index ddceb96fbf..4f153059f4 100644
--- a/common.mk
+++ b/common.mk
@@ -430,7 +430,7 @@ ruby.imp: $(COMMONOBJS)
 	sort -u -o $@
 
 install: install-$(INSTALLDOC)
-docs: $(DOCTARGETS)
+docs: srcs-doc $(DOCTARGETS)
 pkgconfig-data: $(ruby_pc)
 $(ruby_pc): $(srcdir)/template/ruby.pc.in config.status
 
@@ -624,15 +624,15 @@ do-install-dbg: $(PROGRAM) pre-install-dbg
 post-install-dbg::
 	@$(NULLCMD)
 
-rdoc: PHONY main
+rdoc: PHONY main srcs-doc
 	@echo Generating RDoc documentation
 	$(Q) $(RDOC) --ri --op "$(RDOCOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"
 
-html: PHONY main
+html: PHONY main srcs-doc
 	@echo Generating RDoc HTML files
 	$(Q) $(RDOC) --op "$(HTMLOUT)" $(RDOC_GEN_OPTS) $(RDOCFLAGS) "$(srcdir)"
 
-rdoc-coverage: PHONY main
+rdoc-coverage: PHONY main srcs-doc
 	@echo Generating RDoc coverage report
 	$(Q) $(RDOC) --quiet -C $(RDOCFLAGS) "$(srcdir)"
 
@@ -1142,7 +1142,7 @@ common-srcs: $(srcs_vpath)parse.c $(srcs_vpath)lex.c $(srcs_vpath)enc/trans/newl
 
 missing-srcs: $(srcdir)/missing/des_tables.c
 
-srcs: common-srcs missing-srcs srcs-enc
+srcs: common-srcs missing-srcs srcs-enc srcs-doc
 
 RIPPER_SRCS = $(srcdir)/ext/ripper/ripper.c \
 	$(srcdir)/ext/ripper/ripper_init.c \
@@ -1730,6 +1730,14 @@ $(UNICODE_HDR_DIR)/name2ctype.h:
 		$(UNICODE_SRC_DATA_DIR) $(UNICODE_SRC_EMOJI_DATA_DIR) > $@.new
 	$(MV) $@.new $@
 
+# Render the Unicode property reference from its ERB template.  The template
+# is a prerequisite so that editing it regenerates the checked-in document.
+srcs-doc: $(srcdir)/doc/regexp/unicode_properties.rdoc
+$(srcdir)/doc/regexp/unicode_properties.rdoc: $(UNICODE_HDR_DIR)/name2ctype.h $(UNICODE_PROPERTY_FILES) $(srcdir)/template/unicode_properties.rdoc.tmpl
+	$(Q) $(BOOTSTRAPRUBY) $(tooldir)/generic_erb.rb -c -o $@ \
+		$(srcdir)/template/unicode_properties.rdoc.tmpl \
+		$(UNICODE_SRC_DATA_DIR) $(UNICODE_HDR_DIR)/name2ctype.h
+
 # the next non-comment line was:
 # $(UNICODE_HDR_DIR)/casefold.h: $(tooldir)/enc-case-folding.rb \
 # but was changed to make sure CI works on systems that don't have gperf
diff --git a/doc/regexp/unicode_properties.rdoc b/doc/regexp/unicode_properties.rdoc
index 354ed3a83c..a1d7ecc380 100644
--- a/doc/regexp/unicode_properties.rdoc
+++ b/doc/regexp/unicode_properties.rdoc
@@ -1,863 +1,678 @@
 == \Regexps Based on Unicode Properties
 
 The properties shown here are those currently supported in Ruby.
-Older versions may not support all of these;
-newer versions may support additional properties.
+Older versions may not support all of these.
 
 === POSIX brackets
 
-- /\p{Alpha}/
-- /\p{Blank}/
-- /\p{Cntrl}/
-- /\p{Digit}/
-- /\p{Graph}/
-- /\p{Lower}/
-- /\p{Print}/
-- /\p{Punct}/
-- /\p{Space}/
-- /\p{Upper}/
-- /\p{XDigit}/
-- /\p{Word}/
-- /\p{Alnum}/
-- /\p{ASCII}/
-- /\p{XPosixPunct}/
+- \p{ASCII}
+- \p{Alnum}
+- \p{Alphabetic}, \p{Alpha}
+- \p{Blank}
+- \p{Cntrl}
+- \p{Digit}
+- \p{Graph}
+- \p{Lowercase}, \p{Lower}
+- \p{Print}
+- \p{Punct}
+- \p{Space}
+- \p{Uppercase}, \p{Upper}
+- \p{Word}
+- \p{XDigit}
+- \p{XPosixPunct}
 
 === Special
 
-- /\p{Any}/
-- /\p{Assigned}/
+- \p{Any}
+- \p{Assigned}
 
 === Major and General Categories
 
-- /\p{C}/
-- /\p{Cc}/
-- /\p{Cf}/
-- /\p{Cn}/
-- /\p{Co}/
-- /\p{Cs}/
-- /\p{L}/
-- /\p{LC}/
-- /\p{Ll}/
-- /\p{Lm}/
-- /\p{Lo}/
-- /\p{Lt}/
-- /\p{Lu}/
-- /\p{M}/
-- /\p{Mc}/
-- /\p{Me}/
-- /\p{Mn}/
-- /\p{N}/
-- /\p{Nd}/
-- /\p{Nl}/
-- /\p{No}/
-- /\p{P}/
-- /\p{Pc}/
-- /\p{Pd}/
-- /\p{Pe}/
-- /\p{Pf}/
-- /\p{Pi}/
-- /\p{Po}/
-- /\p{Ps}/
-- /\p{S}/
-- /\p{Sc}/
-- /\p{Sk}/
-- /\p{Sm}/
-- /\p{So}/
-- /\p{Z}/
-- /\p{Zl}/
-- /\p{Zp}/
-- /\p{Zs}/
-
-=== Scripts
-
-- 
/\p{Adlam}/ -- /\p{Ahom}/ -- /\p{Anatolian_Hieroglyphs}/ -- /\p{Arabic}/ -- /\p{Armenian}/ -- /\p{Avestan}/ -- /\p{Balinese}/ -- /\p{Bamum}/ -- /\p{Bassa_Vah}/ -- /\p{Batak}/ -- /\p{Bengali}/ -- /\p{Bhaiksuki}/ -- /\p{Bopomofo}/ -- /\p{Brahmi}/ -- /\p{Braille}/ -- /\p{Buginese}/ -- /\p{Buhid}/ -- /\p{Canadian_Aboriginal}/ -- /\p{Carian}/ -- /\p{Caucasian_Albanian}/ -- /\p{Chakma}/ -- /\p{Cham}/ -- /\p{Cherokee}/ -- /\p{Common}/ -- /\p{Coptic}/ -- /\p{Cuneiform}/ -- /\p{Cypriot}/ -- /\p{Cyrillic}/ -- /\p{Deseret}/ -- /\p{Devanagari}/ -- /\p{Dogra}/ -- /\p{Duployan}/ -- /\p{Egyptian_Hieroglyphs}/ -- /\p{Elbasan}/ -- /\p{Elymaic}/ -- /\p{Ethiopic}/ -- /\p{Georgian}/ -- /\p{Glagolitic}/ -- /\p{Gothic}/ -- /\p{Grantha}/ -- /\p{Greek}/ -- /\p{Gujarati}/ -- /\p{Gunjala_Gondi}/ -- /\p{Gurmukhi}/ -- /\p{Han}/ -- /\p{Hangul}/ -- /\p{Hanifi_Rohingya}/ -- /\p{Hanunoo}/ -- /\p{Hatran}/ -- /\p{Hebrew}/ -- /\p{Hiragana}/ -- /\p{Imperial_Aramaic}/ -- /\p{Inherited}/ -- /\p{Inscriptional_Pahlavi}/ -- /\p{Inscriptional_Parthian}/ -- /\p{Javanese}/ -- /\p{Kaithi}/ -- /\p{Kannada}/ -- /\p{Katakana}/ -- /\p{Kayah_Li}/ -- /\p{Kharoshthi}/ -- /\p{Khmer}/ -- /\p{Khojki}/ -- /\p{Khudawadi}/ -- /\p{Lao}/ -- /\p{Latin}/ -- /\p{Lepcha}/ -- /\p{Limbu}/ -- /\p{Linear_A}/ -- /\p{Linear_B}/ -- /\p{Lisu}/ -- /\p{Lycian}/ -- /\p{Lydian}/ -- /\p{Mahajani}/ -- /\p{Makasar}/ -- /\p{Malayalam}/ -- /\p{Mandaic}/ -- /\p{Manichaean}/ -- /\p{Marchen}/ -- /\p{Masaram_Gondi}/ -- /\p{Medefaidrin}/ -- /\p{Meetei_Mayek}/ -- /\p{Mende_Kikakui}/ -- /\p{Meroitic_Cursive}/ -- /\p{Meroitic_Hieroglyphs}/ -- /\p{Miao}/ -- /\p{Modi}/ -- /\p{Mongolian}/ -- /\p{Mro}/ -- /\p{Multani}/ -- /\p{Myanmar}/ -- /\p{Nabataean}/ -- /\p{Nandinagari}/ -- /\p{New_Tai_Lue}/ -- /\p{Newa}/ -- /\p{Nko}/ -- /\p{Nushu}/ -- /\p{Nyiakeng_Puachue_Hmong}/ -- /\p{Ogham}/ -- /\p{Ol_Chiki}/ -- /\p{Old_Hungarian}/ -- /\p{Old_Italic}/ -- /\p{Old_North_Arabian}/ -- /\p{Old_Permic}/ -- /\p{Old_Persian}/ -- /\p{Old_Sogdian}/ -- /\p{Old_South_Arabian}/ 
-- /\p{Old_Turkic}/ -- /\p{Oriya}/ -- /\p{Osage}/ -- /\p{Osmanya}/ -- /\p{Pahawh_Hmong}/ -- /\p{Palmyrene}/ -- /\p{Pau_Cin_Hau}/ -- /\p{Phags_Pa}/ -- /\p{Phoenician}/ -- /\p{Psalter_Pahlavi}/ -- /\p{Rejang}/ -- /\p{Runic}/ -- /\p{Samaritan}/ -- /\p{Saurashtra}/ -- /\p{Sharada}/ -- /\p{Shavian}/ -- /\p{Siddham}/ -- /\p{SignWriting}/ -- /\p{Sinhala}/ -- /\p{Sogdian}/ -- /\p{Sora_Sompeng}/ -- /\p{Soyombo}/ -- /\p{Sundanese}/ -- /\p{Syloti_Nagri}/ -- /\p{Syriac}/ -- /\p{Tagalog}/ -- /\p{Tagbanwa}/ -- /\p{Tai_Le}/ -- /\p{Tai_Tham}/ -- /\p{Tai_Viet}/ -- /\p{Takri}/ -- /\p{Tamil}/ -- /\p{Tangut}/ -- /\p{Telugu}/ -- /\p{Thaana}/ -- /\p{Thai}/ -- /\p{Tibetan}/ -- /\p{Tifinagh}/ -- /\p{Tirhuta}/ -- /\p{Ugaritic}/ -- /\p{Unknown}/ -- /\p{Vai}/ -- /\p{Wancho}/ -- /\p{Warang_Citi}/ -- /\p{Yi}/ -- /\p{Zanabazar_Square}/ - -=== Derived Core Properties - -- /\p{Alphabetic}/ -- /\p{Case_Ignorable}/ -- /\p{Cased}/ -- /\p{Changes_When_Casefolded}/ -- /\p{Changes_When_Casemapped}/ -- /\p{Changes_When_Lowercased}/ -- /\p{Changes_When_Titlecased}/ -- /\p{Changes_When_Uppercased}/ -- /\p{Default_Ignorable_Code_Point}/ -- /\p{Grapheme_Base}/ -- /\p{Grapheme_Extend}/ -- /\p{Grapheme_Link}/ -- /\p{ID_Continue}/ -- /\p{ID_Start}/ -- /\p{Lowercase}/ -- /\p{Math}/ -- /\p{Uppercase}/ -- /\p{XID_Continue}/ -- /\p{XID_Start}/ +- \p{Cased_Letter}, \p{LC} +- \p{Close_Punctuation}, \p{Pe} +- \p{Connector_Punctuation}, \p{Pc} +- \p{Control}, \p{Cc} +- \p{Currency_Symbol}, \p{Sc} +- \p{Dash_Punctuation}, \p{Pd} +- \p{Decimal_Number}, \p{Nd} +- \p{Enclosing_Mark}, \p{Me} +- \p{Final_Punctuation}, \p{Pf} +- \p{Format}, \p{Cf} +- \p{Initial_Punctuation}, \p{Pi} +- \p{Letter}, \p{L} +- \p{Letter_Number}, \p{Nl} +- \p{Line_Separator}, \p{Zl} +- \p{Lowercase_Letter}, \p{Ll} +- \p{Mark}, \p{M} +- \p{Math_Symbol}, \p{Sm} +- \p{Modifier_Letter}, \p{Lm} +- \p{Modifier_Symbol}, \p{Sk} +- \p{Nonspacing_Mark}, \p{Mn} +- \p{Number}, \p{N} +- \p{Open_Punctuation}, \p{Ps} +- \p{Other}, \p{C} +- \p{Other_Letter}, 
\p{Lo} +- \p{Other_Number}, \p{No} +- \p{Other_Punctuation}, \p{Po} +- \p{Other_Symbol}, \p{So} +- \p{Paragraph_Separator}, \p{Zp} +- \p{Private_Use}, \p{Co} +- \p{Punctuation}, \p{P} +- \p{Separator}, \p{Z} +- \p{Space_Separator}, \p{Zs} +- \p{Spacing_Mark}, \p{Mc} +- \p{Surrogate}, \p{Cs} +- \p{Symbol}, \p{S} +- \p{Titlecase_Letter}, \p{Lt} +- \p{Unassigned}, \p{Cn} +- \p{Uppercase_Letter}, \p{Lu} === Prop List -- /\p{ASCII_Hex_Digit}/ -- /\p{Bidi_Control}/ -- /\p{Dash}/ -- /\p{Deprecated}/ -- /\p{Diacritic}/ -- /\p{Extender}/ -- /\p{Hex_Digit}/ -- /\p{Hyphen}/ -- /\p{IDS_Binary_Operator}/ -- /\p{IDS_Trinary_Operator}/ -- /\p{Ideographic}/ -- /\p{Join_Control}/ -- /\p{Logical_Order_Exception}/ -- /\p{Noncharacter_Code_Point}/ -- /\p{Other_Alphabetic}/ -- /\p{Other_Default_Ignorable_Code_Point}/ -- /\p{Other_Grapheme_Extend}/ -- /\p{Other_ID_Continue}/ -- /\p{Other_ID_Start}/ -- /\p{Other_Lowercase}/ -- /\p{Other_Math}/ -- /\p{Other_Uppercase}/ -- /\p{Pattern_Syntax}/ -- /\p{Pattern_White_Space}/ -- /\p{Prepended_Concatenation_Mark}/ -- /\p{Quotation_Mark}/ -- /\p{Radical}/ -- /\p{Regional_Indicator}/ -- /\p{Sentence_Terminal}/ -- /\p{Soft_Dotted}/ -- /\p{Terminal_Punctuation}/ -- /\p{Unified_Ideograph}/ -- /\p{Variation_Selector}/ -- /\p{White_Space}/ +- \p{ASCII_Hex_Digit}, \p{AHex} +- \p{Bidi_Control}, \p{Bidi_C} +- \p{Dash} +- \p{Deprecated}, \p{Dep} +- \p{Diacritic}, \p{Dia} +- \p{Extender}, \p{Ext} +- \p{Hex_Digit}, \p{Hex} +- \p{Hyphen} +- \p{IDS_Binary_Operator}, \p{IDSB} +- \p{IDS_Trinary_Operator}, \p{IDST} +- \p{Ideographic}, \p{Ideo} +- \p{Join_Control}, \p{Join_C} +- \p{Logical_Order_Exception}, \p{LOE} +- \p{Noncharacter_Code_Point}, \p{NChar} +- \p{Other_Alphabetic}, \p{OAlpha} +- \p{Other_Default_Ignorable_Code_Point}, \p{ODI} +- \p{Other_Grapheme_Extend}, \p{OGr_Ext} +- \p{Other_ID_Continue}, \p{OIDC} +- \p{Other_ID_Start}, \p{OIDS} +- \p{Other_Lowercase}, \p{OLower} +- \p{Other_Math}, \p{OMath} +- \p{Other_Uppercase}, \p{OUpper} +- 
\p{Pattern_Syntax}, \p{Pat_Syn} +- \p{Pattern_White_Space}, \p{Pat_WS} +- \p{Prepended_Concatenation_Mark}, \p{PCM} +- \p{Quotation_Mark}, \p{QMark} +- \p{Radical} +- \p{Regional_Indicator}, \p{RI} +- \p{Sentence_Terminal}, \p{STerm} +- \p{Soft_Dotted}, \p{SD} +- \p{Terminal_Punctuation}, \p{Term} +- \p{Unified_Ideograph}, \p{UIdeo} +- \p{Variation_Selector}, \p{VS} +- \p{White_Space}, \p{WSpace} -=== Emoji +=== Derived Core Properties -- /\p{Emoji}/ -- /\p{Emoji_Component}/ -- /\p{Emoji_Modifier}/ -- /\p{Emoji_Modifier_Base}/ -- /\p{Emoji_Presentation}/ +- \p{Alphabetic}, \p{Alpha} +- \p{Case_Ignorable}, \p{CI} +- \p{Cased} +- \p{Changes_When_Casefolded}, \p{CWCF} +- \p{Changes_When_Casemapped}, \p{CWCM} +- \p{Changes_When_Lowercased}, \p{CWL} +- \p{Changes_When_Titlecased}, \p{CWT} +- \p{Changes_When_Uppercased}, \p{CWU} +- \p{Default_Ignorable_Code_Point}, \p{DI} +- \p{Grapheme_Base}, \p{Gr_Base} +- \p{Grapheme_Extend}, \p{Gr_Ext} +- \p{Grapheme_Link}, \p{Gr_Link} +- \p{ID_Continue}, \p{IDC} +- \p{ID_Start}, \p{IDS} +- \p{Lowercase}, \p{Lower} +- \p{Math} +- \p{Uppercase}, \p{Upper} +- \p{XID_Continue}, \p{XIDC} +- \p{XID_Start}, \p{XIDS} -=== Property Aliases +=== Scripts -- /\p{AHex}/ -- /\p{Bidi_C}/ -- /\p{CI}/ -- /\p{CWCF}/ -- /\p{CWCM}/ -- /\p{CWL}/ -- /\p{CWT}/ -- /\p{CWU}/ -- /\p{DI}/ -- /\p{Dep}/ -- /\p{Dia}/ -- /\p{Ext}/ -- /\p{Gr_Base}/ -- /\p{Gr_Ext}/ -- /\p{Gr_Link}/ -- /\p{Hex}/ -- /\p{IDC}/ -- /\p{IDS}/ -- /\p{IDSB}/ -- /\p{IDST}/ -- /\p{Ideo}/ -- /\p{Join_C}/ -- /\p{LOE}/ -- /\p{NChar}/ -- /\p{OAlpha}/ -- /\p{ODI}/ -- /\p{OGr_Ext}/ -- /\p{OIDC}/ -- /\p{OIDS}/ -- /\p{OLower}/ -- /\p{OMath}/ -- /\p{OUpper}/ -- /\p{PCM}/ -- /\p{Pat_Syn}/ -- /\p{Pat_WS}/ -- /\p{QMark}/ -- /\p{RI}/ -- /\p{SD}/ -- /\p{STerm}/ -- /\p{Term}/ -- /\p{UIdeo}/ -- /\p{VS}/ -- /\p{WSpace}/ -- /\p{XIDC}/ -- /\p{XIDS}/ - -=== Property Value Aliases (General Category) - -- /\p{Other}/ -- /\p{Control}/ -- /\p{Format}/ -- /\p{Unassigned}/ -- /\p{Private_Use}/ -- /\p{Surrogate}/ -- 
/\p{Letter}/ -- /\p{Cased_Letter}/ -- /\p{Lowercase_Letter}/ -- /\p{Modifier_Letter}/ -- /\p{Other_Letter}/ -- /\p{Titlecase_Letter}/ -- /\p{Uppercase_Letter}/ -- /\p{Mark}/ -- /\p{Combining_Mark}/ -- /\p{Spacing_Mark}/ -- /\p{Enclosing_Mark}/ -- /\p{Nonspacing_Mark}/ -- /\p{Number}/ -- /\p{Decimal_Number}/ -- /\p{Letter_Number}/ -- /\p{Other_Number}/ -- /\p{Punctuation}/ -- /\p{Connector_Punctuation}/ -- /\p{Dash_Punctuation}/ -- /\p{Close_Punctuation}/ -- /\p{Final_Punctuation}/ -- /\p{Initial_Punctuation}/ -- /\p{Other_Punctuation}/ -- /\p{Open_Punctuation}/ -- /\p{Symbol}/ -- /\p{Currency_Symbol}/ -- /\p{Modifier_Symbol}/ -- /\p{Math_Symbol}/ -- /\p{Other_Symbol}/ -- /\p{Separator}/ -- /\p{Line_Separator}/ -- /\p{Paragraph_Separator}/ -- /\p{Space_Separator}/ - -=== Property Value Aliases (Script) - -- /\p{Adlm}/ -- /\p{Aghb}/ -- /\p{Arab}/ -- /\p{Armi}/ -- /\p{Armn}/ -- /\p{Avst}/ -- /\p{Bali}/ -- /\p{Bamu}/ -- /\p{Bass}/ -- /\p{Batk}/ -- /\p{Beng}/ -- /\p{Bhks}/ -- /\p{Bopo}/ -- /\p{Brah}/ -- /\p{Brai}/ -- /\p{Bugi}/ -- /\p{Buhd}/ -- /\p{Cakm}/ -- /\p{Cans}/ -- /\p{Cari}/ -- /\p{Cher}/ -- /\p{Copt}/ -- /\p{Qaac}/ -- /\p{Cprt}/ -- /\p{Cyrl}/ -- /\p{Deva}/ -- /\p{Dogr}/ -- /\p{Dsrt}/ -- /\p{Dupl}/ -- /\p{Egyp}/ -- /\p{Elba}/ -- /\p{Elym}/ -- /\p{Ethi}/ -- /\p{Geor}/ -- /\p{Glag}/ -- /\p{Gong}/ -- /\p{Gonm}/ -- /\p{Goth}/ -- /\p{Gran}/ -- /\p{Grek}/ -- /\p{Gujr}/ -- /\p{Guru}/ -- /\p{Hang}/ -- /\p{Hani}/ -- /\p{Hano}/ -- /\p{Hatr}/ -- /\p{Hebr}/ -- /\p{Hira}/ -- /\p{Hluw}/ -- /\p{Hmng}/ -- /\p{Hmnp}/ -- /\p{Hung}/ -- /\p{Ital}/ -- /\p{Java}/ -- /\p{Kali}/ -- /\p{Kana}/ -- /\p{Khar}/ -- /\p{Khmr}/ -- /\p{Khoj}/ -- /\p{Knda}/ -- /\p{Kthi}/ -- /\p{Lana}/ -- /\p{Laoo}/ -- /\p{Latn}/ -- /\p{Lepc}/ -- /\p{Limb}/ -- /\p{Lina}/ -- /\p{Linb}/ -- /\p{Lyci}/ -- /\p{Lydi}/ -- /\p{Mahj}/ -- /\p{Maka}/ -- /\p{Mand}/ -- /\p{Mani}/ -- /\p{Marc}/ -- /\p{Medf}/ -- /\p{Mend}/ -- /\p{Merc}/ -- /\p{Mero}/ -- /\p{Mlym}/ -- /\p{Mong}/ -- /\p{Mroo}/ -- /\p{Mtei}/ -- /\p{Mult}/ -- 
/\p{Mymr}/ -- /\p{Nand}/ -- /\p{Narb}/ -- /\p{Nbat}/ -- /\p{Nkoo}/ -- /\p{Nshu}/ -- /\p{Ogam}/ -- /\p{Olck}/ -- /\p{Orkh}/ -- /\p{Orya}/ -- /\p{Osge}/ -- /\p{Osma}/ -- /\p{Palm}/ -- /\p{Pauc}/ -- /\p{Perm}/ -- /\p{Phag}/ -- /\p{Phli}/ -- /\p{Phlp}/ -- /\p{Phnx}/ -- /\p{Plrd}/ -- /\p{Prti}/ -- /\p{Rjng}/ -- /\p{Rohg}/ -- /\p{Runr}/ -- /\p{Samr}/ -- /\p{Sarb}/ -- /\p{Saur}/ -- /\p{Sgnw}/ -- /\p{Shaw}/ -- /\p{Shrd}/ -- /\p{Sidd}/ -- /\p{Sind}/ -- /\p{Sinh}/ -- /\p{Sogd}/ -- /\p{Sogo}/ -- /\p{Sora}/ -- /\p{Soyo}/ -- /\p{Sund}/ -- /\p{Sylo}/ -- /\p{Syrc}/ -- /\p{Tagb}/ -- /\p{Takr}/ -- /\p{Tale}/ -- /\p{Talu}/ -- /\p{Taml}/ -- /\p{Tang}/ -- /\p{Tavt}/ -- /\p{Telu}/ -- /\p{Tfng}/ -- /\p{Tglg}/ -- /\p{Thaa}/ -- /\p{Tibt}/ -- /\p{Tirh}/ -- /\p{Ugar}/ -- /\p{Vaii}/ -- /\p{Wara}/ -- /\p{Wcho}/ -- /\p{Xpeo}/ -- /\p{Xsux}/ -- /\p{Yiii}/ -- /\p{Zanb}/ -- /\p{Zinh}/ -- /\p{Qaai}/ -- /\p{Zyyy}/ -- /\p{Zzzz}/ - -=== Derived Ages - -- /\p{Age=1.1}/ -- /\p{Age=10.0}/ -- /\p{Age=11.0}/ -- /\p{Age=12.0}/ -- /\p{Age=12.1}/ -- /\p{Age=2.0}/ -- /\p{Age=2.1}/ -- /\p{Age=3.0}/ -- /\p{Age=3.1}/ -- /\p{Age=3.2}/ -- /\p{Age=4.0}/ -- /\p{Age=4.1}/ -- /\p{Age=5.0}/ -- /\p{Age=5.1}/ -- /\p{Age=5.2}/ -- /\p{Age=6.0}/ -- /\p{Age=6.1}/ -- /\p{Age=6.2}/ -- /\p{Age=6.3}/ -- /\p{Age=7.0}/ -- /\p{Age=8.0}/ -- /\p{Age=9.0}/ +- \p{Adlam}, \p{Adlm} +- \p{Ahom} +- \p{Anatolian_Hieroglyphs}, \p{Hluw} +- \p{Arabic}, \p{Arab} +- \p{Armenian}, \p{Armn} +- \p{Avestan}, \p{Avst} +- \p{Balinese}, \p{Bali} +- \p{Bamum}, \p{Bamu} +- \p{Bassa_Vah}, \p{Bass} +- \p{Batak}, \p{Batk} +- \p{Bengali}, \p{Beng} +- \p{Bhaiksuki}, \p{Bhks} +- \p{Bopomofo}, \p{Bopo} +- \p{Brahmi}, \p{Brah} +- \p{Braille}, \p{Brai} +- \p{Buginese}, \p{Bugi} +- \p{Buhid}, \p{Buhd} +- \p{Canadian_Aboriginal}, \p{Cans} +- \p{Carian}, \p{Cari} +- \p{Caucasian_Albanian}, \p{Aghb} +- \p{Chakma}, \p{Cakm} +- \p{Cham} +- \p{Cherokee}, \p{Cher} +- \p{Chorasmian}, \p{Chrs} +- \p{Common}, \p{Zyyy} +- \p{Coptic}, \p{Copt} +- \p{Cuneiform}, \p{Xsux} +- 
\p{Cypriot}, \p{Cprt} +- \p{Cypro_Minoan}, \p{Cpmn} +- \p{Cyrillic}, \p{Cyrl} +- \p{Deseret}, \p{Dsrt} +- \p{Devanagari}, \p{Deva} +- \p{Dives_Akuru}, \p{Diak} +- \p{Dogra}, \p{Dogr} +- \p{Duployan}, \p{Dupl} +- \p{Egyptian_Hieroglyphs}, \p{Egyp} +- \p{Elbasan}, \p{Elba} +- \p{Elymaic}, \p{Elym} +- \p{Ethiopic}, \p{Ethi} +- \p{Georgian}, \p{Geor} +- \p{Glagolitic}, \p{Glag} +- \p{Gothic}, \p{Goth} +- \p{Grantha}, \p{Gran} +- \p{Greek}, \p{Grek} +- \p{Gujarati}, \p{Gujr} +- \p{Gunjala_Gondi}, \p{Gong} +- \p{Gurmukhi}, \p{Guru} +- \p{Han}, \p{Hani} +- \p{Hangul}, \p{Hang} +- \p{Hanifi_Rohingya}, \p{Rohg} +- \p{Hanunoo}, \p{Hano} +- \p{Hatran}, \p{Hatr} +- \p{Hebrew}, \p{Hebr} +- \p{Hiragana}, \p{Hira} +- \p{Imperial_Aramaic}, \p{Armi} +- \p{Inherited}, \p{Zinh} +- \p{Inscriptional_Pahlavi}, \p{Phli} +- \p{Inscriptional_Parthian}, \p{Prti} +- \p{Javanese}, \p{Java} +- \p{Kaithi}, \p{Kthi} +- \p{Kannada}, \p{Knda} +- \p{Katakana}, \p{Kana} +- \p{Kawi} +- \p{Kayah_Li}, \p{Kali} +- \p{Kharoshthi}, \p{Khar} +- \p{Khitan_Small_Script}, \p{Kits} +- \p{Khmer}, \p{Khmr} +- \p{Khojki}, \p{Khoj} +- \p{Khudawadi}, \p{Sind} +- \p{Lao}, \p{Laoo} +- \p{Latin}, \p{Latn} +- \p{Lepcha}, \p{Lepc} +- \p{Limbu}, \p{Limb} +- \p{Linear_A}, \p{Lina} +- \p{Linear_B}, \p{Linb} +- \p{Lisu} +- \p{Lycian}, \p{Lyci} +- \p{Lydian}, \p{Lydi} +- \p{Mahajani}, \p{Mahj} +- \p{Makasar}, \p{Maka} +- \p{Malayalam}, \p{Mlym} +- \p{Mandaic}, \p{Mand} +- \p{Manichaean}, \p{Mani} +- \p{Marchen}, \p{Marc} +- \p{Masaram_Gondi}, \p{Gonm} +- \p{Medefaidrin}, \p{Medf} +- \p{Meetei_Mayek}, \p{Mtei} +- \p{Mende_Kikakui}, \p{Mend} +- \p{Meroitic_Cursive}, \p{Merc} +- \p{Meroitic_Hieroglyphs}, \p{Mero} +- \p{Miao}, \p{Plrd} +- \p{Modi} +- \p{Mongolian}, \p{Mong} +- \p{Mro}, \p{Mroo} +- \p{Multani}, \p{Mult} +- \p{Myanmar}, \p{Mymr} +- \p{Nabataean}, \p{Nbat} +- \p{Nag_Mundari}, \p{Nagm} +- \p{Nandinagari}, \p{Nand} +- \p{New_Tai_Lue}, \p{Talu} +- \p{Newa} +- \p{Nko}, \p{Nkoo} +- \p{Nushu}, \p{Nshu} +- 
\p{Nyiakeng_Puachue_Hmong}, \p{Hmnp} +- \p{Ogham}, \p{Ogam} +- \p{Ol_Chiki}, \p{Olck} +- \p{Old_Hungarian}, \p{Hung} +- \p{Old_Italic}, \p{Ital} +- \p{Old_North_Arabian}, \p{Narb} +- \p{Old_Permic}, \p{Perm} +- \p{Old_Persian}, \p{Xpeo} +- \p{Old_Sogdian}, \p{Sogo} +- \p{Old_South_Arabian}, \p{Sarb} +- \p{Old_Turkic}, \p{Orkh} +- \p{Old_Uyghur}, \p{Ougr} +- \p{Oriya}, \p{Orya} +- \p{Osage}, \p{Osge} +- \p{Osmanya}, \p{Osma} +- \p{Pahawh_Hmong}, \p{Hmng} +- \p{Palmyrene}, \p{Palm} +- \p{Pau_Cin_Hau}, \p{Pauc} +- \p{Phags_Pa}, \p{Phag} +- \p{Phoenician}, \p{Phnx} +- \p{Psalter_Pahlavi}, \p{Phlp} +- \p{Rejang}, \p{Rjng} +- \p{Runic}, \p{Runr} +- \p{Samaritan}, \p{Samr} +- \p{Saurashtra}, \p{Saur} +- \p{Sharada}, \p{Shrd} +- \p{Shavian}, \p{Shaw} +- \p{Siddham}, \p{Sidd} +- \p{SignWriting}, \p{Sgnw} +- \p{Sinhala}, \p{Sinh} +- \p{Sogdian}, \p{Sogd} +- \p{Sora_Sompeng}, \p{Sora} +- \p{Soyombo}, \p{Soyo} +- \p{Sundanese}, \p{Sund} +- \p{Syloti_Nagri}, \p{Sylo} +- \p{Syriac}, \p{Syrc} +- \p{Tagalog}, \p{Tglg} +- \p{Tagbanwa}, \p{Tagb} +- \p{Tai_Le}, \p{Tale} +- \p{Tai_Tham}, \p{Lana} +- \p{Tai_Viet}, \p{Tavt} +- \p{Takri}, \p{Takr} +- \p{Tamil}, \p{Taml} +- \p{Tangsa}, \p{Tnsa} +- \p{Tangut}, \p{Tang} +- \p{Telugu}, \p{Telu} +- \p{Thaana}, \p{Thaa} +- \p{Thai} +- \p{Tibetan}, \p{Tibt} +- \p{Tifinagh}, \p{Tfng} +- \p{Tirhuta}, \p{Tirh} +- \p{Toto} +- \p{Ugaritic}, \p{Ugar} +- \p{Unknown}, \p{Zzzz} +- \p{Vai}, \p{Vaii} +- \p{Vithkuqi}, \p{Vith} +- \p{Wancho}, \p{Wcho} +- \p{Warang_Citi}, \p{Wara} +- \p{Yezidi}, \p{Yezi} +- \p{Yi}, \p{Yiii} +- \p{Zanabazar_Square}, \p{Zanb} === Blocks -- /\p{In_Basic_Latin}/ -- /\p{In_Latin_1_Supplement}/ -- /\p{In_Latin_Extended_A}/ -- /\p{In_Latin_Extended_B}/ -- /\p{In_IPA_Extensions}/ -- /\p{In_Spacing_Modifier_Letters}/ -- /\p{In_Combining_Diacritical_Marks}/ -- /\p{In_Greek_and_Coptic}/ -- /\p{In_Cyrillic}/ -- /\p{In_Cyrillic_Supplement}/ -- /\p{In_Armenian}/ -- /\p{In_Hebrew}/ -- /\p{In_Arabic}/ -- /\p{In_Syriac}/ -- 
/\p{In_Arabic_Supplement}/ -- /\p{In_Thaana}/ -- /\p{In_NKo}/ -- /\p{In_Samaritan}/ -- /\p{In_Mandaic}/ -- /\p{In_Syriac_Supplement}/ -- /\p{In_Arabic_Extended_A}/ -- /\p{In_Devanagari}/ -- /\p{In_Bengali}/ -- /\p{In_Gurmukhi}/ -- /\p{In_Gujarati}/ -- /\p{In_Oriya}/ -- /\p{In_Tamil}/ -- /\p{In_Telugu}/ -- /\p{In_Kannada}/ -- /\p{In_Malayalam}/ -- /\p{In_Sinhala}/ -- /\p{In_Thai}/ -- /\p{In_Lao}/ -- /\p{In_Tibetan}/ -- /\p{In_Myanmar}/ -- /\p{In_Georgian}/ -- /\p{In_Hangul_Jamo}/ -- /\p{In_Ethiopic}/ -- /\p{In_Ethiopic_Supplement}/ -- /\p{In_Cherokee}/ -- /\p{In_Unified_Canadian_Aboriginal_Syllabics}/ -- /\p{In_Ogham}/ -- /\p{In_Runic}/ -- /\p{In_Tagalog}/ -- /\p{In_Hanunoo}/ -- /\p{In_Buhid}/ -- /\p{In_Tagbanwa}/ -- /\p{In_Khmer}/ -- /\p{In_Mongolian}/ -- /\p{In_Unified_Canadian_Aboriginal_Syllabics_Extended}/ -- /\p{In_Limbu}/ -- /\p{In_Tai_Le}/ -- /\p{In_New_Tai_Lue}/ -- /\p{In_Khmer_Symbols}/ -- /\p{In_Buginese}/ -- /\p{In_Tai_Tham}/ -- /\p{In_Combining_Diacritical_Marks_Extended}/ -- /\p{In_Balinese}/ -- /\p{In_Sundanese}/ -- /\p{In_Batak}/ -- /\p{In_Lepcha}/ -- /\p{In_Ol_Chiki}/ -- /\p{In_Cyrillic_Extended_C}/ -- /\p{In_Georgian_Extended}/ -- /\p{In_Sundanese_Supplement}/ -- /\p{In_Vedic_Extensions}/ -- /\p{In_Phonetic_Extensions}/ -- /\p{In_Phonetic_Extensions_Supplement}/ -- /\p{In_Combining_Diacritical_Marks_Supplement}/ -- /\p{In_Latin_Extended_Additional}/ -- /\p{In_Greek_Extended}/ -- /\p{In_General_Punctuation}/ -- /\p{In_Superscripts_and_Subscripts}/ -- /\p{In_Currency_Symbols}/ -- /\p{In_Combining_Diacritical_Marks_for_Symbols}/ -- /\p{In_Letterlike_Symbols}/ -- /\p{In_Number_Forms}/ -- /\p{In_Arrows}/ -- /\p{In_Mathematical_Operators}/ -- /\p{In_Miscellaneous_Technical}/ -- /\p{In_Control_Pictures}/ -- /\p{In_Optical_Character_Recognition}/ -- /\p{In_Enclosed_Alphanumerics}/ -- /\p{In_Box_Drawing}/ -- /\p{In_Block_Elements}/ -- /\p{In_Geometric_Shapes}/ -- /\p{In_Miscellaneous_Symbols}/ -- /\p{In_Dingbats}/ -- 
/\p{In_Miscellaneous_Mathematical_Symbols_A}/ -- /\p{In_Supplemental_Arrows_A}/ -- /\p{In_Braille_Patterns}/ -- /\p{In_Supplemental_Arrows_B}/ -- /\p{In_Miscellaneous_Mathematical_Symbols_B}/ -- /\p{In_Supplemental_Mathematical_Operators}/ -- /\p{In_Miscellaneous_Symbols_and_Arrows}/ -- /\p{In_Glagolitic}/ -- /\p{In_Latin_Extended_C}/ -- /\p{In_Coptic}/ -- /\p{In_Georgian_Supplement}/ -- /\p{In_Tifinagh}/ -- /\p{In_Ethiopic_Extended}/ -- /\p{In_Cyrillic_Extended_A}/ -- /\p{In_Supplemental_Punctuation}/ -- /\p{In_CJK_Radicals_Supplement}/ -- /\p{In_Kangxi_Radicals}/ -- /\p{In_Ideographic_Description_Characters}/ -- /\p{In_CJK_Symbols_and_Punctuation}/ -- /\p{In_Hiragana}/ -- /\p{In_Katakana}/ -- /\p{In_Bopomofo}/ -- /\p{In_Hangul_Compatibility_Jamo}/ -- /\p{In_Kanbun}/ -- /\p{In_Bopomofo_Extended}/ -- /\p{In_CJK_Strokes}/ -- /\p{In_Katakana_Phonetic_Extensions}/ -- /\p{In_Enclosed_CJK_Letters_and_Months}/ -- /\p{In_CJK_Compatibility}/ -- /\p{In_CJK_Unified_Ideographs_Extension_A}/ -- /\p{In_Yijing_Hexagram_Symbols}/ -- /\p{In_CJK_Unified_Ideographs}/ -- /\p{In_Yi_Syllables}/ -- /\p{In_Yi_Radicals}/ -- /\p{In_Lisu}/ -- /\p{In_Vai}/ -- /\p{In_Cyrillic_Extended_B}/ -- /\p{In_Bamum}/ -- /\p{In_Modifier_Tone_Letters}/ -- /\p{In_Latin_Extended_D}/ -- /\p{In_Syloti_Nagri}/ -- /\p{In_Common_Indic_Number_Forms}/ -- /\p{In_Phags_pa}/ -- /\p{In_Saurashtra}/ -- /\p{In_Devanagari_Extended}/ -- /\p{In_Kayah_Li}/ -- /\p{In_Rejang}/ -- /\p{In_Hangul_Jamo_Extended_A}/ -- /\p{In_Javanese}/ -- /\p{In_Myanmar_Extended_B}/ -- /\p{In_Cham}/ -- /\p{In_Myanmar_Extended_A}/ -- /\p{In_Tai_Viet}/ -- /\p{In_Meetei_Mayek_Extensions}/ -- /\p{In_Ethiopic_Extended_A}/ -- /\p{In_Latin_Extended_E}/ -- /\p{In_Cherokee_Supplement}/ -- /\p{In_Meetei_Mayek}/ -- /\p{In_Hangul_Syllables}/ -- /\p{In_Hangul_Jamo_Extended_B}/ -- /\p{In_High_Surrogates}/ -- /\p{In_High_Private_Use_Surrogates}/ -- /\p{In_Low_Surrogates}/ -- /\p{In_Private_Use_Area}/ -- /\p{In_CJK_Compatibility_Ideographs}/ -- 
/\p{In_Alphabetic_Presentation_Forms}/ -- /\p{In_Arabic_Presentation_Forms_A}/ -- /\p{In_Variation_Selectors}/ -- /\p{In_Vertical_Forms}/ -- /\p{In_Combining_Half_Marks}/ -- /\p{In_CJK_Compatibility_Forms}/ -- /\p{In_Small_Form_Variants}/ -- /\p{In_Arabic_Presentation_Forms_B}/ -- /\p{In_Halfwidth_and_Fullwidth_Forms}/ -- /\p{In_Specials}/ -- /\p{In_Linear_B_Syllabary}/ -- /\p{In_Linear_B_Ideograms}/ -- /\p{In_Aegean_Numbers}/ -- /\p{In_Ancient_Greek_Numbers}/ -- /\p{In_Ancient_Symbols}/ -- /\p{In_Phaistos_Disc}/ -- /\p{In_Lycian}/ -- /\p{In_Carian}/ -- /\p{In_Coptic_Epact_Numbers}/ -- /\p{In_Old_Italic}/ -- /\p{In_Gothic}/ -- /\p{In_Old_Permic}/ -- /\p{In_Ugaritic}/ -- /\p{In_Old_Persian}/ -- /\p{In_Deseret}/ -- /\p{In_Shavian}/ -- /\p{In_Osmanya}/ -- /\p{In_Osage}/ -- /\p{In_Elbasan}/ -- /\p{In_Caucasian_Albanian}/ -- /\p{In_Linear_A}/ -- /\p{In_Cypriot_Syllabary}/ -- /\p{In_Imperial_Aramaic}/ -- /\p{In_Palmyrene}/ -- /\p{In_Nabataean}/ -- /\p{In_Hatran}/ -- /\p{In_Phoenician}/ -- /\p{In_Lydian}/ -- /\p{In_Meroitic_Hieroglyphs}/ -- /\p{In_Meroitic_Cursive}/ -- /\p{In_Kharoshthi}/ -- /\p{In_Old_South_Arabian}/ -- /\p{In_Old_North_Arabian}/ -- /\p{In_Manichaean}/ -- /\p{In_Avestan}/ -- /\p{In_Inscriptional_Parthian}/ -- /\p{In_Inscriptional_Pahlavi}/ -- /\p{In_Psalter_Pahlavi}/ -- /\p{In_Old_Turkic}/ -- /\p{In_Old_Hungarian}/ -- /\p{In_Hanifi_Rohingya}/ -- /\p{In_Rumi_Numeral_Symbols}/ -- /\p{In_Old_Sogdian}/ -- /\p{In_Sogdian}/ -- /\p{In_Elymaic}/ -- /\p{In_Brahmi}/ -- /\p{In_Kaithi}/ -- /\p{In_Sora_Sompeng}/ -- /\p{In_Chakma}/ -- /\p{In_Mahajani}/ -- /\p{In_Sharada}/ -- /\p{In_Sinhala_Archaic_Numbers}/ -- /\p{In_Khojki}/ -- /\p{In_Multani}/ -- /\p{In_Khudawadi}/ -- /\p{In_Grantha}/ -- /\p{In_Newa}/ -- /\p{In_Tirhuta}/ -- /\p{In_Siddham}/ -- /\p{In_Modi}/ -- /\p{In_Mongolian_Supplement}/ -- /\p{In_Takri}/ -- /\p{In_Ahom}/ -- /\p{In_Dogra}/ -- /\p{In_Warang_Citi}/ -- /\p{In_Nandinagari}/ -- /\p{In_Zanabazar_Square}/ -- /\p{In_Soyombo}/ -- /\p{In_Pau_Cin_Hau}/ -- 
/\p{In_Bhaiksuki}/ -- /\p{In_Marchen}/ -- /\p{In_Masaram_Gondi}/ -- /\p{In_Gunjala_Gondi}/ -- /\p{In_Makasar}/ -- /\p{In_Tamil_Supplement}/ -- /\p{In_Cuneiform}/ -- /\p{In_Cuneiform_Numbers_and_Punctuation}/ -- /\p{In_Early_Dynastic_Cuneiform}/ -- /\p{In_Egyptian_Hieroglyphs}/ -- /\p{In_Egyptian_Hieroglyph_Format_Controls}/ -- /\p{In_Anatolian_Hieroglyphs}/ -- /\p{In_Bamum_Supplement}/ -- /\p{In_Mro}/ -- /\p{In_Bassa_Vah}/ -- /\p{In_Pahawh_Hmong}/ -- /\p{In_Medefaidrin}/ -- /\p{In_Miao}/ -- /\p{In_Ideographic_Symbols_and_Punctuation}/ -- /\p{In_Tangut}/ -- /\p{In_Tangut_Components}/ -- /\p{In_Kana_Supplement}/ -- /\p{In_Kana_Extended_A}/ -- /\p{In_Small_Kana_Extension}/ -- /\p{In_Nushu}/ -- /\p{In_Duployan}/ -- /\p{In_Shorthand_Format_Controls}/ -- /\p{In_Byzantine_Musical_Symbols}/ -- /\p{In_Musical_Symbols}/ -- /\p{In_Ancient_Greek_Musical_Notation}/ -- /\p{In_Mayan_Numerals}/ -- /\p{In_Tai_Xuan_Jing_Symbols}/ -- /\p{In_Counting_Rod_Numerals}/ -- /\p{In_Mathematical_Alphanumeric_Symbols}/ -- /\p{In_Sutton_SignWriting}/ -- /\p{In_Glagolitic_Supplement}/ -- /\p{In_Nyiakeng_Puachue_Hmong}/ -- /\p{In_Wancho}/ -- /\p{In_Mende_Kikakui}/ -- /\p{In_Adlam}/ -- /\p{In_Indic_Siyaq_Numbers}/ -- /\p{In_Ottoman_Siyaq_Numbers}/ -- /\p{In_Arabic_Mathematical_Alphabetic_Symbols}/ -- /\p{In_Mahjong_Tiles}/ -- /\p{In_Domino_Tiles}/ -- /\p{In_Playing_Cards}/ -- /\p{In_Enclosed_Alphanumeric_Supplement}/ -- /\p{In_Enclosed_Ideographic_Supplement}/ -- /\p{In_Miscellaneous_Symbols_and_Pictographs}/ -- /\p{In_Emoticons}/ -- /\p{In_Ornamental_Dingbats}/ -- /\p{In_Transport_and_Map_Symbols}/ -- /\p{In_Alchemical_Symbols}/ -- /\p{In_Geometric_Shapes_Extended}/ -- /\p{In_Supplemental_Arrows_C}/ -- /\p{In_Supplemental_Symbols_and_Pictographs}/ -- /\p{In_Chess_Symbols}/ -- /\p{In_Symbols_and_Pictographs_Extended_A}/ -- /\p{In_CJK_Unified_Ideographs_Extension_B}/ -- /\p{In_CJK_Unified_Ideographs_Extension_C}/ -- /\p{In_CJK_Unified_Ideographs_Extension_D}/ -- 
/\p{In_CJK_Unified_Ideographs_Extension_E}/ -- /\p{In_CJK_Unified_Ideographs_Extension_F}/ -- /\p{In_CJK_Compatibility_Ideographs_Supplement}/ -- /\p{In_Tags}/ -- /\p{In_Variation_Selectors_Supplement}/ -- /\p{In_Supplementary_Private_Use_Area_A}/ -- /\p{In_Supplementary_Private_Use_Area_B}/ -- /\p{In_No_Block}/ +- \p{In_Adlam} +- \p{In_Aegean_Numbers} +- \p{In_Ahom} +- \p{In_Alchemical_Symbols} +- \p{In_Alphabetic_Presentation_Forms} +- \p{In_Anatolian_Hieroglyphs} +- \p{In_Ancient_Greek_Musical_Notation} +- \p{In_Ancient_Greek_Numbers} +- \p{In_Ancient_Symbols} +- \p{In_Arabic} +- \p{In_Arabic_Extended_A} +- \p{In_Arabic_Extended_B} +- \p{In_Arabic_Extended_C} +- \p{In_Arabic_Mathematical_Alphabetic_Symbols} +- \p{In_Arabic_Presentation_Forms_A} +- \p{In_Arabic_Presentation_Forms_B} +- \p{In_Arabic_Supplement} +- \p{In_Armenian} +- \p{In_Arrows} +- \p{In_Avestan} +- \p{In_Balinese} +- \p{In_Bamum} +- \p{In_Bamum_Supplement} +- \p{In_Basic_Latin} +- \p{In_Bassa_Vah} +- \p{In_Batak} +- \p{In_Bengali} +- \p{In_Bhaiksuki} +- \p{In_Block_Elements} +- \p{In_Bopomofo} +- \p{In_Bopomofo_Extended} +- \p{In_Box_Drawing} +- \p{In_Brahmi} +- \p{In_Braille_Patterns} +- \p{In_Buginese} +- \p{In_Buhid} +- \p{In_Byzantine_Musical_Symbols} +- \p{In_CJK_Compatibility} +- \p{In_CJK_Compatibility_Forms} +- \p{In_CJK_Compatibility_Ideographs} +- \p{In_CJK_Compatibility_Ideographs_Supplement} +- \p{In_CJK_Radicals_Supplement} +- \p{In_CJK_Strokes} +- \p{In_CJK_Symbols_and_Punctuation} +- \p{In_CJK_Unified_Ideographs} +- \p{In_CJK_Unified_Ideographs_Extension_A} +- \p{In_CJK_Unified_Ideographs_Extension_B} +- \p{In_CJK_Unified_Ideographs_Extension_C} +- \p{In_CJK_Unified_Ideographs_Extension_D} +- \p{In_CJK_Unified_Ideographs_Extension_E} +- \p{In_CJK_Unified_Ideographs_Extension_F} +- \p{In_CJK_Unified_Ideographs_Extension_G} +- \p{In_CJK_Unified_Ideographs_Extension_H} +- \p{In_Carian} +- \p{In_Caucasian_Albanian} +- \p{In_Chakma} +- \p{In_Cham} +- \p{In_Cherokee} +- 
\p{In_Cherokee_Supplement} +- \p{In_Chess_Symbols} +- \p{In_Chorasmian} +- \p{In_Combining_Diacritical_Marks} +- \p{In_Combining_Diacritical_Marks_Extended} +- \p{In_Combining_Diacritical_Marks_Supplement} +- \p{In_Combining_Diacritical_Marks_for_Symbols} +- \p{In_Combining_Half_Marks} +- \p{In_Common_Indic_Number_Forms} +- \p{In_Control_Pictures} +- \p{In_Coptic} +- \p{In_Coptic_Epact_Numbers} +- \p{In_Counting_Rod_Numerals} +- \p{In_Cuneiform} +- \p{In_Cuneiform_Numbers_and_Punctuation} +- \p{In_Currency_Symbols} +- \p{In_Cypriot_Syllabary} +- \p{In_Cypro_Minoan} +- \p{In_Cyrillic} +- \p{In_Cyrillic_Extended_A} +- \p{In_Cyrillic_Extended_B} +- \p{In_Cyrillic_Extended_C} +- \p{In_Cyrillic_Extended_D} +- \p{In_Cyrillic_Supplement} +- \p{In_Deseret} +- \p{In_Devanagari} +- \p{In_Devanagari_Extended} +- \p{In_Devanagari_Extended_A} +- \p{In_Dingbats} +- \p{In_Dives_Akuru} +- \p{In_Dogra} +- \p{In_Domino_Tiles} +- \p{In_Duployan} +- \p{In_Early_Dynastic_Cuneiform} +- \p{In_Egyptian_Hieroglyph_Format_Controls} +- \p{In_Egyptian_Hieroglyphs} +- \p{In_Elbasan} +- \p{In_Elymaic} +- \p{In_Emoticons} +- \p{In_Enclosed_Alphanumeric_Supplement} +- \p{In_Enclosed_Alphanumerics} +- \p{In_Enclosed_CJK_Letters_and_Months} +- \p{In_Enclosed_Ideographic_Supplement} +- \p{In_Ethiopic} +- \p{In_Ethiopic_Extended} +- \p{In_Ethiopic_Extended_A} +- \p{In_Ethiopic_Extended_B} +- \p{In_Ethiopic_Supplement} +- \p{In_General_Punctuation} +- \p{In_Geometric_Shapes} +- \p{In_Geometric_Shapes_Extended} +- \p{In_Georgian} +- \p{In_Georgian_Extended} +- \p{In_Georgian_Supplement} +- \p{In_Glagolitic} +- \p{In_Glagolitic_Supplement} +- \p{In_Gothic} +- \p{In_Grantha} +- \p{In_Greek_Extended} +- \p{In_Greek_and_Coptic} +- \p{In_Gujarati} +- \p{In_Gunjala_Gondi} +- \p{In_Gurmukhi} +- \p{In_Halfwidth_and_Fullwidth_Forms} +- \p{In_Hangul_Compatibility_Jamo} +- \p{In_Hangul_Jamo} +- \p{In_Hangul_Jamo_Extended_A} +- \p{In_Hangul_Jamo_Extended_B} +- \p{In_Hangul_Syllables} +- \p{In_Hanifi_Rohingya} +- 
\p{In_Hanunoo} +- \p{In_Hatran} +- \p{In_Hebrew} +- \p{In_High_Private_Use_Surrogates} +- \p{In_High_Surrogates} +- \p{In_Hiragana} +- \p{In_IPA_Extensions} +- \p{In_Ideographic_Description_Characters} +- \p{In_Ideographic_Symbols_and_Punctuation} +- \p{In_Imperial_Aramaic} +- \p{In_Indic_Siyaq_Numbers} +- \p{In_Inscriptional_Pahlavi} +- \p{In_Inscriptional_Parthian} +- \p{In_Javanese} +- \p{In_Kaithi} +- \p{In_Kaktovik_Numerals} +- \p{In_Kana_Extended_A} +- \p{In_Kana_Extended_B} +- \p{In_Kana_Supplement} +- \p{In_Kanbun} +- \p{In_Kangxi_Radicals} +- \p{In_Kannada} +- \p{In_Katakana} +- \p{In_Katakana_Phonetic_Extensions} +- \p{In_Kawi} +- \p{In_Kayah_Li} +- \p{In_Kharoshthi} +- \p{In_Khitan_Small_Script} +- \p{In_Khmer} +- \p{In_Khmer_Symbols} +- \p{In_Khojki} +- \p{In_Khudawadi} +- \p{In_Lao} +- \p{In_Latin_1_Supplement} +- \p{In_Latin_Extended_A} +- \p{In_Latin_Extended_Additional} +- \p{In_Latin_Extended_B} +- \p{In_Latin_Extended_C} +- \p{In_Latin_Extended_D} +- \p{In_Latin_Extended_E} +- \p{In_Latin_Extended_F} +- \p{In_Latin_Extended_G} +- \p{In_Lepcha} +- \p{In_Letterlike_Symbols} +- \p{In_Limbu} +- \p{In_Linear_A} +- \p{In_Linear_B_Ideograms} +- \p{In_Linear_B_Syllabary} +- \p{In_Lisu} +- \p{In_Lisu_Supplement} +- \p{In_Low_Surrogates} +- \p{In_Lycian} +- \p{In_Lydian} +- \p{In_Mahajani} +- \p{In_Mahjong_Tiles} +- \p{In_Makasar} +- \p{In_Malayalam} +- \p{In_Mandaic} +- \p{In_Manichaean} +- \p{In_Marchen} +- \p{In_Masaram_Gondi} +- \p{In_Mathematical_Alphanumeric_Symbols} +- \p{In_Mathematical_Operators} +- \p{In_Mayan_Numerals} +- \p{In_Medefaidrin} +- \p{In_Meetei_Mayek} +- \p{In_Meetei_Mayek_Extensions} +- \p{In_Mende_Kikakui} +- \p{In_Meroitic_Cursive} +- \p{In_Meroitic_Hieroglyphs} +- \p{In_Miao} +- \p{In_Miscellaneous_Mathematical_Symbols_A} +- \p{In_Miscellaneous_Mathematical_Symbols_B} +- \p{In_Miscellaneous_Symbols} +- \p{In_Miscellaneous_Symbols_and_Arrows} +- \p{In_Miscellaneous_Symbols_and_Pictographs} +- \p{In_Miscellaneous_Technical} +- 
\p{In_Modi} +- \p{In_Modifier_Tone_Letters} +- \p{In_Mongolian} +- \p{In_Mongolian_Supplement} +- \p{In_Mro} +- \p{In_Multani} +- \p{In_Musical_Symbols} +- \p{In_Myanmar} +- \p{In_Myanmar_Extended_A} +- \p{In_Myanmar_Extended_B} +- \p{In_NKo} +- \p{In_Nabataean} +- \p{In_Nag_Mundari} +- \p{In_Nandinagari} +- \p{In_New_Tai_Lue} +- \p{In_Newa} +- \p{In_No_Block} +- \p{In_Number_Forms} +- \p{In_Nushu} +- \p{In_Nyiakeng_Puachue_Hmong} +- \p{In_Ogham} +- \p{In_Ol_Chiki} +- \p{In_Old_Hungarian} +- \p{In_Old_Italic} +- \p{In_Old_North_Arabian} +- \p{In_Old_Permic} +- \p{In_Old_Persian} +- \p{In_Old_Sogdian} +- \p{In_Old_South_Arabian} +- \p{In_Old_Turkic} +- \p{In_Old_Uyghur} +- \p{In_Optical_Character_Recognition} +- \p{In_Oriya} +- \p{In_Ornamental_Dingbats} +- \p{In_Osage} +- \p{In_Osmanya} +- \p{In_Ottoman_Siyaq_Numbers} +- \p{In_Pahawh_Hmong} +- \p{In_Palmyrene} +- \p{In_Pau_Cin_Hau} +- \p{In_Phags_pa} +- \p{In_Phaistos_Disc} +- \p{In_Phoenician} +- \p{In_Phonetic_Extensions} +- \p{In_Phonetic_Extensions_Supplement} +- \p{In_Playing_Cards} +- \p{In_Private_Use_Area} +- \p{In_Psalter_Pahlavi} +- \p{In_Rejang} +- \p{In_Rumi_Numeral_Symbols} +- \p{In_Runic} +- \p{In_Samaritan} +- \p{In_Saurashtra} +- \p{In_Sharada} +- \p{In_Shavian} +- \p{In_Shorthand_Format_Controls} +- \p{In_Siddham} +- \p{In_Sinhala} +- \p{In_Sinhala_Archaic_Numbers} +- \p{In_Small_Form_Variants} +- \p{In_Small_Kana_Extension} +- \p{In_Sogdian} +- \p{In_Sora_Sompeng} +- \p{In_Soyombo} +- \p{In_Spacing_Modifier_Letters} +- \p{In_Specials} +- \p{In_Sundanese} +- \p{In_Sundanese_Supplement} +- \p{In_Superscripts_and_Subscripts} +- \p{In_Supplemental_Arrows_A} +- \p{In_Supplemental_Arrows_B} +- \p{In_Supplemental_Arrows_C} +- \p{In_Supplemental_Mathematical_Operators} +- \p{In_Supplemental_Punctuation} +- \p{In_Supplemental_Symbols_and_Pictographs} +- \p{In_Supplementary_Private_Use_Area_A} +- \p{In_Supplementary_Private_Use_Area_B} +- \p{In_Sutton_SignWriting} +- \p{In_Syloti_Nagri} +- 
\p{In_Symbols_and_Pictographs_Extended_A} +- \p{In_Symbols_for_Legacy_Computing} +- \p{In_Syriac} +- \p{In_Syriac_Supplement} +- \p{In_Tagalog} +- \p{In_Tagbanwa} +- \p{In_Tags} +- \p{In_Tai_Le} +- \p{In_Tai_Tham} +- \p{In_Tai_Viet} +- \p{In_Tai_Xuan_Jing_Symbols} +- \p{In_Takri} +- \p{In_Tamil} +- \p{In_Tamil_Supplement} +- \p{In_Tangsa} +- \p{In_Tangut} +- \p{In_Tangut_Components} +- \p{In_Tangut_Supplement} +- \p{In_Telugu} +- \p{In_Thaana} +- \p{In_Thai} +- \p{In_Tibetan} +- \p{In_Tifinagh} +- \p{In_Tirhuta} +- \p{In_Toto} +- \p{In_Transport_and_Map_Symbols} +- \p{In_Ugaritic} +- \p{In_Unified_Canadian_Aboriginal_Syllabics} +- \p{In_Unified_Canadian_Aboriginal_Syllabics_Extended} +- \p{In_Unified_Canadian_Aboriginal_Syllabics_Extended_A} +- \p{In_Vai} +- \p{In_Variation_Selectors} +- \p{In_Variation_Selectors_Supplement} +- \p{In_Vedic_Extensions} +- \p{In_Vertical_Forms} +- \p{In_Vithkuqi} +- \p{In_Wancho} +- \p{In_Warang_Citi} +- \p{In_Yezidi} +- \p{In_Yi_Radicals} +- \p{In_Yi_Syllables} +- \p{In_Yijing_Hexagram_Symbols} +- \p{In_Zanabazar_Square} +- \p{In_Znamenny_Musical_Notation} + +=== Emoji + +- \p{Emoji} +- \p{Emoji_Component}, \p{EComp} +- \p{Emoji_Modifier}, \p{EMod} +- \p{Emoji_Modifier_Base}, \p{EBase} +- \p{Emoji_Presentation}, \p{EPres} +- \p{Extended_Pictographic}, \p{ExtPict} + +=== Graphemes + +- \p{Grapheme_Cluster_Break_CR} +- \p{Grapheme_Cluster_Break_Control} +- \p{Grapheme_Cluster_Break_Extend} +- \p{Grapheme_Cluster_Break_L} +- \p{Grapheme_Cluster_Break_LF} +- \p{Grapheme_Cluster_Break_LV} +- \p{Grapheme_Cluster_Break_LVT} +- \p{Grapheme_Cluster_Break_Prepend} +- \p{Grapheme_Cluster_Break_Regional_Indicator} +- \p{Grapheme_Cluster_Break_SpacingMark} +- \p{Grapheme_Cluster_Break_T} +- \p{Grapheme_Cluster_Break_V} +- \p{Grapheme_Cluster_Break_ZWJ} + +=== Derived Ages + +- \p{Age_10_0} +- \p{Age_11_0} +- \p{Age_12_0} +- \p{Age_12_1} +- \p{Age_13_0} +- \p{Age_14_0} +- \p{Age_15_0} +- \p{Age_1_1} +- \p{Age_2_0} +- \p{Age_2_1} +- \p{Age_3_0} +- 
\p{Age_3_1}
+- \p{Age_3_2}
+- \p{Age_4_0}
+- \p{Age_4_1}
+- \p{Age_5_0}
+- \p{Age_5_1}
+- \p{Age_5_2}
+- \p{Age_6_0}
+- \p{Age_6_1}
+- \p{Age_6_2}
+- \p{Age_6_3}
+- \p{Age_7_0}
+- \p{Age_8_0}
+- \p{Age_9_0}
diff --git a/enc/unicode/15.0.0/name2ctype.h b/enc/unicode/15.0.0/name2ctype.h
index a2c996423d..6bbbb3512f 100644
--- a/enc/unicode/15.0.0/name2ctype.h
+++ b/enc/unicode/15.0.0/name2ctype.h
@@ -5402,7 +5402,7 @@ static const OnigCodePoint CR_ASCII[] = {
 	0x0000, 0x007f,
 }; /* CR_ASCII */
 
-/* 'Punct' */
+/* 'Punct': [[:Punct:]] */
 static const OnigCodePoint CR_Punct[] = {
 	191,
 	0x0021, 0x0023,
diff --git a/template/unicode_properties.rdoc.tmpl b/template/unicode_properties.rdoc.tmpl
new file mode 100755
index 0000000000..7bbedc780c
--- /dev/null
+++ b/template/unicode_properties.rdoc.tmpl
@@ -0,0 +1,59 @@
+== \Regexps Based on Unicode Properties
+
+The properties shown here are those currently supported in Ruby.
+Older versions may not support all of these.
+<%
+# Generate a documentation file for the unicode properties.
+#
+# Usage:
+#
+#   Get PropertyAliases.txt, PropertyValueAliases.txt from unicode.org
+#   (http://unicode.org/Public/UNIDATA/) and run
+#   ```
+#   ruby tool/generic_erb.rb template/unicode_properties.rdoc.tmpl data_dir name2ctype.h
+#   ```
+
+data_dir = ARGV.shift
+abort("Usage: #{$0} data_directory [name2ctype.h]") unless data_dir && Dir.exist?(data_dir)
+
+# Map group names, given as last argument to #make_const in enc-unicode.rb,
+# to sections in the doc. The order in this hash controls the order in the doc.
+map = {
+  /\[\[:/            => 'POSIX brackets',
+  '-'                => 'Special',
+  /.+ Category/      => 'Major and General Categories',
+  'Binary Property'  => 'Prop List',
+  /Derived Property/ => 'Derived Core Properties',
+  'Script'           => 'Scripts',
+  'Block'            => 'Blocks',
+  'Emoji'            => 'Emoji',
+  /Grapheme/         => 'Graphemes',
+  /Derived Age/      => 'Derived Ages',
+}
+
+# aliases in the form { short => long }, e.g. { 'Hex' => 'Hex_Digit', 'L' => 'Letter' }
+aliases = (
+  File.binread(File.join(data_dir, 'PropertyAliases.txt')).scan(/^(\w+)\s*; (\w+)/) +
+  File.binread(File.join(data_dir, 'PropertyValueAliases.txt')).scan(/^(?:gc|sc)\s*; (\w+)\s*; (\w+)/)
+).to_h
+
+props_by_section = {} # { section title => [long property names] }
+ARGF.each_line do |line|
+  # only the "/* 'Prop': Group */" comments written by make_const are of interest
+  next unless /'(?<prop>[^']+)': (?<name>.+) \*/ =~ line
+  next if prop == 'NEWLINE' # ignore custom internal prop
+  section = map.find { |k, _| k === name }&.last
+  next warn("no doc section for #{name}") unless section # unmapped groups are left out
+  # normalize prop names - the header file uses a mix of short and long names
+  long_prop_name = aliases[prop] || prop
+  (props_by_section[section] ||= []) << long_prop_name
+end
+
+map.each_value do |section| -%>
+
+=== <%=section%>
+
+% props_by_section[section].sort.each do |prop|
+- <%= [prop, aliases.key(prop)].compact.uniq.map { |v| "\\p{#{v}}" }.join(', ') %>
+% end
+% end
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
index 3fdbe71634..2224ce7149 100755
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@@ -269,23 +269,12 @@ def parse_block(data)
   blocks << constname
 end
 
-# shim for Ruby 1.8
-unless {}.respond_to?(:key)
-  class Hash
-    alias key index
-  end
-end
-
 $const_cache = {}
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
 def make_const(prop, data, name)
-  if name.empty?
-    puts "\n/* '#{prop}' */"
-  else
-    puts "\n/* '#{prop}': #{name} */"
-  end
+  puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation
   if origprop = $const_cache.key(data)
     puts "#define CR_#{prop} CR_#{origprop}"
   else
@@ -437,8 +426,6 @@ define_posix_props(data)
 POSIX_NAMES.each do |name|
   if name == 'XPosixPunct'
     make_const(name, data[name], "[[:Punct:]]")
-  elsif name == 'Punct'
-    make_const(name, data[name], "")
   else
     make_const(name, data[name], "[[:#{name}:]]")
   end