From e7dab68ee665bad8369e3ea783ca5583bdac5a9f Mon Sep 17 00:00:00 2001 From: Martin Atkins Date: Wed, 4 Mar 2020 17:51:14 -0800 Subject: [PATCH] textseg: Update tables for Unicode 11.0.0 As of Unicode 11 the grapheme cluster definition is derived from a property defined in the emoji data, so we need to introduce a table from there too now in addition to the table for the text segmentation specification. This also includes an updated table of grapheme cluster segmentation tests derived from the Unicode character database version 11.0.0. --- go.mod | 4 +- go.sum | 2 - textseg/emoji_table.rl | 269 + textseg/generate.go | 3 +- textseg/grapheme_clusters.go | 7311 +++++++++++---------------- textseg/grapheme_clusters.rl | 11 +- textseg/grapheme_clusters_table.rl | 101 +- textseg/make_tables.go | 2 +- textseg/make_test_tables.go | 2 +- textseg/tables.go | 396 +- textseg/tables_test.go | 7482 +++++++++++----------------- textseg/unicode2ragel.rb | 2 +- 12 files changed, 6347 insertions(+), 9238 deletions(-) create mode 100644 textseg/emoji_table.rl diff --git a/go.mod b/go.mod index 7af1bff..56c9616 100644 --- a/go.mod +++ b/go.mod @@ -1,3 +1,3 @@ -module github.com/apparentlymart/go-textseg/v10 +module github.com/apparentlymart/go-textseg/v11 -go 1.12 +go 1.13 diff --git a/go.sum b/go.sum index 961f090..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +0,0 @@ -github.com/apparentlymart/go-textseg v1.0.0 h1:rRmlIsPEEhUTIKQb7T++Nz/A5Q6C9IuX2wFoYVvnCs0= -github.com/apparentlymart/go-textseg v1.0.0/go.mod h1:z96Txxhf3xSFMPmb5X/1W05FF/Nj9VFpLOpjS5yuumk= diff --git a/textseg/emoji_table.rl b/textseg/emoji_table.rl new file mode 100644 index 0000000..192df2a --- /dev/null +++ b/textseg/emoji_table.rl @@ -0,0 +1,269 @@ +# The following Ragel file was autogenerated with unicode2ragel.rb +# from: https://www.unicode.org/Public/emoji/11.0/emoji-data.txt +# +# It defines ["Extended_Pictographic"]. +# +# To use this, make sure that your alphtype is set to byte, +# and that your input is in utf8. + +%%{ + machine Emoji; + + Extended_Pictographic = + 0xC2 0xA9 #1.1 [1] (©️) copyright + | 0xC2 0xAE #1.1 [1] (®️) registered + | 0xE2 0x80 0xBC #1.1 [1] (‼️) double exclamation mark + | 0xE2 0x81 0x89 #3.0 [1] (⁉️) exclamation question mark + | 0xE2 0x84 0xA2 #1.1 [1] (™️) trade mark + | 0xE2 0x84 0xB9 #3.0 [1] (ℹ️) information + | 0xE2 0x86 0x94..0x99 #1.1 [6] (↔️..↙️) left-right arrow..down... + | 0xE2 0x86 0xA9..0xAA #1.1 [2] (↩️..↪️) right arrow curving le... + | 0xE2 0x8C 0x9A..0x9B #1.1 [2] (⌚..⌛) watch..hourglass done + | 0xE2 0x8C 0xA8 #1.1 [1] (⌨️) keyboard + | 0xE2 0x8E 0x88 #3.0 [1] (⎈️) HELM SYMBOL + | 0xE2 0x8F 0x8F #4.0 [1] (⏏️) eject button + | 0xE2 0x8F 0xA9..0xB3 #6.0 [11] (⏩..⏳) fast-forward button..hou... + | 0xE2 0x8F 0xB8..0xBA #7.0 [3] (⏸️..⏺️) pause button..record b... + | 0xE2 0x93 0x82 #1.1 [1] (Ⓜ️) circled M + | 0xE2 0x96 0xAA..0xAB #1.1 [2] (▪️..▫️) black small square..wh... + | 0xE2 0x96 0xB6 #1.1 [1] (▶️) play button + | 0xE2 0x97 0x80 #1.1 [1] (◀️) reverse button + | 0xE2 0x97 0xBB..0xBE #3.2 [4] (◻️..◾) white medium square..bl... + | 0xE2 0x98 0x80..0x85 #1.1 [6] (☀️..★️) sun..BLACK STAR + | 0xE2 0x98 0x87..0x92 #1.1 [12] (☇️..☒️) LIGHTNING..BALLOT BOX ... + | 0xE2 0x98 0x94..0x95 #4.0 [2] (☔..☕) umbrella with rain drops... + | 0xE2 0x98 0x96..0x97 #3.2 [2] (☖️..☗️) WHITE SHOGI PIECE..BLA... + | 0xE2 0x98 0x98 #4.1 [1] (☘️) shamrock + | 0xE2 0x98 0x99 #3.0 [1] (☙️) REVERSED ROTATED FLORAL... + | 0xE2 0x98 0x9A..0xFF #1.1 [86] (☚️..♯️) BLACK LEFT POINTING IN... + | 0xE2 0x99 0x00..0xAF # + | 0xE2 0x99 0xB0..0xB1 #3.0 [2] (♰️..♱️) WEST SYRIAC CROSS..EAS... + | 0xE2 0x99 0xB2..0xBD #3.2 [12] (♲️..♽️) UNIVERSAL RECYCLING SY... + | 0xE2 0x99 0xBE..0xBF #4.1 [2] (♾️..♿) infinity..wheelchair sy... + | 0xE2 0x9A 0x80..0x85 #3.2 [6] (⚀️..⚅️) DIE FACE-1..DIE FACE-6 + | 0xE2 0x9A 0x90..0x91 #4.0 [2] (⚐️..⚑️) WHITE FLAG..BLACK FLAG + | 0xE2 0x9A 0x92..0x9C #4.1 [11] (⚒️..⚜️) hammer and pick..fleur... + | 0xE2 0x9A 0x9D #5.1 [1] (⚝️) OUTLINED WHITE STAR + | 0xE2 0x9A 0x9E..0x9F #5.2 [2] (⚞️..⚟️) THREE LINES CONVERGING... + | 0xE2 0x9A 0xA0..0xA1 #4.0 [2] (⚠️..⚡) warning..high voltage + | 0xE2 0x9A 0xA2..0xB1 #4.1 [16] (⚢️..⚱️) DOUBLED FEMALE SIGN..f... + | 0xE2 0x9A 0xB2 #5.0 [1] (⚲️) NEUTER + | 0xE2 0x9A 0xB3..0xBC #5.1 [10] (⚳️..⚼️) CERES..SESQUIQUADRATE + | 0xE2 0x9A 0xBD..0xBF #5.2 [3] (⚽..⚿️) soccer ball..SQUARED KEY + | 0xE2 0x9B 0x80..0x83 #5.1 [4] (⛀️..⛃️) WHITE DRAUGHTS MAN..BL... + | 0xE2 0x9B 0x84..0x8D #5.2 [10] (⛄..⛍️) snowman without snow..D... + | 0xE2 0x9B 0x8E #6.0 [1] (⛎) Ophiuchus + | 0xE2 0x9B 0x8F..0xA1 #5.2 [19] (⛏️..⛡️) pick..RESTRICTED LEFT ... + | 0xE2 0x9B 0xA2 #6.0 [1] (⛢️) ASTRONOMICAL SYMBOL FOR... + | 0xE2 0x9B 0xA3 #5.2 [1] (⛣️) HEAVY CIRCLE WITH STROK... + | 0xE2 0x9B 0xA4..0xA7 #6.0 [4] (⛤️..⛧️) PENTAGRAM..INVERTED PE... + | 0xE2 0x9B 0xA8..0xBF #5.2 [24] (⛨️..⛿️) BLACK CROSS ON SHIELD.... + | 0xE2 0x9C 0x80 #7.0 [1] (✀️) BLACK SAFETY SCISSORS + | 0xE2 0x9C 0x81..0x84 #1.1 [4] (✁️..✄️) UPPER BLADE SCISSORS..... + | 0xE2 0x9C 0x85 #6.0 [1] (✅) white heavy check mark + | 0xE2 0x9C 0x88..0x89 #1.1 [2] (✈️..✉️) airplane..envelope + | 0xE2 0x9C 0x8A..0x8B #6.0 [2] (✊..✋) raised fist..raised hand + | 0xE2 0x9C 0x8C..0x92 #1.1 [7] (✌️..✒️) victory hand..black nib + | 0xE2 0x9C 0x94 #1.1 [1] (✔️) heavy check mark + | 0xE2 0x9C 0x96 #1.1 [1] (✖️) heavy multiplication x + | 0xE2 0x9C 0x9D #1.1 [1] (✝️) latin cross + | 0xE2 0x9C 0xA1 #1.1 [1] (✡️) star of David + | 0xE2 0x9C 0xA8 #6.0 [1] (✨) sparkles + | 0xE2 0x9C 0xB3..0xB4 #1.1 [2] (✳️..✴️) eight-spoked asterisk.... + | 0xE2 0x9D 0x84 #1.1 [1] (❄️) snowflake + | 0xE2 0x9D 0x87 #1.1 [1] (❇️) sparkle + | 0xE2 0x9D 0x8C #6.0 [1] (❌) cross mark + | 0xE2 0x9D 0x8E #6.0 [1] (❎) cross mark button + | 0xE2 0x9D 0x93..0x95 #6.0 [3] (❓..❕) question mark..white exc... + | 0xE2 0x9D 0x97 #5.2 [1] (❗) exclamation mark + | 0xE2 0x9D 0xA3..0xA7 #1.1 [5] (❣️..❧️) heavy heart exclamatio... + | 0xE2 0x9E 0x95..0x97 #6.0 [3] (➕..➗) heavy plus sign..heavy d... + | 0xE2 0x9E 0xA1 #1.1 [1] (➡️) right arrow + | 0xE2 0x9E 0xB0 #6.0 [1] (➰) curly loop + | 0xE2 0x9E 0xBF #6.0 [1] (➿) double curly loop + | 0xE2 0xA4 0xB4..0xB5 #3.2 [2] (⤴️..⤵️) right arrow curving up... + | 0xE2 0xAC 0x85..0x87 #4.0 [3] (⬅️..⬇️) left arrow..down arrow + | 0xE2 0xAC 0x9B..0x9C #5.1 [2] (⬛..⬜) black large square..whit... + | 0xE2 0xAD 0x90 #5.1 [1] (⭐) star + | 0xE2 0xAD 0x95 #5.2 [1] (⭕) heavy large circle + | 0xE3 0x80 0xB0 #1.1 [1] (〰️) wavy dash + | 0xE3 0x80 0xBD #3.2 [1] (〽️) part alternation mark + | 0xE3 0x8A 0x97 #1.1 [1] (㊗️) Japanese “congratulatio... + | 0xE3 0x8A 0x99 #1.1 [1] (㊙️) Japanese “secret” button + | 0xF0 0x9F 0x80 0x80..0xAB #5.1 [44] (🀀️..🀫️) MAHJONG TILE EAST W... + | 0xF0 0x9F 0x80 0xAC..0xAF #NA [4] (🀬️..🀯️) ...... + | 0xF0 0x9F 0x83 0x81..0x8F #6.0 [15] (🃁️..🃏) PLAYING CARD ACE OF ... + | 0xF0 0x9F 0x83 0x90 #NA [1] (🃐️) + | 0xF0 0x9F 0x83 0x91..0x9F #6.0 [15] (🃑️..🃟️) PLAYING CARD ACE OF... + | 0xF0 0x9F 0x83 0xA0..0xB5 #7.0 [22] (🃠️..🃵️) PLAYING CARD FOOL..... + | 0xF0 0x9F 0x83 0xB6..0xBF #NA [10] (🃶️..🃿️) ...................................... + | 0xF0 0x9F 0xA5 0x80..0x85 #9.0 [6] (🥀..🥅) wilted flower..goal net + | 0xF0 0x9F 0xA5 0x87..0x8B #9.0 [5] (🥇..🥋) 1st place medal..mart... + | 0xF0 0x9F 0xA5 0x8C #10.0 [1] (🥌) curling stone + | 0xF0 0x9F 0xA5 0x8D..0x8F #11.0 [3] (🥍..🥏) lacrosse..flying disc + | 0xF0 0x9F 0xA5 0x90..0x9E #9.0 [15] (🥐..🥞) croissant..pancakes + | 0xF0 0x9F 0xA5 0x9F..0xAB #10.0 [13] (🥟..🥫) dumpling..canned food + | 0xF0 0x9F 0xA5 0xAC..0xB0 #11.0 [5] (🥬..🥰) leafy green..smiling... + | 0xF0 0x9F 0xA5 0xB1..0xB2 #NA [2] (🥱️..🥲️) .... + | 0xF0 0x9F 0xA5 0xBC..0xBF #11.0 [4] (🥼..🥿) lab coat..woman’s fl... + | 0xF0 0x9F 0xA6 0x80..0x84 #8.0 [5] (🦀..🦄) crab..unicorn face + | 0xF0 0x9F 0xA6 0x85..0x91 #9.0 [13] (🦅..🦑) eagle..squid + | 0xF0 0x9F 0xA6 0x92..0x97 #10.0 [6] (🦒..🦗) giraffe..cricket + | 0xF0 0x9F 0xA6 0x98..0xA2 #11.0 [11] (🦘..🦢) kangaroo..swan + | 0xF0 0x9F 0xA6 0xA3..0xAF #NA [13] (🦣️..🦯️) ........