| # |
| # Copyright (C) 2002-2016, International Business Machines Corporation and others. |
| # All Rights Reserved. |
| # |
| # file: char.txt |
| # |
| # ICU Character Break Rules, also known as Grapheme Cluster Boundaries |
| # See Unicode Standard Annex #29. |
| # These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0 |
| # |
| |
| # |
| # Character Class Definitions. |
| # |
| # Each set below selects the code points carrying the named value of the |
| # Unicode Grapheme_Cluster_Break property. |
| $CR = [\p{Grapheme_Cluster_Break = CR}]; |
| $LF = [\p{Grapheme_Cluster_Break = LF}]; |
| # The Tags block is subtracted from $Control here and added to $Extend below, |
| # so Tags-block characters are treated as Extend by the rules in this file. |
| $Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]; |
| # TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}]; |
| # Extend is the property set plus the whole Tags block (see $Control above). |
| $Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]; |
| $SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}]; |
| $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}]; |
| |
| # |
| # Korean Syllable Definitions |
| # |
| # Hangul classes used by the syllable rules in the forward and reverse sections. |
| # L, V and T are the Grapheme_Cluster_Break values for conjoining jamo |
| # (leading consonant, vowel, trailing consonant in UAX #29 terms). |
| $L = [\p{Grapheme_Cluster_Break = L}]; |
| $V = [\p{Grapheme_Cluster_Break = V}]; |
| $T = [\p{Grapheme_Cluster_Break = T}]; |
| |
| # LV and LVT are the Grapheme_Cluster_Break values for precomposed Hangul syllables. |
| $LV = [\p{Grapheme_Cluster_Break = LV}]; |
| $LVT = [\p{Grapheme_Cluster_Break = LVT}]; |
| |
| # Emoji definitions scraped from http://www.unicode.org/Public/emoji/2.0/emoji-data.txt |
| |
| # E_Base: explicit list of code points that the GB 9c rule keeps together with a |
| # following $E_Modifier. The list is hard-coded rather than taken from a property. |
| $E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918]; |
| |
| # E_Modifier: U+1F3FB..U+1F3FF, the emoji modifier characters. |
| $E_Modifier = [\U0001F3FB-\U0001F3FF]; |
| |
| # ZWJ: U+200D ZERO WIDTH JOINER. |
| $ZWJ = [\u200D]; |
| # GAZ ("Glue_After_Zwj"): explicit list of code points that the GB 9d rule keeps |
| # attached to a preceding $ZWJ. It is also accepted before $E_Modifier by GB 9c. |
| $GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764]; |
| |
| ## ------------------------------------------------- |
| # Forward rules. Each rule describes a sequence that must NOT be broken; |
| # a boundary is placed wherever no rule keeps adjacent characters together. |
| # !!chain enables rule chaining, so a match may continue from the last character |
| # of one rule into another rule that starts with that character. |
| # NOTE(review): !!lookAheadHardBreak presumably controls how the '/' look-ahead |
| # rule below interacts with chaining -- confirm against the ICU RBBI documentation. |
| !!chain; |
| !!lookAheadHardBreak; |
| !!forward; |
| |
| # Keep CR LF together. |
| $CR $LF; |
| |
| # Hangul syllable sequences: do not break inside L* V+ T* style jamo runs |
| # or between a precomposed LV/LVT syllable and the jamo that may follow it. |
| $L ($L | $V | $LV | $LVT); |
| ($LV | $V) ($V | $T); |
| ($LVT | $T) $T; |
| |
| # GB 8. Keep pairs of regional indicators together |
| # Note that the hard break '/' rule triggers only if there are three or more initial RIs; |
| # a lone pair is matched by the second rule. The leading '^' keeps other rules from |
| # chaining into these two, so pairing always starts at the beginning of a match. |
| |
| ^$Regional_Indicator $Regional_Indicator / $Regional_Indicator; |
| ^$Regional_Indicator $Regional_Indicator; |
| |
| # GB 9 |
| # Do not break before Extend or ZWJ, unless the preceding character is Control, CR or LF. |
| [^$Control $CR $LF] ($Extend | $ZWJ); |
| |
| # GB 9a (only for extended grapheme clusters) |
| # Do not break before SpacingMark, with the same Control/CR/LF exception. |
| [^$Control $CR $LF] $SpacingMark; |
| # GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF]; |
| |
| # GB9c Emoji proposal |
| # Do not break between an emoji base (or GAZ character) and a following modifier. |
| ($E_Base | $GAZ) $E_Modifier; |
| |
| # GB 9d Don't break between ZWJ and Glue_After_Zwj |
| $ZWJ $GAZ; |
| |
| ## ------------------------------------------------- |
| |
| # Reverse rules, used when iterating backwards. Each rule is the corresponding |
| # forward rule with its operands in mirrored order, because the text is scanned |
| # from the end towards the start. |
| !!reverse; |
| # Mirror of "$CR $LF". |
| $LF $CR; |
| # Mirrors of the three Hangul syllable rules. |
| ($L | $V | $LV | $LVT) $L; |
| ($V | $T) ($LV | $V); |
| $T ($LVT | $T); |
| |
| # GB 8. Going backwards, we must scan through any number of regional indicators as pairs. |
| # The pair before '/' is kept together only when the look-ahead context after '/' matches: |
| # zero or more further complete pairs, then either {eof} or a non-RI character. |
| # That is, only when an even number of RIs remains ahead in the scan direction. |
| # |
| $Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]]; |
| |
| # GB 9 |
| ($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed. |
| |
| # GB 9a |
| $SpacingMark [^$Control $CR $LF]; |
| # GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend; |
| |
| # GB 9c |
| # Mirror of "($E_Base | $GAZ) $E_Modifier". |
| $E_Modifier ($E_Base | $GAZ); |
| |
| # GB 9d Don't break between ZWJ and Glue_After_Zwj |
| $GAZ $ZWJ; |
| |
| ## ------------------------------------------------- |
| |
| # Safe-reverse rules. The only rule in this section matches a pair of regional indicators. |
| # NOTE(review): presumably needed because RI pairing depends on how many RIs precede |
| # a position -- confirm against the ICU RBBI safe-rule documentation. |
| !!safe_reverse; |
| $Regional_Indicator $Regional_Indicator; |
| |
| ## ------------------------------------------------- |
| |
| # Safe-forward rules. The only rule in this section matches a pair of regional indicators, |
| # the same pattern used in the !!safe_reverse section. |
| !!safe_forward; |
| $Regional_Indicator $Regional_Indicator; |