# blob: 9a2d843cd5371bdabfac71c4332febedffb2e8ea [file] [log] [blame]
#
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 28 (Draft 3) for Unicode Version 9.0
#
#
# Character Class Definitions.
#
# Each $Name below is a set variable that the break rules later in this file
# refer to. Most are taken directly from a Grapheme_Cluster_Break property
# value; the emoji-related sets at the end are explicit code point lists.
#
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
# The Tags block is subtracted from $Control here and added to $Extend below,
# so the rules in this file treat characters of that block as Extend rather
# than as Control.
$Control = [[\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]];
# TODO: Restore if the Prepend set becomes non-empty again: $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend = [[\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
#
# Korean Syllable Definitions
#
# These five classes are combined by the syllable-sequence rules in the
# forward and reverse sections.
#
$L = [\p{Grapheme_Cluster_Break = L}];
$V = [\p{Grapheme_Cluster_Break = V}];
$T = [\p{Grapheme_Cluster_Break = T}];
$LV = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];
# Emoji definitions scraped from http://www.unicode.org/Public/emoji/2.0/emoji-data.txt
# Unlike the sets above, these are hard-coded code point lists, so they do not
# follow property changes automatically and must be re-scraped by hand.
# $E_Base: characters that the GB9c rule keeps together with a following
# $E_Modifier.
$E_Base = [\u261D\u26F9\u270A-\u270D\U0001F385\U0001F3C3-\U0001F3C4\U0001F3CA-\U0001F3CB\U0001F442-\U0001F443\U0001F446-\U0001F450\U0001F466-\U0001F469\U0001F46E\U0001F470-\U0001F478\U0001F47C\U0001F481-\U0001F483\U0001F485-\U0001F487\U0001F4AA\U0001F575\U0001F590\U0001F595-\U0001F596\U0001F645-\U0001F647\U0001F64B-\U0001F64F\U0001F6A3\U0001F6B4-\U0001F6B6\U0001F6C0\U0001F918];
$E_Modifier = [\U0001F3FB-\U0001F3FF];
# U+200D, ZERO WIDTH JOINER.
$ZWJ = [\u200D];
# $GAZ is the Glue_After_Zwj set named in the GB 9d rule comments.
# Note that U+1F466-U+1F469 appear in both $E_Base and $GAZ.
$GAZ = [\U0001F466-\U0001F469\U0001F48B\U0001F5E8\u2764];
## -------------------------------------------------
# Rule-file options. They are not tied to one section; they affect how the
# rule sets in this file are interpreted by the break engine.
#
# !!chain: presumably enables rule chaining, where the end of one rule's match
# may serve as the start of another rule's match (the GB 9 reverse rule's
# comment relies on this) -- confirm against the ICU break-rule documentation.
!!chain;
# !!lookAheadHardBreak: presumably makes a '/' look-ahead rule force its break
# as soon as it matches (the GB 8 forward rule comment calls this the
# "hard break '/' rule") -- confirm against the ICU break-rule documentation.
!!lookAheadHardBreak;
# Forward rules. Each rule names a sequence of character classes that is kept
# together, i.e. no grapheme cluster boundary is placed inside the sequence.
!!forward;
# CR followed by LF is a single cluster.
$CR $LF;
# Korean syllable sequences: leading consonants, then vowels, then trailing
# consonants (see the Korean Syllable Definitions for the classes).
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;
# GB 8. Keep pairs of regional indicators together
# Note that the hard break '/' rule triggers only if there are three or more
# initial RIs: it matches a pair that is followed by a third RI and puts the
# break at the '/', i.e. right after the pair. The second rule covers a pair
# that is not followed by another RI.
# NOTE(review): the leading '^' appears to keep these rules from being chained
# into from another rule, so pairing always starts at the first RI -- confirm
# against the ICU break-rule syntax.
^$Regional_Indicator $Regional_Indicator / $Regional_Indicator;
^$Regional_Indicator $Regional_Indicator;
# GB 9
# Anything other than Control, CR or LF absorbs a following Extend or ZWJ.
[^$Control $CR $LF] ($Extend | $ZWJ);
# GB 9a (only for extended grapheme clusters)
# Same left-hand side as GB 9, absorbing a following SpacingMark.
[^$Control $CR $LF] $SpacingMark;
# GB 9b Restore if the Prepend set becomes non-empty again: $Prepend [^$Control $CR $LF];
# GB9c Emoji proposal
# An emoji base (or a $GAZ character) stays with a directly following modifier.
($E_Base | $GAZ) $E_Modifier;
# GB 9d Don't break between ZWJ and Glue_After_Zwj
$ZWJ $GAZ;
## -------------------------------------------------
# Reverse rules. These mirror the forward rules with the class order of each
# rule flipped, because the text is scanned from its end toward its start.
!!reverse;
# LF preceded by CR (CR LF in text order) is a single cluster.
$LF $CR;
# Korean syllable sequences, mirrored from the forward rules.
($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);
# GB 8. Going backwards, we must scan through any number of regional indicators as pairs.
#
# The rule matches one RI pair, then (after the '/') any number of further
# pairs, ending at either a non-RI character or {eof}. The pairs after the '/'
# are only scanned to establish how the run of RIs pairs up.
# NOTE(review): {eof} presumably stands for running off the end of the text in
# the scan direction (the start of the text here) -- confirm against the ICU
# break-rule syntax.
$Regional_Indicator $Regional_Indicator / ($Regional_Indicator $Regional_Indicator)* [{eof}[^$Regional_Indicator]];
# GB 9
($Extend | $ZWJ) [^$Control $CR $LF]; #note that this will chain into Regional_Indicator when needed.
# GB 9a
$SpacingMark [^$Control $CR $LF];
# GB 9b Restore if the Prepend set becomes non-empty again: [^$Control $CR $LF] $Prepend;
# GB 9c
# Mirror of the forward emoji rule: a modifier stays with the base before it.
$E_Modifier ($E_Base | $GAZ);
# GB 9d Don't break between ZWJ and Glue_After_Zwj
$GAZ $ZWJ;
## -------------------------------------------------
# Safe-reverse rules. The only sequence listed is a pair of regional
# indicators.
# NOTE(review): presumably the engine uses this section to back up to a
# position from which normal rule matching can safely restart, with RI pairs
# being the one context that needs this treatment -- confirm against the ICU
# break-rule documentation.
!!safe_reverse;
$Regional_Indicator $Regional_Indicator;
## -------------------------------------------------
# Safe-forward rules. As in the safe-reverse section, the only sequence listed
# is a pair of regional indicators.
# NOTE(review): presumably the forward-direction counterpart used when the
# engine has to resynchronize -- confirm against the ICU break-rule
# documentation.
!!safe_forward;
$Regional_Indicator $Regional_Indicator;