graphemerules.go

package uniseg

// The states of the grapheme cluster parser.
const (
	grAny = iota
	grCR
	grControlLF
	grL
	grLVV
	grLVTT
	grPrepend
	grExtendedPictographic
	grExtendedPictographicZWJ
	grRIOdd
	grRIEven
)

// The grapheme cluster parser's breaking instructions.
const (
	grNoBoundary = iota
	grBoundary
)

// The grapheme cluster parser's state transitions. Maps (state, property) to
// (new state, breaking instruction, rule number). The breaking instruction
// always refers to the boundary between the last and next code point.
//
// This map is queried as follows:
//
//  1. Find specific state + specific property. Stop if found.
//  2. Find specific state + any property.
//  3. Find any state + specific property.
//  4. If only (2) or (3) (but not both) was found, stop.
//  5. If both (2) and (3) were found, use state from (3) and breaking instruction
//     from the transition with the lower rule number, prefer (3) if rule numbers
//     are equal. Stop.
//  6. Assume grAny and grBoundary.
//
// Unicode version 14.0.0.
var grTransitions = map[[2]int][3]int{
	// GB5
	{grAny, prCR}:      {grCR, grBoundary, 50},
	{grAny, prLF}:      {grControlLF, grBoundary, 50},
	{grAny, prControl}: {grControlLF, grBoundary, 50},

	// GB4
	{grCR, prAny}:        {grAny, grBoundary, 40},
	{grControlLF, prAny}: {grAny, grBoundary, 40},

	// GB3.
	{grCR, prLF}: {grControlLF, grNoBoundary, 30},

	// GB6.
	{grAny, prL}: {grL, grBoundary, 9990},
	{grL, prL}:   {grL, grNoBoundary, 60},
	{grL, prV}:   {grLVV, grNoBoundary, 60},
	{grL, prLV}:  {grLVV, grNoBoundary, 60},
	{grL, prLVT}: {grLVTT, grNoBoundary, 60},

	// GB7.
	{grAny, prLV}: {grLVV, grBoundary, 9990},
	{grAny, prV}:  {grLVV, grBoundary, 9990},
	{grLVV, prV}:  {grLVV, grNoBoundary, 70},
	{grLVV, prT}:  {grLVTT, grNoBoundary, 70},

	// GB8.
	{grAny, prLVT}: {grLVTT, grBoundary, 9990},
	{grAny, prT}:   {grLVTT, grBoundary, 9990},
	{grLVTT, prT}:  {grLVTT, grNoBoundary, 80},

	// GB9.
	{grAny, prExtend}: {grAny, grNoBoundary, 90},
	{grAny, prZWJ}:    {grAny, grNoBoundary, 90},

	// GB9a.
	{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},

	// GB9b.
	{grAny, prPrepend}: {grPrepend, grBoundary, 9990},
	{grPrepend, prAny}: {grAny, grNoBoundary, 92},

	// GB11.
	{grAny, prExtendedPictographic}:                     {grExtendedPictographic, grBoundary, 9990},
	{grExtendedPictographic, prExtend}:                  {grExtendedPictographic, grNoBoundary, 110},
	{grExtendedPictographic, prZWJ}:                     {grExtendedPictographicZWJ, grNoBoundary, 110},
	{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},

	// GB12 / GB13.
	{grAny, prRegionalIndicator}:    {grRIOdd, grBoundary, 9990},
	{grRIOdd, prRegionalIndicator}:  {grRIEven, grNoBoundary, 120},
	{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
}

// transitionGraphemeState determines the new state of the grapheme cluster
// parser given the current state and the next code point. It also returns the
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
	// Determine the property of the next character.
	prop = property(graphemeCodePoints, r)

	// Find the applicable transition.
	transition, ok := grTransitions[[2]int{state, prop}]
	if ok {
		// We have a specific transition. We'll use it.
		return transition[0], prop, transition[1] == grBoundary
	}

	// No specific transition found. Try the less specific ones.
	transAnyProp, okAnyProp := grTransitions[[2]int{state, prAny}]
	transAnyState, okAnyState := grTransitions[[2]int{grAny, prop}]
	if okAnyProp && okAnyState {
		// Both apply. We'll use a mix (see comments for grTransitions).
		newState = transAnyState[0]
		boundary = transAnyState[1] == grBoundary
		if transAnyProp[2] < transAnyState[2] {
			boundary = transAnyProp[1] == grBoundary
		}
		return
	}

	if okAnyProp {
		// We only have a specific state.
		return transAnyProp[0], prop, transAnyProp[1] == grBoundary
		// This branch will probably never be reached because okAnyState will
		// always be true given the current transition map. But we keep it here
		// for future modifications to the transition map where this may not be
		// true anymore.
	}

	if okAnyState {
		// We only have a specific property.
		return transAnyState[0], prop, transAnyState[1] == grBoundary
	}

	// No known transition. GB999: Any ÷ Any.
	return grAny, prop, true
}