123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- // Copyright 2016 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package precis
- import "errors"
- // This file contains tables and code related to context rules.
// catBitmap is a set of category flags packed into a 16-bit word. Each flag
// below occupies one bit.
type catBitmap uint16

const (
	// Sticky flags: once one of these is set it is never cleared again
	// (they are all part of the permanent mask below).
	bJapanese                 catBitmap = 1 << 0
	bArabicIndicDigit         catBitmap = 1 << 1
	bExtendedArabicIndicDigit catBitmap = 1 << 2

	// Per-step flags: recomputed on every iteration from the current value.
	bJoinStart   catBitmap = 1 << 3
	bJoinMid     catBitmap = 1 << 4
	bJoinEnd     catBitmap = 1 << 5
	bVirama      catBitmap = 1 << 6
	bLatinSmallL catBitmap = 1 << 7
	bGreek       catBitmap = 1 << 8
	bHebrew      catBitmap = 1 << 9

	// "Must have" flags: these indicate which of the sticky flags need to be
	// set by the time the checks finish.
	bMustHaveJapn catBitmap = 1 << 10

	// permanent is the union of all flags that survive every transition.
	permanent = bJapanese | bArabicIndicDigit | bExtendedArabicIndicDigit | bMustHaveJapn
)

// finalShift is the distance between the bit position of bMustHaveJapn and
// that of bJapanese, i.e. bMustHaveJapn>>finalShift == bJapanese.
// NOTE(review): presumably used by the caller to line up the "must have"
// flags with the sticky flags they require; confirm at the use site.
const finalShift = 10
// errContext is the sentinel error returned by the transition rules in this
// file when a contextual rule is violated.
var errContext = errors.New("precis: contextual rule violated")
- func init() {
- // Programmatically set these required bits as, manually setting them seems
- // too error prone.
- for i, ct := range categoryTransitions {
- categoryTransitions[i].keep |= permanent
- categoryTransitions[i].accept |= ct.term
- }
- }
- var categoryTransitions = []struct {
- keep catBitmap // mask selecting which bits to keep from the previous state
- set catBitmap // mask for which bits to set for this transition
- // These bitmaps are used for rules that require lookahead.
- // term&accept == term must be true, which is enforced programmatically.
- term catBitmap // bits accepted as termination condition
- accept catBitmap // bits that pass, but not sufficient as termination
- // The rule function cannot take a *context as an argument, as it would
- // cause the context to escape, adding significant overhead.
- rule func(beforeBits catBitmap) (doLookahead bool, err error)
- }{
- joiningL: {set: bJoinStart},
- joiningD: {set: bJoinStart | bJoinEnd},
- joiningT: {keep: bJoinStart, set: bJoinMid},
- joiningR: {set: bJoinEnd},
- viramaModifier: {set: bVirama},
- viramaJoinT: {set: bVirama | bJoinMid},
- latinSmallL: {set: bLatinSmallL},
- greek: {set: bGreek},
- greekJoinT: {set: bGreek | bJoinMid},
- hebrew: {set: bHebrew},
- hebrewJoinT: {set: bHebrew | bJoinMid},
- japanese: {set: bJapanese},
- katakanaMiddleDot: {set: bMustHaveJapn},
- zeroWidthNonJoiner: {
- term: bJoinEnd,
- accept: bJoinMid,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bVirama != 0 {
- return false, nil
- }
- if before&bJoinStart == 0 {
- return false, errContext
- }
- return true, nil
- },
- },
- zeroWidthJoiner: {
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bVirama == 0 {
- err = errContext
- }
- return false, err
- },
- },
- middleDot: {
- term: bLatinSmallL,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bLatinSmallL == 0 {
- return false, errContext
- }
- return true, nil
- },
- },
- greekLowerNumeralSign: {
- set: bGreek,
- term: bGreek,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- return true, nil
- },
- },
- hebrewPreceding: {
- set: bHebrew,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bHebrew == 0 {
- err = errContext
- }
- return false, err
- },
- },
- arabicIndicDigit: {
- set: bArabicIndicDigit,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bExtendedArabicIndicDigit != 0 {
- err = errContext
- }
- return false, err
- },
- },
- extendedArabicIndicDigit: {
- set: bExtendedArabicIndicDigit,
- rule: func(before catBitmap) (doLookAhead bool, err error) {
- if before&bArabicIndicDigit != 0 {
- err = errContext
- }
- return false, err
- },
- },
- }
|