12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- // Copyright 2015 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package cases
- func (c info) cccVal() info {
- if c&exceptionBit != 0 {
- return info(exceptions[c>>exceptionShift]) & cccMask
- }
- return c & cccMask
- }
- func (c info) cccType() info {
- ccc := c.cccVal()
- if ccc <= cccZero {
- return cccZero
- }
- return ccc
- }
- // TODO: Implement full Unicode breaking algorithm:
- // 1) Implement breaking in separate package.
- // 2) Use the breaker here.
- // 3) Compare table size and performance of using the more generic breaker.
- //
- // Note that we can extend the current algorithm to be much more accurate. This
- // only makes sense, though, if the performance and/or space penalty of using
- // the generic breaker is big. Extra data will only be needed for non-cased
- // runes, which means there are sufficient bits left in the caseType.
- // ICU prohibits breaking in such cases as well.
- // For the purpose of title casing we use an approximation of the Unicode Word
- // Breaking algorithm defined in Annex #29:
- // https://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table.
- //
- // For our approximation, we group the Word Break types into the following
- // categories, with associated rules:
- //
- // 1) Letter:
- // ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ.
- // Rule: Never break between consecutive runes of this category.
- //
- // 2) Mid:
- // MidLetter, MidNumLet, Single_Quote.
- // (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn,
- // Me, Cf, Lm or Sk).
- // Rule: Don't break between Letter and Mid, but break between two Mids.
- //
- // 3) Break:
- // Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and
- // Other.
- // These categories should always result in a break between two cased letters.
- // Rule: Always break.
- //
- // Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
- // preventing a break between two cased letters. For now we will ignore this
- // (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
- // [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
- //
- // Note 2: the rule for Mid is very approximate, but works in most cases. To
- // improve, we could store the categories in the trie value and use a FA to
- // manage breaks. See TODO comment above.
- //
- // Note 3: according to the spec, it is possible for the Extend category to
- // introduce breaks between other categories grouped in Letter. However, this
- // is undesirable for our purposes. ICU prevents breaks in such cases as well.
- // isBreak returns whether this rune should introduce a break.
- func (c info) isBreak() bool {
- return c.cccVal() == cccBreak
- }
- // isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
- // Numeric, ExtendNumLet, or Extend.
- func (c info) isLetter() bool {
- ccc := c.cccVal()
- if ccc == cccZero {
- return !c.isCaseIgnorable()
- }
- return ccc != cccBreak
- }
|