12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package cases
- // This file contains the definitions of case mappings for all supported
- // languages. The rules for the language-specific tailorings were taken and
- // modified from the CLDR transform definitions in common/transforms.
- import (
- "strings"
- "unicode"
- "unicode/utf8"
- "golang.org/x/text/internal"
- "golang.org/x/text/language"
- "golang.org/x/text/transform"
- "golang.org/x/text/unicode/norm"
- )
// A mapFunc takes a context set to the current rune and writes the mapped
// version to the same context. It may advance the context to the next rune. It
// returns whether a checkpoint is possible: whether the pDst bytes written to
// dst so far won't need changing as we see more source bytes.
type mapFunc func(*context) bool

// A spanFunc takes a context set to the current rune and reports whether the
// span may continue past this rune. It may advance the context to the next
// rune. The Span methods in this file keep looping, and take a checkpoint,
// only while the spanFunc returns true; a false result ends the span.
type spanFunc func(*context) bool

// maxIgnorable defines the maximum number of ignorables to consider for
// lookahead operations.
const maxIgnorable = 30

// supported lists the language tags for which we have tailorings. It is a
// space-separated list that init splits and parses. The per-language tables
// (upperFunc, lowerFunc, titleInfos) are annotated in this same order.
const supported = "und af az el lt nl tr"
- func init() {
- tags := []language.Tag{}
- for _, s := range strings.Split(supported, " ") {
- tags = append(tags, language.MustParse(s))
- }
- matcher = internal.NewInheritanceMatcher(tags)
- Supported = language.NewCoverage(tags)
- }
var (
	// matcher is built in init from the supported tags. The make* functions
	// use the index it returns to select an entry in the tables below.
	matcher *internal.InheritanceMatcher

	// Supported is the coverage built in init from the supported tags, i.e.
	// the languages for which this file has tailorings.
	Supported language.Coverage

	// We keep the following lists separate, instead of having a single per-
	// language struct, to give the compiler a chance to remove unused code.

	// Some uppercase mappers are stateless, so we can precompute the
	// Transformers and save a bit on runtime allocations.
	//
	// A nil upper entry means there is no language-specific mapper; makeUpper
	// then returns the shared undUpper transformer.
	upperFunc = []struct {
		upper mapFunc
		span  spanFunc
	}{
		{nil, nil},                  // und
		{nil, nil},                  // af
		{aztrUpper(upper), isUpper}, // az
		{elUpper, noSpan},           // el
		{ltUpper(upper), noSpan},    // lt
		{nil, nil},                  // nl
		{aztrUpper(upper), isUpper}, // tr
	}

	// Shared transformers for the root locale, returned by makeUpper and
	// makeLower when no language-specific mapper applies.
	undUpper            transform.SpanningTransformer = &undUpperCaser{}
	undLower            transform.SpanningTransformer = &undLowerCaser{}
	undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}

	// lowerFunc holds the language-specific lower-case mappers. A nil entry
	// makes makeLower fall back to undLower or undLowerIgnoreSigma.
	lowerFunc = []mapFunc{
		nil,       // und
		nil,       // af
		aztrLower, // az
		nil,       // el
		ltLower,   // lt
		nil,       // nl
		aztrLower, // tr
	}

	// titleInfos holds, per language, the functions makeTitle copies into a
	// titleCaser. rewrite may be nil; titleCaser only calls it when non-nil.
	titleInfos = []struct {
		title     mapFunc
		lower     mapFunc
		titleSpan spanFunc
		rewrite   func(*context)
	}{
		{title, lower, isTitle, nil},                // und
		{title, lower, isTitle, afnlRewrite},        // af
		{aztrUpper(title), aztrLower, isTitle, nil}, // az
		{title, lower, isTitle, nil},                // el
		{ltUpper(title), ltLower, noSpan, nil},      // lt
		{nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
		{aztrUpper(title), aztrLower, isTitle, nil}, // tr
	}
)
- func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
- _, i, _ := matcher.Match(t)
- f := upperFunc[i].upper
- if f == nil {
- return undUpper
- }
- return &simpleCaser{f: f, span: upperFunc[i].span}
- }
- func makeLower(t language.Tag, o options) transform.SpanningTransformer {
- _, i, _ := matcher.Match(t)
- f := lowerFunc[i]
- if f == nil {
- if o.ignoreFinalSigma {
- return undLowerIgnoreSigma
- }
- return undLower
- }
- if o.ignoreFinalSigma {
- return &simpleCaser{f: f, span: isLower}
- }
- return &lowerCaser{
- first: f,
- midWord: finalSigma(f),
- }
- }
- func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
- _, i, _ := matcher.Match(t)
- x := &titleInfos[i]
- lower := x.lower
- if o.noLower {
- lower = (*context).copy
- } else if !o.ignoreFinalSigma {
- lower = finalSigma(lower)
- }
- return &titleCaser{
- title: x.title,
- lower: lower,
- titleSpan: x.titleSpan,
- rewrite: x.rewrite,
- }
- }
// noSpan is the spanFunc used for casers that have no span implementation. It
// records transform.ErrEndOfSpan on the context and returns false, so a Span
// loop that calls it stops at the rune it was called for.
func noSpan(c *context) bool {
	c.err = transform.ErrEndOfSpan
	return false
}
// TODO: consider a similar special case for the fast majority lower case. This
// is a bit more involved so will require some more precise benchmarking to
// justify it.

// undUpperCaser is the transformer for upper-casing in the root locale (und).
// It has no fields of its own and embeds transform.NopResetter.
type undUpperCaser struct{ transform.NopResetter }
- // undUpperCaser implements the Transformer interface for doing an upper case
- // mapping for the root locale (und). It eliminates the need for an allocation
- // as it prevents escaping by not using function pointers.
- func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- c := context{dst: dst, src: src, atEOF: atEOF}
- for c.next() {
- upper(&c)
- c.checkpoint()
- }
- return c.ret()
- }
- func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
- c := context{src: src, atEOF: atEOF}
- for c.next() && isUpper(&c) {
- c.checkpoint()
- }
- return c.retSpan()
- }
// undLowerIgnoreSigmaCaser implements the Transformer interface for doing
// a lower case mapping for the root locale (und) ignoring final sigma
// handling. This casing algorithm is used in some performance-critical packages
// like secure/precis and x/net/http/idna, which warrants its special-casing.
//
// The type has no fields of its own and embeds transform.NopResetter.
type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
- func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- c := context{dst: dst, src: src, atEOF: atEOF}
- for c.next() && lower(&c) {
- c.checkpoint()
- }
- return c.ret()
- }
- // Span implements a generic lower-casing. This is possible as isLower works
- // for all lowercasing variants. All lowercase variants only vary in how they
- // transform a non-lowercase letter. They will never change an already lowercase
- // letter. In addition, there is no state.
- func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
- c := context{src: src, atEOF: atEOF}
- for c.next() && isLower(&c) {
- c.checkpoint()
- }
- return c.retSpan()
- }
// simpleCaser is a transformer that applies a single mapFunc to every rune
// and uses a single spanFunc for Span.
//
// NOTE(review): Transform and Span build a local context and do not touch the
// embedded one; the embedding presumably exists to provide Reset — confirm.
type simpleCaser struct {
	context
	f    mapFunc  // mapping applied to each rune by Transform
	span spanFunc // predicate applied to each rune by Span
}
- // simpleCaser implements the Transformer interface for doing a case operation
- // on a rune-by-rune basis.
- func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- c := context{dst: dst, src: src, atEOF: atEOF}
- for c.next() && t.f(&c) {
- c.checkpoint()
- }
- return c.ret()
- }
- func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
- c := context{src: src, atEOF: atEOF}
- for c.next() && t.span(&c) {
- c.checkpoint()
- }
- return c.retSpan()
- }
// undLowerCaser implements the Transformer interface for doing a lower case
// mapping for the root locale (und), including final sigma handling: its
// Transform passes a Σ that occurs after the first cased rune of a word to
// finalSigmaBody. It calls lower and finalSigmaBody directly instead of going
// through the function values used by lowerCaser.
type undLowerCaser struct{ transform.NopResetter }
- func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- c := context{dst: dst, src: src, atEOF: atEOF}
- for isInterWord := true; c.next(); {
- if isInterWord {
- if c.info.isCased() {
- if !lower(&c) {
- break
- }
- isInterWord = false
- } else if !c.copy() {
- break
- }
- } else {
- if c.info.isNotCasedAndNotCaseIgnorable() {
- if !c.copy() {
- break
- }
- isInterWord = true
- } else if !c.hasPrefix("Σ") {
- if !lower(&c) {
- break
- }
- } else if !finalSigmaBody(&c) {
- break
- }
- }
- c.checkpoint()
- }
- return c.ret()
- }
- func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
- c := context{src: src, atEOF: atEOF}
- for c.next() && isLower(&c) {
- c.checkpoint()
- }
- return c.retSpan()
- }
// lowerCaser implements the Transformer interface. The default Unicode lower
// casing requires different treatment for the first and subsequent characters
// of a word, most notably to handle the Greek final Sigma.
//
// No Span method is declared for lowerCaser in this file; it embeds
// undLowerIgnoreSigmaCaser, which has one.
type lowerCaser struct {
	undLowerIgnoreSigmaCaser

	context

	// first is applied to the first cased rune of a word, midWord to the
	// remaining cased or case-ignorable runes of that word.
	first, midWord mapFunc
}
- func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
- t.context = context{dst: dst, src: src, atEOF: atEOF}
- c := &t.context
- for isInterWord := true; c.next(); {
- if isInterWord {
- if c.info.isCased() {
- if !t.first(c) {
- break
- }
- isInterWord = false
- } else if !c.copy() {
- break
- }
- } else {
- if c.info.isNotCasedAndNotCaseIgnorable() {
- if !c.copy() {
- break
- }
- isInterWord = true
- } else if !t.midWord(c) {
- break
- }
- }
- c.checkpoint()
- }
- return c.ret()
- }
// titleCaser implements the Transformer interface. Title casing algorithms
// distinguish between the first letter of a word and subsequent letters of the
// same word. It uses state to avoid requiring a potentially infinite lookahead.
//
// The state is the isMidWord flag of the embedded context, which Transform and
// Span copy into the fresh context they create on each call.
type titleCaser struct {
	context

	// rune mappings used by the actual casing algorithms.
	title     mapFunc        // applied to the first cased rune of a word
	lower     mapFunc        // applied to later cased runes of a word
	titleSpan spanFunc       // Span counterpart of title
	rewrite   func(*context) // optional per-rune hook; skipped when nil
}
// Transform implements the standard Unicode title case algorithm as defined in
// Chapter 3 of The Unicode Standard:
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
// first cased character F following the word boundary. If F exists, map F to
// Titlecase_Mapping(F); then map all characters C between F and the following
// word boundary to Lowercase_Mapping(C).
//
// The new context is seeded with the isMidWord value left in the embedded
// context, so the word state continues from wherever it was left.
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	// t.isMidWord is read from the old embedded context before it is replaced.
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.ret()
	}

	for {
		// Capture the properties of the current rune before any of the
		// callbacks below get a chance to advance the context.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				// First cased rune of a word: title-case it.
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		} else if p.isBreak() {
			c.isMidWord = false
		}

		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful write.
		// The checkpoint is skipped while inside a word on a mid rune.
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row end the current word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.ret()
}
// Span is the read-only counterpart of Transform. It follows the same word
// state machine, but instead of mapping runes it checks them: the first cased
// rune of a word with t.titleSpan and later cased runes with isLower. Uncased
// runes are not written anywhere; they only update the word state.
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	// t.isMidWord is read from the old embedded context before it is replaced.
	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.retSpan()
	}

	for {
		// Capture the properties of the current rune before any of the
		// callbacks below get a chance to advance the context.
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

		wasMid := p.isMid()
		// Break out of this loop on failure to ensure we do not modify the
		// state incorrectly.
		if p.isCased() {
			if !c.isMidWord {
				// First cased rune of a word: it must already be title case.
				if !t.titleSpan(c) {
					break
				}
				c.isMidWord = true
			} else if !isLower(c) {
				break
			}
		} else if p.isBreak() {
			c.isMidWord = false
		}
		// As we save the state of the transformer, it is safe to call
		// checkpoint after any successful step.
		// The checkpoint is skipped while inside a word on a mid rune.
		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		// Two mid runes in a row end the current word.
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.retSpan()
}
- // finalSigma adds Greek final Sigma handing to another casing function. It
- // determines whether a lowercased sigma should be σ or ς, by looking ahead for
- // case-ignorables and a cased letters.
- func finalSigma(f mapFunc) mapFunc {
- return func(c *context) bool {
- if !c.hasPrefix("Σ") {
- return f(c)
- }
- return finalSigmaBody(c)
- }
- }
// finalSigmaBody lower-cases a Σ at the current position. It first writes the
// final form ς and then looks ahead over at most maxIgnorable case-ignorable
// runes, copying them to the output. If the first rune that is not
// case-ignorable is cased, the written ς is patched into σ. That rune is
// unread so the caller processes it. It returns false if the source runs out
// during the lookahead.
func finalSigmaBody(c *context) bool {
	// Current rune must be Σ.

	// ::NFD();
	// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
	// Σ } [:case-ignorable:]* [:cased:] → σ;
	// [:cased:] [:case-ignorable:]* { Σ → ς;
	// ::Any-Lower;
	// ::NFC();

	// Remember where ς is written so it can be patched below.
	p := c.pDst
	c.writeString("ς")

	// TODO: we should do this here, but right now this will never have an
	// effect as this is called when the prefix is Sigma, whereas Dutch and
	// Afrikaans only test for an apostrophe.
	//
	// if t.rewrite != nil {
	// 	t.rewrite(c)
	// }

	// We need to do one more iteration after maxIgnorable, as a cased
	// letter is not an ignorable and may modify the result.
	wasMid := false
	for i := 0; i < maxIgnorable+1; i++ {
		if !c.next() {
			return false
		}
		if !c.info.isCaseIgnorable() {
			// All Midword runes are also case ignorable, so we are
			// guaranteed to have a letter or word break here. As we are
			// unreading the run, there is no need to unset c.isMidWord;
			// the title caser will handle this.
			if c.info.isCased() {
				// p+1 is guaranteed to be in bounds: if writing ς was
				// successful, p+1 will contain the second byte of ς. If not,
				// this function will have returned after c.next returned false.
				c.dst[p+1]++ // ς → σ
			}
			c.unreadRune()
			return true
		}
		// A case ignorable may also introduce a word break, so we may need
		// to continue searching even after detecting a break.
		isMid := c.info.isMid()
		if (wasMid && isMid) || c.info.isBreak() {
			c.isMidWord = false
		}
		wasMid = isMid
		c.copy()
	}
	return true
}
- // finalSigmaSpan would be the same as isLower.
// elUpper implements Greek upper casing, which entails removing a predefined
// set of non-blocked modifiers. Note that these accents should not be removed
// for title casing!
// Example: "Οδός" -> "ΟΔΟΣ".
//
// It upper-cases the current rune with upper and returns immediately if the
// source rune is not in the Greek script. Otherwise it replaces a decomposable
// result with its base letter and then consumes the modifiers that follow in
// the source, dropping the listed accents and copying the others.
func elUpper(c *context) bool {
	// From CLDR:
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
	// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;

	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	// Remember where the upper-cased rune starts in dst so it can be
	// inspected and, if needed, overwritten below.
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	if !unicode.Is(unicode.Greek, r) {
		return true
	}
	// i counts the modifiers handled so far, capped by maxIgnorable.
	i := 0
	// Take the properties of the uppercased rune that is already written to the
	// destination. This saves us the trouble of having to uppercase the
	// decomposed rune again.
	if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
		// Restore the destination position and process the decomposed rune.
		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}
		c.pDst = oldPDst
		// Insert the first rune and ignore the modifiers. See A.6.2.
		c.writeBytes(b[:sz])
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
		// Above and Iota Subscript
		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI
			0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
			// No-op. Gobble the modifier.

		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				// Not a modifier: hand the rune back to the caller.
				c.unreadRune()
				return true

			// We don't need to test for IotaSubscript as the only rune that
			// qualifies (U+0345) was already excluded in the switch statement
			// above. See A.4.

			case cccAbove:
				return c.copy()
			default:
				// Some other modifier. We're still allowed to gobble Greek
				// modifiers after this.
				c.copy()
			}
		}
	}
	// True only when the loop stopped because the modifier limit was reached;
	// false when c.next failed.
	return i == maxIgnorable
}
- // TODO: implement elUpperSpan (low-priority: complex and infrequent).
// ltLower implements Lithuanian lower casing. When an I or J, or a rune whose
// decomposition starts with I or J, is followed by a modifier with ccc Above,
// it inserts an explicit U+0307 COMBINING DOT ABOVE. All other runes are
// lower-cased with lower.
func ltLower(c *context) bool {
	// From CLDR:
	// # Introduce an explicit dot above when lowercasing capital I's and J's
	// # whenever there are more accents above.
	// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
	// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
	// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
	// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
	// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
	// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
	// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
	// ::NFD();
	// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
	// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
	// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
	// I \u0300 (Ì) → i \u0307 \u0300;
	// I \u0301 (Í) → i \u0307 \u0301;
	// I \u0303 (Ĩ) → i \u0307 \u0303;
	// ::Any-Lower();
	// ::NFC();

	// i counts the modifiers handled so far, capped by maxIgnorable.
	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		// ASCII: lower-case it; only I and J need the lookahead below.
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])
		if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
			// UTF-8 optimization: the decomposition will only have an above
			// modifier if the last rune of the decomposition is in [U+300-U+311].
			// In all other cases, a decomposition starting with I is always
			// an I followed by modifiers that are not cased themselves. See A.2.
			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				c.dst[c.pDst-1] += 'a' - 'A' // lower

				// Assumption: modifier never changes on lowercase. See A.1.
				// Assumption: all modifiers added have CCC = Above. See A.2.3.
				return c.writeString("\u0307") && c.writeBytes(d[1:])
			}
			// In all other cases the additional modifiers will have a CCC
			// that is less than 230 (Above). We will insert the U+0307, if
			// needed, after these modifiers so that a string in FCD form
			// will remain so. See A.2.2.
			lower(c)
			i = 1
		} else {
			return lower(c)
		}
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			// Not a modifier: hand the rune back to the caller.
			c.unreadRune()
			return true
		case cccAbove:
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	// True only when the loop stopped because the modifier limit was reached;
	// false when c.next failed.
	return i == maxIgnorable
}
- // ltLowerSpan would be the same as isLower.
- func ltUpper(f mapFunc) mapFunc {
- return func(c *context) bool {
- // Unicode:
- // 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
- //
- // From CLDR:
- // # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
- // # intervening non-230 marks.
- // ::NFD();
- // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
- // ::Any-Upper();
- // ::NFC();
- // TODO: See A.5. A soft-dotted rune never has an exception. This would
- // allow us to overload the exception bit and encode this property in
- // info. Need to measure performance impact of this.
- r, _ := utf8.DecodeRune(c.src[c.pSrc:])
- oldPDst := c.pDst
- if !f(c) {
- return false
- }
- if !unicode.Is(unicode.Soft_Dotted, r) {
- return true
- }
- // We don't need to do an NFD normalization, as a soft-dotted rune never
- // contains U+0307. See A.3.
- i := 0
- for ; i < maxIgnorable && c.next(); i++ {
- switch c.info.cccType() {
- case cccZero:
- c.unreadRune()
- return true
- case cccAbove:
- if c.hasPrefix("\u0307") {
- // We don't do a full NFC, but rather combine runes for
- // some of the common cases. (Returning NFC or
- // preserving normal form is neither a requirement nor
- // a possibility anyway).
- if !c.next() {
- return false
- }
- if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
- s := ""
- switch c.src[c.pSrc+1] {
- case 0x80: // U+0300 COMBINING GRAVE ACCENT
- s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
- case 0x81: // U+0301 COMBINING ACUTE ACCENT
- s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
- case 0x83: // U+0303 COMBINING TILDE
- s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
- case 0x88: // U+0308 COMBINING DIAERESIS
- s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
- default:
- }
- if s != "" {
- c.pDst = oldPDst
- return c.writeString(s)
- }
- }
- }
- return c.copy()
- default:
- c.copy()
- }
- }
- return i == maxIgnorable
- }
- }
- // TODO: implement ltUpperSpan (low priority: complex and infrequent).
- func aztrUpper(f mapFunc) mapFunc {
- return func(c *context) bool {
- // i→İ;
- if c.src[c.pSrc] == 'i' {
- return c.writeString("İ")
- }
- return f(c)
- }
- }
// aztrLower implements Turkish and Azeri lower casing. İ (U+0130) becomes i.
// An ASCII I becomes i when a U+0307 COMBINING DOT ABOVE follows it, possibly
// after other modifiers, and the U+0307 is dropped. Otherwise I becomes the
// dotless ı. All other runes are lower-cased with lower.
//
// The named result done is ANDed into the final return value. It is set when
// the lookahead ended on a ccc Above rune other than U+0307, on a ccc zero
// rune, or after maxIgnorable iterations. It stays false when c.next fails.
func aztrLower(c *context) (done bool) {
	// From CLDR:
	// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
	// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
	// İ→i;
	// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
	// # This matches the behavior of the canonically equivalent I-dot_above
	// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
	// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
	// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
	// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
	// I→ı ;
	// ::Any-Lower();
	if c.hasPrefix("\u0130") { // İ
		return c.writeString("i")
	}
	if c.src[c.pSrc] != 'I' {
		return lower(c)
	}

	// We ignore the lower-case I for now, but insert it later when we know
	// which form we need.
	// start is the source offset just past the I; the runes skipped during
	// the lookahead are written from here once the starter is known.
	start := c.pSrc + c.sz

	i := 0
Loop:
	// We check for up to n ignorables before \u0307. As \u0307 is an
	// ignorable as well, n is maxIgnorable-1.
	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccAbove:
			if c.hasPrefix("\u0307") {
				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
			}
			done = true
			break Loop
		case cccZero:
			// Not a modifier: hand the rune back to the caller.
			c.unreadRune()
			done = true
			break Loop
		default:
			// We'll write this rune after we know which starter to use.
		}
	}
	if i == maxIgnorable {
		done = true
	}
	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
}
- // aztrLowerSpan would be the same as isLower.
- func nlTitle(c *context) bool {
- // From CLDR:
- // # Special titlecasing for Dutch initial "ij".
- // ::Any-Title();
- // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
- // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
- if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
- return title(c)
- }
- if !c.writeString("I") || !c.next() {
- return false
- }
- if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
- return c.writeString("J")
- }
- c.unreadRune()
- return true
- }
- func nlTitleSpan(c *context) bool {
- // From CLDR:
- // # Special titlecasing for Dutch initial "ij".
- // ::Any-Title();
- // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
- // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
- if c.src[c.pSrc] != 'I' {
- return isTitle(c)
- }
- if !c.next() || c.src[c.pSrc] == 'j' {
- return false
- }
- if c.src[c.pSrc] != 'J' {
- c.unreadRune()
- }
- return true
- }
- // Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
- func afnlRewrite(c *context) {
- if c.hasPrefix("'") || c.hasPrefix("’") {
- c.isMidWord = true
- }
- }
|