map.go 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package cases
  5. // This file contains the definitions of case mappings for all supported
  6. // languages. The rules for the language-specific tailorings were taken and
  7. // modified from the CLDR transform definitions in common/transforms.
  8. import (
  9. "strings"
  10. "unicode"
  11. "unicode/utf8"
  12. "golang.org/x/text/internal"
  13. "golang.org/x/text/language"
  14. "golang.org/x/text/transform"
  15. "golang.org/x/text/unicode/norm"
  16. )
  17. // A mapFunc takes a context set to the current rune and writes the mapped
  18. // version to the same context. It may advance the context to the next rune. It
  19. // returns whether a checkpoint is possible: whether the pDst bytes written to
  20. // dst so far won't need changing as we see more source bytes.
  21. type mapFunc func(*context) bool
  22. // A spanFunc takes a context set to the current rune and returns whether this
  23. // rune would be altered when written to the output. It may advance the context
  24. // to the next rune. It returns whether a checkpoint is possible.
  25. type spanFunc func(*context) bool
  26. // maxIgnorable defines the maximum number of ignorables to consider for
  27. // lookahead operations.
  28. const maxIgnorable = 30
  29. // supported lists the language tags for which we have tailorings.
  30. const supported = "und af az el lt nl tr"
  31. func init() {
  32. tags := []language.Tag{}
  33. for _, s := range strings.Split(supported, " ") {
  34. tags = append(tags, language.MustParse(s))
  35. }
  36. matcher = internal.NewInheritanceMatcher(tags)
  37. Supported = language.NewCoverage(tags)
  38. }
  39. var (
  40. matcher *internal.InheritanceMatcher
  41. Supported language.Coverage
  42. // We keep the following lists separate, instead of having a single per-
  43. // language struct, to give the compiler a chance to remove unused code.
  44. // Some uppercase mappers are stateless, so we can precompute the
  45. // Transformers and save a bit on runtime allocations.
  46. upperFunc = []struct {
  47. upper mapFunc
  48. span spanFunc
  49. }{
  50. {nil, nil}, // und
  51. {nil, nil}, // af
  52. {aztrUpper(upper), isUpper}, // az
  53. {elUpper, noSpan}, // el
  54. {ltUpper(upper), noSpan}, // lt
  55. {nil, nil}, // nl
  56. {aztrUpper(upper), isUpper}, // tr
  57. }
  58. undUpper transform.SpanningTransformer = &undUpperCaser{}
  59. undLower transform.SpanningTransformer = &undLowerCaser{}
  60. undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}
  61. lowerFunc = []mapFunc{
  62. nil, // und
  63. nil, // af
  64. aztrLower, // az
  65. nil, // el
  66. ltLower, // lt
  67. nil, // nl
  68. aztrLower, // tr
  69. }
  70. titleInfos = []struct {
  71. title mapFunc
  72. lower mapFunc
  73. titleSpan spanFunc
  74. rewrite func(*context)
  75. }{
  76. {title, lower, isTitle, nil}, // und
  77. {title, lower, isTitle, afnlRewrite}, // af
  78. {aztrUpper(title), aztrLower, isTitle, nil}, // az
  79. {title, lower, isTitle, nil}, // el
  80. {ltUpper(title), ltLower, noSpan, nil}, // lt
  81. {nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl
  82. {aztrUpper(title), aztrLower, isTitle, nil}, // tr
  83. }
  84. )
  85. func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
  86. _, i, _ := matcher.Match(t)
  87. f := upperFunc[i].upper
  88. if f == nil {
  89. return undUpper
  90. }
  91. return &simpleCaser{f: f, span: upperFunc[i].span}
  92. }
  93. func makeLower(t language.Tag, o options) transform.SpanningTransformer {
  94. _, i, _ := matcher.Match(t)
  95. f := lowerFunc[i]
  96. if f == nil {
  97. if o.ignoreFinalSigma {
  98. return undLowerIgnoreSigma
  99. }
  100. return undLower
  101. }
  102. if o.ignoreFinalSigma {
  103. return &simpleCaser{f: f, span: isLower}
  104. }
  105. return &lowerCaser{
  106. first: f,
  107. midWord: finalSigma(f),
  108. }
  109. }
  110. func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
  111. _, i, _ := matcher.Match(t)
  112. x := &titleInfos[i]
  113. lower := x.lower
  114. if o.noLower {
  115. lower = (*context).copy
  116. } else if !o.ignoreFinalSigma {
  117. lower = finalSigma(lower)
  118. }
  119. return &titleCaser{
  120. title: x.title,
  121. lower: lower,
  122. titleSpan: x.titleSpan,
  123. rewrite: x.rewrite,
  124. }
  125. }
  126. func noSpan(c *context) bool {
  127. c.err = transform.ErrEndOfSpan
  128. return false
  129. }
  130. // TODO: consider a similar special case for the fast majority lower case. This
  131. // is a bit more involved so will require some more precise benchmarking to
  132. // justify it.
  133. type undUpperCaser struct{ transform.NopResetter }
  134. // undUpperCaser implements the Transformer interface for doing an upper case
  135. // mapping for the root locale (und). It eliminates the need for an allocation
  136. // as it prevents escaping by not using function pointers.
  137. func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  138. c := context{dst: dst, src: src, atEOF: atEOF}
  139. for c.next() {
  140. upper(&c)
  141. c.checkpoint()
  142. }
  143. return c.ret()
  144. }
  145. func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
  146. c := context{src: src, atEOF: atEOF}
  147. for c.next() && isUpper(&c) {
  148. c.checkpoint()
  149. }
  150. return c.retSpan()
  151. }
  152. // undLowerIgnoreSigmaCaser implements the Transformer interface for doing
  153. // a lower case mapping for the root locale (und) ignoring final sigma
  154. // handling. This casing algorithm is used in some performance-critical packages
  155. // like secure/precis and x/net/http/idna, which warrants its special-casing.
  156. type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }
  157. func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  158. c := context{dst: dst, src: src, atEOF: atEOF}
  159. for c.next() && lower(&c) {
  160. c.checkpoint()
  161. }
  162. return c.ret()
  163. }
  164. // Span implements a generic lower-casing. This is possible as isLower works
  165. // for all lowercasing variants. All lowercase variants only vary in how they
  166. // transform a non-lowercase letter. They will never change an already lowercase
  167. // letter. In addition, there is no state.
  168. func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
  169. c := context{src: src, atEOF: atEOF}
  170. for c.next() && isLower(&c) {
  171. c.checkpoint()
  172. }
  173. return c.retSpan()
  174. }
  175. type simpleCaser struct {
  176. context
  177. f mapFunc
  178. span spanFunc
  179. }
  180. // simpleCaser implements the Transformer interface for doing a case operation
  181. // on a rune-by-rune basis.
  182. func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  183. c := context{dst: dst, src: src, atEOF: atEOF}
  184. for c.next() && t.f(&c) {
  185. c.checkpoint()
  186. }
  187. return c.ret()
  188. }
  189. func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
  190. c := context{src: src, atEOF: atEOF}
  191. for c.next() && t.span(&c) {
  192. c.checkpoint()
  193. }
  194. return c.retSpan()
  195. }
  196. // undLowerCaser implements the Transformer interface for doing a lower case
  197. // mapping for the root locale (und) ignoring final sigma handling. This casing
  198. // algorithm is used in some performance-critical packages like secure/precis
  199. // and x/net/http/idna, which warrants its special-casing.
  200. type undLowerCaser struct{ transform.NopResetter }
  201. func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  202. c := context{dst: dst, src: src, atEOF: atEOF}
  203. for isInterWord := true; c.next(); {
  204. if isInterWord {
  205. if c.info.isCased() {
  206. if !lower(&c) {
  207. break
  208. }
  209. isInterWord = false
  210. } else if !c.copy() {
  211. break
  212. }
  213. } else {
  214. if c.info.isNotCasedAndNotCaseIgnorable() {
  215. if !c.copy() {
  216. break
  217. }
  218. isInterWord = true
  219. } else if !c.hasPrefix("Σ") {
  220. if !lower(&c) {
  221. break
  222. }
  223. } else if !finalSigmaBody(&c) {
  224. break
  225. }
  226. }
  227. c.checkpoint()
  228. }
  229. return c.ret()
  230. }
  231. func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
  232. c := context{src: src, atEOF: atEOF}
  233. for c.next() && isLower(&c) {
  234. c.checkpoint()
  235. }
  236. return c.retSpan()
  237. }
  238. // lowerCaser implements the Transformer interface. The default Unicode lower
  239. // casing requires different treatment for the first and subsequent characters
  240. // of a word, most notably to handle the Greek final Sigma.
  241. type lowerCaser struct {
  242. undLowerIgnoreSigmaCaser
  243. context
  244. first, midWord mapFunc
  245. }
  246. func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  247. t.context = context{dst: dst, src: src, atEOF: atEOF}
  248. c := &t.context
  249. for isInterWord := true; c.next(); {
  250. if isInterWord {
  251. if c.info.isCased() {
  252. if !t.first(c) {
  253. break
  254. }
  255. isInterWord = false
  256. } else if !c.copy() {
  257. break
  258. }
  259. } else {
  260. if c.info.isNotCasedAndNotCaseIgnorable() {
  261. if !c.copy() {
  262. break
  263. }
  264. isInterWord = true
  265. } else if !t.midWord(c) {
  266. break
  267. }
  268. }
  269. c.checkpoint()
  270. }
  271. return c.ret()
  272. }
  273. // titleCaser implements the Transformer interface. Title casing algorithms
  274. // distinguish between the first letter of a word and subsequent letters of the
  275. // same word. It uses state to avoid requiring a potentially infinite lookahead.
  276. type titleCaser struct {
  277. context
  278. // rune mappings used by the actual casing algorithms.
  279. title mapFunc
  280. lower mapFunc
  281. titleSpan spanFunc
  282. rewrite func(*context)
  283. }
  284. // Transform implements the standard Unicode title case algorithm as defined in
  285. // Chapter 3 of The Unicode Standard:
  286. // toTitlecase(X): Find the word boundaries in X according to Unicode Standard
  287. // Annex #29, "Unicode Text Segmentation." For each word boundary, find the
  288. // first cased character F following the word boundary. If F exists, map F to
  289. // Titlecase_Mapping(F); then map all characters C between F and the following
  290. // word boundary to Lowercase_Mapping(C).
  291. func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  292. t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
  293. c := &t.context
  294. if !c.next() {
  295. return c.ret()
  296. }
  297. for {
  298. p := c.info
  299. if t.rewrite != nil {
  300. t.rewrite(c)
  301. }
  302. wasMid := p.isMid()
  303. // Break out of this loop on failure to ensure we do not modify the
  304. // state incorrectly.
  305. if p.isCased() {
  306. if !c.isMidWord {
  307. if !t.title(c) {
  308. break
  309. }
  310. c.isMidWord = true
  311. } else if !t.lower(c) {
  312. break
  313. }
  314. } else if !c.copy() {
  315. break
  316. } else if p.isBreak() {
  317. c.isMidWord = false
  318. }
  319. // As we save the state of the transformer, it is safe to call
  320. // checkpoint after any successful write.
  321. if !(c.isMidWord && wasMid) {
  322. c.checkpoint()
  323. }
  324. if !c.next() {
  325. break
  326. }
  327. if wasMid && c.info.isMid() {
  328. c.isMidWord = false
  329. }
  330. }
  331. return c.ret()
  332. }
  333. func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
  334. t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
  335. c := &t.context
  336. if !c.next() {
  337. return c.retSpan()
  338. }
  339. for {
  340. p := c.info
  341. if t.rewrite != nil {
  342. t.rewrite(c)
  343. }
  344. wasMid := p.isMid()
  345. // Break out of this loop on failure to ensure we do not modify the
  346. // state incorrectly.
  347. if p.isCased() {
  348. if !c.isMidWord {
  349. if !t.titleSpan(c) {
  350. break
  351. }
  352. c.isMidWord = true
  353. } else if !isLower(c) {
  354. break
  355. }
  356. } else if p.isBreak() {
  357. c.isMidWord = false
  358. }
  359. // As we save the state of the transformer, it is safe to call
  360. // checkpoint after any successful write.
  361. if !(c.isMidWord && wasMid) {
  362. c.checkpoint()
  363. }
  364. if !c.next() {
  365. break
  366. }
  367. if wasMid && c.info.isMid() {
  368. c.isMidWord = false
  369. }
  370. }
  371. return c.retSpan()
  372. }
  373. // finalSigma adds Greek final Sigma handing to another casing function. It
  374. // determines whether a lowercased sigma should be σ or ς, by looking ahead for
  375. // case-ignorables and a cased letters.
  376. func finalSigma(f mapFunc) mapFunc {
  377. return func(c *context) bool {
  378. if !c.hasPrefix("Σ") {
  379. return f(c)
  380. }
  381. return finalSigmaBody(c)
  382. }
  383. }
  384. func finalSigmaBody(c *context) bool {
  385. // Current rune must be ∑.
  386. // ::NFD();
  387. // # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
  388. // Σ } [:case-ignorable:]* [:cased:] → σ;
  389. // [:cased:] [:case-ignorable:]* { Σ → ς;
  390. // ::Any-Lower;
  391. // ::NFC();
  392. p := c.pDst
  393. c.writeString("ς")
  394. // TODO: we should do this here, but right now this will never have an
  395. // effect as this is called when the prefix is Sigma, whereas Dutch and
  396. // Afrikaans only test for an apostrophe.
  397. //
  398. // if t.rewrite != nil {
  399. // t.rewrite(c)
  400. // }
  401. // We need to do one more iteration after maxIgnorable, as a cased
  402. // letter is not an ignorable and may modify the result.
  403. wasMid := false
  404. for i := 0; i < maxIgnorable+1; i++ {
  405. if !c.next() {
  406. return false
  407. }
  408. if !c.info.isCaseIgnorable() {
  409. // All Midword runes are also case ignorable, so we are
  410. // guaranteed to have a letter or word break here. As we are
  411. // unreading the run, there is no need to unset c.isMidWord;
  412. // the title caser will handle this.
  413. if c.info.isCased() {
  414. // p+1 is guaranteed to be in bounds: if writing ς was
  415. // successful, p+1 will contain the second byte of ς. If not,
  416. // this function will have returned after c.next returned false.
  417. c.dst[p+1]++ // ς → σ
  418. }
  419. c.unreadRune()
  420. return true
  421. }
  422. // A case ignorable may also introduce a word break, so we may need
  423. // to continue searching even after detecting a break.
  424. isMid := c.info.isMid()
  425. if (wasMid && isMid) || c.info.isBreak() {
  426. c.isMidWord = false
  427. }
  428. wasMid = isMid
  429. c.copy()
  430. }
  431. return true
  432. }
  433. // finalSigmaSpan would be the same as isLower.
  434. // elUpper implements Greek upper casing, which entails removing a predefined
  435. // set of non-blocked modifiers. Note that these accents should not be removed
  436. // for title casing!
  437. // Example: "Οδός" -> "ΟΔΟΣ".
  438. func elUpper(c *context) bool {
  439. // From CLDR:
  440. // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
  441. // [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
  442. r, _ := utf8.DecodeRune(c.src[c.pSrc:])
  443. oldPDst := c.pDst
  444. if !upper(c) {
  445. return false
  446. }
  447. if !unicode.Is(unicode.Greek, r) {
  448. return true
  449. }
  450. i := 0
  451. // Take the properties of the uppercased rune that is already written to the
  452. // destination. This saves us the trouble of having to uppercase the
  453. // decomposed rune again.
  454. if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
  455. // Restore the destination position and process the decomposed rune.
  456. r, sz := utf8.DecodeRune(b)
  457. if r <= 0xFF { // See A.6.1
  458. return true
  459. }
  460. c.pDst = oldPDst
  461. // Insert the first rune and ignore the modifiers. See A.6.2.
  462. c.writeBytes(b[:sz])
  463. i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
  464. }
  465. for ; i < maxIgnorable && c.next(); i++ {
  466. switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
  467. // Above and Iota Subscript
  468. case 0x0300, // U+0300 COMBINING GRAVE ACCENT
  469. 0x0301, // U+0301 COMBINING ACUTE ACCENT
  470. 0x0304, // U+0304 COMBINING MACRON
  471. 0x0306, // U+0306 COMBINING BREVE
  472. 0x0308, // U+0308 COMBINING DIAERESIS
  473. 0x0313, // U+0313 COMBINING COMMA ABOVE
  474. 0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
  475. 0x0342, // U+0342 COMBINING GREEK PERISPOMENI
  476. 0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
  477. // No-op. Gobble the modifier.
  478. default:
  479. switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
  480. case cccZero:
  481. c.unreadRune()
  482. return true
  483. // We don't need to test for IotaSubscript as the only rune that
  484. // qualifies (U+0345) was already excluded in the switch statement
  485. // above. See A.4.
  486. case cccAbove:
  487. return c.copy()
  488. default:
  489. // Some other modifier. We're still allowed to gobble Greek
  490. // modifiers after this.
  491. c.copy()
  492. }
  493. }
  494. }
  495. return i == maxIgnorable
  496. }
  497. // TODO: implement elUpperSpan (low-priority: complex and infrequent).
  498. func ltLower(c *context) bool {
  499. // From CLDR:
  500. // # Introduce an explicit dot above when lowercasing capital I's and J's
  501. // # whenever there are more accents above.
  502. // # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
  503. // # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
  504. // # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
  505. // # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
  506. // # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
  507. // # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
  508. // # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
  509. // ::NFD();
  510. // I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
  511. // J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
  512. // I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
  513. // I \u0300 (Ì) → i \u0307 \u0300;
  514. // I \u0301 (Í) → i \u0307 \u0301;
  515. // I \u0303 (Ĩ) → i \u0307 \u0303;
  516. // ::Any-Lower();
  517. // ::NFC();
  518. i := 0
  519. if r := c.src[c.pSrc]; r < utf8.RuneSelf {
  520. lower(c)
  521. if r != 'I' && r != 'J' {
  522. return true
  523. }
  524. } else {
  525. p := norm.NFD.Properties(c.src[c.pSrc:])
  526. if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
  527. // UTF-8 optimization: the decomposition will only have an above
  528. // modifier if the last rune of the decomposition is in [U+300-U+311].
  529. // In all other cases, a decomposition starting with I is always
  530. // an I followed by modifiers that are not cased themselves. See A.2.
  531. if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
  532. if !c.writeBytes(d[:1]) {
  533. return false
  534. }
  535. c.dst[c.pDst-1] += 'a' - 'A' // lower
  536. // Assumption: modifier never changes on lowercase. See A.1.
  537. // Assumption: all modifiers added have CCC = Above. See A.2.3.
  538. return c.writeString("\u0307") && c.writeBytes(d[1:])
  539. }
  540. // In all other cases the additional modifiers will have a CCC
  541. // that is less than 230 (Above). We will insert the U+0307, if
  542. // needed, after these modifiers so that a string in FCD form
  543. // will remain so. See A.2.2.
  544. lower(c)
  545. i = 1
  546. } else {
  547. return lower(c)
  548. }
  549. }
  550. for ; i < maxIgnorable && c.next(); i++ {
  551. switch c.info.cccType() {
  552. case cccZero:
  553. c.unreadRune()
  554. return true
  555. case cccAbove:
  556. return c.writeString("\u0307") && c.copy() // See A.1.
  557. default:
  558. c.copy() // See A.1.
  559. }
  560. }
  561. return i == maxIgnorable
  562. }
  563. // ltLowerSpan would be the same as isLower.
  564. func ltUpper(f mapFunc) mapFunc {
  565. return func(c *context) bool {
  566. // Unicode:
  567. // 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
  568. //
  569. // From CLDR:
  570. // # Remove \u0307 following soft-dotteds (i, j, and the like), with possible
  571. // # intervening non-230 marks.
  572. // ::NFD();
  573. // [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
  574. // ::Any-Upper();
  575. // ::NFC();
  576. // TODO: See A.5. A soft-dotted rune never has an exception. This would
  577. // allow us to overload the exception bit and encode this property in
  578. // info. Need to measure performance impact of this.
  579. r, _ := utf8.DecodeRune(c.src[c.pSrc:])
  580. oldPDst := c.pDst
  581. if !f(c) {
  582. return false
  583. }
  584. if !unicode.Is(unicode.Soft_Dotted, r) {
  585. return true
  586. }
  587. // We don't need to do an NFD normalization, as a soft-dotted rune never
  588. // contains U+0307. See A.3.
  589. i := 0
  590. for ; i < maxIgnorable && c.next(); i++ {
  591. switch c.info.cccType() {
  592. case cccZero:
  593. c.unreadRune()
  594. return true
  595. case cccAbove:
  596. if c.hasPrefix("\u0307") {
  597. // We don't do a full NFC, but rather combine runes for
  598. // some of the common cases. (Returning NFC or
  599. // preserving normal form is neither a requirement nor
  600. // a possibility anyway).
  601. if !c.next() {
  602. return false
  603. }
  604. if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
  605. s := ""
  606. switch c.src[c.pSrc+1] {
  607. case 0x80: // U+0300 COMBINING GRAVE ACCENT
  608. s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
  609. case 0x81: // U+0301 COMBINING ACUTE ACCENT
  610. s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
  611. case 0x83: // U+0303 COMBINING TILDE
  612. s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
  613. case 0x88: // U+0308 COMBINING DIAERESIS
  614. s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
  615. default:
  616. }
  617. if s != "" {
  618. c.pDst = oldPDst
  619. return c.writeString(s)
  620. }
  621. }
  622. }
  623. return c.copy()
  624. default:
  625. c.copy()
  626. }
  627. }
  628. return i == maxIgnorable
  629. }
  630. }
  631. // TODO: implement ltUpperSpan (low priority: complex and infrequent).
  632. func aztrUpper(f mapFunc) mapFunc {
  633. return func(c *context) bool {
  634. // i→İ;
  635. if c.src[c.pSrc] == 'i' {
  636. return c.writeString("İ")
  637. }
  638. return f(c)
  639. }
  640. }
  641. func aztrLower(c *context) (done bool) {
  642. // From CLDR:
  643. // # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
  644. // # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  645. // İ→i;
  646. // # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
  647. // # This matches the behavior of the canonically equivalent I-dot_above
  648. // # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
  649. // # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
  650. // # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
  651. // I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
  652. // I→ı ;
  653. // ::Any-Lower();
  654. if c.hasPrefix("\u0130") { // İ
  655. return c.writeString("i")
  656. }
  657. if c.src[c.pSrc] != 'I' {
  658. return lower(c)
  659. }
  660. // We ignore the lower-case I for now, but insert it later when we know
  661. // which form we need.
  662. start := c.pSrc + c.sz
  663. i := 0
  664. Loop:
  665. // We check for up to n ignorables before \u0307. As \u0307 is an
  666. // ignorable as well, n is maxIgnorable-1.
  667. for ; i < maxIgnorable && c.next(); i++ {
  668. switch c.info.cccType() {
  669. case cccAbove:
  670. if c.hasPrefix("\u0307") {
  671. return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
  672. }
  673. done = true
  674. break Loop
  675. case cccZero:
  676. c.unreadRune()
  677. done = true
  678. break Loop
  679. default:
  680. // We'll write this rune after we know which starter to use.
  681. }
  682. }
  683. if i == maxIgnorable {
  684. done = true
  685. }
  686. return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
  687. }
  688. // aztrLowerSpan would be the same as isLower.
  689. func nlTitle(c *context) bool {
  690. // From CLDR:
  691. // # Special titlecasing for Dutch initial "ij".
  692. // ::Any-Title();
  693. // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
  694. // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
  695. if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
  696. return title(c)
  697. }
  698. if !c.writeString("I") || !c.next() {
  699. return false
  700. }
  701. if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
  702. return c.writeString("J")
  703. }
  704. c.unreadRune()
  705. return true
  706. }
  707. func nlTitleSpan(c *context) bool {
  708. // From CLDR:
  709. // # Special titlecasing for Dutch initial "ij".
  710. // ::Any-Title();
  711. // # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
  712. // [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
  713. if c.src[c.pSrc] != 'I' {
  714. return isTitle(c)
  715. }
  716. if !c.next() || c.src[c.pSrc] == 'j' {
  717. return false
  718. }
  719. if c.src[c.pSrc] != 'J' {
  720. c.unreadRune()
  721. }
  722. return true
  723. }
  724. // Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.
  725. func afnlRewrite(c *context) {
  726. if c.hasPrefix("'") || c.hasPrefix("’") {
  727. c.isMidWord = true
  728. }
  729. }