builder.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package build // import "golang.org/x/text/collate/build"
  5. import (
  6. "fmt"
  7. "io"
  8. "log"
  9. "sort"
  10. "strings"
  11. "unicode/utf8"
  12. "golang.org/x/text/internal/colltab"
  13. "golang.org/x/text/language"
  14. "golang.org/x/text/unicode/norm"
  15. )
  16. // TODO: optimizations:
  17. // - expandElem is currently 20K. By putting unique colElems in a separate
  18. // table and having a byte array of indexes into this table, we can reduce
  19. // the total size to about 7K. By also factoring out the length bytes, we
  20. // can reduce this to about 6K.
  21. // - trie valueBlocks are currently 100K. There are a lot of sparse blocks
  22. // and many consecutive values with the same stride. This can be further
  23. // compacted.
  24. // - Compress secondary weights into 8 bits.
  25. // - Some LDML specs specify a context element. Currently we simply concatenate
  26. // those. Context can be implemented using the contraction trie. If Builder
  27. // could analyze and detect when using a context makes sense, there is no
  28. // need to expose this construct in the API.
  29. // A Builder builds a root collation table. The user must specify the
  30. // collation elements for each entry. A common use will be to base the weights
  31. // on those specified in the allkeys* file as provided by the UCA or CLDR.
  32. type Builder struct {
  33. index *trieBuilder
  34. root ordering
  35. locale []*Tailoring
  36. t *table
  37. err error
  38. built bool
  39. minNonVar int // lowest primary recorded for a variable
  40. varTop int // highest primary recorded for a non-variable
  41. // indexes used for reusing expansions and contractions
  42. expIndex map[string]int // positions of expansions keyed by their string representation
  43. ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
  44. ctElem map[string]int // contraction elements keyed by their string representation
  45. }
  46. // A Tailoring builds a collation table based on another collation table.
  47. // The table is defined by specifying tailorings to the underlying table.
  48. // See https://unicode.org/reports/tr35/ for an overview of tailoring
  49. // collation tables. The CLDR contains pre-defined tailorings for a variety
  50. // of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
  51. type Tailoring struct {
  52. id string
  53. builder *Builder
  54. index *ordering
  55. anchor *entry
  56. before bool
  57. }
  58. // NewBuilder returns a new Builder.
  59. func NewBuilder() *Builder {
  60. return &Builder{
  61. index: newTrieBuilder(),
  62. root: makeRootOrdering(),
  63. expIndex: make(map[string]int),
  64. ctHandle: make(map[string]ctHandle),
  65. ctElem: make(map[string]int),
  66. }
  67. }
  68. // Tailoring returns a Tailoring for the given locale. One should
  69. // have completed all calls to Add before calling Tailoring.
  70. func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
  71. t := &Tailoring{
  72. id: loc.String(),
  73. builder: b,
  74. index: b.root.clone(),
  75. }
  76. t.index.id = t.id
  77. b.locale = append(b.locale, t)
  78. return t
  79. }
  80. // Add adds an entry to the collation element table, mapping
  81. // a slice of runes to a sequence of collation elements.
  82. // A collation element is specified as list of weights: []int{primary, secondary, ...}.
  83. // The entries are typically obtained from a collation element table
  84. // as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
  85. // Note that the collation elements specified by colelems are only used
  86. // as a guide. The actual weights generated by Builder may differ.
  87. // The argument variables is a list of indices into colelems that should contain
  88. // a value for each colelem that is a variable. (See the reference above.)
  89. func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
  90. str := string(runes)
  91. elems := make([]rawCE, len(colelems))
  92. for i, ce := range colelems {
  93. if len(ce) == 0 {
  94. break
  95. }
  96. elems[i] = makeRawCE(ce, 0)
  97. if len(ce) == 1 {
  98. elems[i].w[1] = defaultSecondary
  99. }
  100. if len(ce) <= 2 {
  101. elems[i].w[2] = defaultTertiary
  102. }
  103. if len(ce) <= 3 {
  104. elems[i].w[3] = ce[0]
  105. }
  106. }
  107. for i, ce := range elems {
  108. p := ce.w[0]
  109. isvar := false
  110. for _, j := range variables {
  111. if i == j {
  112. isvar = true
  113. }
  114. }
  115. if isvar {
  116. if p >= b.minNonVar && b.minNonVar > 0 {
  117. return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
  118. }
  119. if p > b.varTop {
  120. b.varTop = p
  121. }
  122. } else if p > 1 { // 1 is a special primary value reserved for FFFE
  123. if p <= b.varTop {
  124. return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
  125. }
  126. if b.minNonVar == 0 || p < b.minNonVar {
  127. b.minNonVar = p
  128. }
  129. }
  130. }
  131. elems, err := convertLargeWeights(elems)
  132. if err != nil {
  133. return err
  134. }
  135. cccs := []uint8{}
  136. nfd := norm.NFD.String(str)
  137. for i := range nfd {
  138. cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
  139. }
  140. if len(cccs) < len(elems) {
  141. if len(cccs) > 2 {
  142. return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
  143. }
  144. p := len(elems) - 1
  145. for ; p > 0 && elems[p].w[0] == 0; p-- {
  146. elems[p].ccc = cccs[len(cccs)-1]
  147. }
  148. for ; p >= 0; p-- {
  149. elems[p].ccc = cccs[0]
  150. }
  151. } else {
  152. for i := range elems {
  153. elems[i].ccc = cccs[i]
  154. }
  155. }
  156. // doNorm in collate.go assumes that the following conditions hold.
  157. if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
  158. return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
  159. }
  160. b.root.newEntry(str, elems)
  161. return nil
  162. }
  163. func (t *Tailoring) setAnchor(anchor string) error {
  164. anchor = norm.NFC.String(anchor)
  165. a := t.index.find(anchor)
  166. if a == nil {
  167. a = t.index.newEntry(anchor, nil)
  168. a.implicit = true
  169. a.modified = true
  170. for _, r := range []rune(anchor) {
  171. e := t.index.find(string(r))
  172. e.lock = true
  173. }
  174. }
  175. t.anchor = a
  176. return nil
  177. }
  178. // SetAnchor sets the point after which elements passed in subsequent calls to
  179. // Insert will be inserted. It is equivalent to the reset directive in an LDML
  180. // specification. See Insert for an example.
  181. // SetAnchor supports the following logical reset positions:
  182. // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
  183. // and <last_non_ignorable/>.
  184. func (t *Tailoring) SetAnchor(anchor string) error {
  185. if err := t.setAnchor(anchor); err != nil {
  186. return err
  187. }
  188. t.before = false
  189. return nil
  190. }
  191. // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
  192. // Insert will insert entries before the anchor.
  193. func (t *Tailoring) SetAnchorBefore(anchor string) error {
  194. if err := t.setAnchor(anchor); err != nil {
  195. return err
  196. }
  197. t.before = true
  198. return nil
  199. }
  200. // Insert sets the ordering of str relative to the entry set by the previous
  201. // call to SetAnchor or Insert. The argument extend corresponds
  202. // to the extend elements as defined in LDML. A non-empty value for extend
  203. // will cause the collation elements corresponding to extend to be appended
  204. // to the collation elements generated for the entry added by Insert.
  205. // This has the same net effect as sorting str after the string anchor+extend.
  206. // See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
  207. // on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
  208. // for full details on LDML.
  209. //
  210. // Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
  211. // at the primary sorting level:
  212. // t := b.Tailoring("se")
  213. // t.SetAnchor("z")
  214. // t.Insert(colltab.Primary, "ä", "")
  215. // Order "ü" after "ue" at the secondary sorting level:
  216. // t.SetAnchor("ue")
  217. // t.Insert(colltab.Secondary, "ü","")
  218. // or
  219. // t.SetAnchor("u")
  220. // t.Insert(colltab.Secondary, "ü", "e")
  221. // Order "q" afer "ab" at the secondary level and "Q" after "q"
  222. // at the tertiary level:
  223. // t.SetAnchor("ab")
  224. // t.Insert(colltab.Secondary, "q", "")
  225. // t.Insert(colltab.Tertiary, "Q", "")
  226. // Order "b" before "a":
  227. // t.SetAnchorBefore("a")
  228. // t.Insert(colltab.Primary, "b", "")
  229. // Order "0" after the last primary ignorable:
  230. // t.SetAnchor("<last_primary_ignorable/>")
  231. // t.Insert(colltab.Primary, "0", "")
  232. func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
  233. if t.anchor == nil {
  234. return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
  235. }
  236. str = norm.NFC.String(str)
  237. e := t.index.find(str)
  238. if e == nil {
  239. e = t.index.newEntry(str, nil)
  240. } else if e.logical != noAnchor {
  241. return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
  242. }
  243. if e.lock {
  244. return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
  245. }
  246. a := t.anchor
  247. // Find the first element after the anchor which differs at a level smaller or
  248. // equal to the given level. Then insert at this position.
  249. // See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
  250. e.before = t.before
  251. if t.before {
  252. t.before = false
  253. if a.prev == nil {
  254. a.insertBefore(e)
  255. } else {
  256. for a = a.prev; a.level > level; a = a.prev {
  257. }
  258. a.insertAfter(e)
  259. }
  260. e.level = level
  261. } else {
  262. for ; a.level > level; a = a.next {
  263. }
  264. e.level = a.level
  265. if a != e {
  266. a.insertAfter(e)
  267. a.level = level
  268. } else {
  269. // We don't set a to prev itself. This has the effect of the entry
  270. // getting new collation elements that are an increment of itself.
  271. // This is intentional.
  272. a.prev.level = level
  273. }
  274. }
  275. e.extend = norm.NFD.String(extend)
  276. e.exclude = false
  277. e.modified = true
  278. e.elems = nil
  279. t.anchor = e
  280. return nil
  281. }
  282. func (o *ordering) getWeight(e *entry) []rawCE {
  283. if len(e.elems) == 0 && e.logical == noAnchor {
  284. if e.implicit {
  285. for _, r := range e.runes {
  286. e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
  287. }
  288. } else if e.before {
  289. count := [colltab.Identity + 1]int{}
  290. a := e
  291. for ; a.elems == nil && !a.implicit; a = a.next {
  292. count[a.level]++
  293. }
  294. e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
  295. for i := colltab.Primary; i < colltab.Quaternary; i++ {
  296. if count[i] != 0 {
  297. e.elems[0].w[i] -= count[i]
  298. break
  299. }
  300. }
  301. if e.prev != nil {
  302. o.verifyWeights(e.prev, e, e.prev.level)
  303. }
  304. } else {
  305. prev := e.prev
  306. e.elems = nextWeight(prev.level, o.getWeight(prev))
  307. o.verifyWeights(e, e.next, e.level)
  308. }
  309. }
  310. return e.elems
  311. }
  312. func (o *ordering) addExtension(e *entry) {
  313. if ex := o.find(e.extend); ex != nil {
  314. e.elems = append(e.elems, ex.elems...)
  315. } else {
  316. for _, r := range []rune(e.extend) {
  317. e.elems = append(e.elems, o.find(string(r)).elems...)
  318. }
  319. }
  320. e.extend = ""
  321. }
  322. func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
  323. if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
  324. return nil
  325. }
  326. for i := colltab.Primary; i < level; i++ {
  327. if a.elems[0].w[i] < b.elems[0].w[i] {
  328. return nil
  329. }
  330. }
  331. if a.elems[0].w[level] >= b.elems[0].w[level] {
  332. err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
  333. log.Println(err)
  334. // TODO: return the error instead, or better, fix the conflicting entry by making room.
  335. }
  336. return nil
  337. }
  338. func (b *Builder) error(e error) {
  339. if e != nil {
  340. b.err = e
  341. }
  342. }
  343. func (b *Builder) errorID(locale string, e error) {
  344. if e != nil {
  345. b.err = fmt.Errorf("%s:%v", locale, e)
  346. }
  347. }
  348. // patchNorm ensures that NFC and NFD counterparts are consistent.
  349. func (o *ordering) patchNorm() {
  350. // Insert the NFD counterparts, if necessary.
  351. for _, e := range o.ordered {
  352. nfd := norm.NFD.String(e.str)
  353. if nfd != e.str {
  354. if e0 := o.find(nfd); e0 != nil && !e0.modified {
  355. e0.elems = e.elems
  356. } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
  357. e := o.newEntry(nfd, e.elems)
  358. e.modified = true
  359. }
  360. }
  361. }
  362. // Update unchanged composed forms if one of their parts changed.
  363. for _, e := range o.ordered {
  364. nfd := norm.NFD.String(e.str)
  365. if e.modified || nfd == e.str {
  366. continue
  367. }
  368. if e0 := o.find(nfd); e0 != nil {
  369. e.elems = e0.elems
  370. } else {
  371. e.elems = o.genColElems(nfd)
  372. if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
  373. r := []rune(nfd)
  374. head := string(r[0])
  375. tail := ""
  376. for i := 1; i < len(r); i++ {
  377. s := norm.NFC.String(head + string(r[i]))
  378. if e0 := o.find(s); e0 != nil && e0.modified {
  379. head = s
  380. } else {
  381. tail += string(r[i])
  382. }
  383. }
  384. e.elems = append(o.genColElems(head), o.genColElems(tail)...)
  385. }
  386. }
  387. }
  388. // Exclude entries for which the individual runes generate the same collation elements.
  389. for _, e := range o.ordered {
  390. if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
  391. e.exclude = true
  392. }
  393. }
  394. }
  395. func (b *Builder) buildOrdering(o *ordering) {
  396. for _, e := range o.ordered {
  397. o.getWeight(e)
  398. }
  399. for _, e := range o.ordered {
  400. o.addExtension(e)
  401. }
  402. o.patchNorm()
  403. o.sort()
  404. simplify(o)
  405. b.processExpansions(o) // requires simplify
  406. b.processContractions(o) // requires simplify
  407. t := newNode()
  408. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  409. if !e.skip() {
  410. ce, err := e.encode()
  411. b.errorID(o.id, err)
  412. t.insert(e.runes[0], ce)
  413. }
  414. }
  415. o.handle = b.index.addTrie(t)
  416. }
  417. func (b *Builder) build() (*table, error) {
  418. if b.built {
  419. return b.t, b.err
  420. }
  421. b.built = true
  422. b.t = &table{
  423. Table: colltab.Table{
  424. MaxContractLen: utf8.UTFMax,
  425. VariableTop: uint32(b.varTop),
  426. },
  427. }
  428. b.buildOrdering(&b.root)
  429. b.t.root = b.root.handle
  430. for _, t := range b.locale {
  431. b.buildOrdering(t.index)
  432. if b.err != nil {
  433. break
  434. }
  435. }
  436. i, err := b.index.generate()
  437. b.t.trie = *i
  438. b.t.Index = colltab.Trie{
  439. Index: i.index,
  440. Values: i.values,
  441. Index0: i.index[blockSize*b.t.root.lookupStart:],
  442. Values0: i.values[blockSize*b.t.root.valueStart:],
  443. }
  444. b.error(err)
  445. return b.t, b.err
  446. }
  447. // Build builds the root Collator.
  448. func (b *Builder) Build() (colltab.Weighter, error) {
  449. table, err := b.build()
  450. if err != nil {
  451. return nil, err
  452. }
  453. return table, nil
  454. }
  455. // Build builds a Collator for Tailoring t.
  456. func (t *Tailoring) Build() (colltab.Weighter, error) {
  457. // TODO: implement.
  458. return nil, nil
  459. }
  460. // Print prints the tables for b and all its Tailorings as a Go file
  461. // that can be included in the Collate package.
  462. func (b *Builder) Print(w io.Writer) (n int, err error) {
  463. p := func(nn int, e error) {
  464. n += nn
  465. if err == nil {
  466. err = e
  467. }
  468. }
  469. t, err := b.build()
  470. if err != nil {
  471. return 0, err
  472. }
  473. p(fmt.Fprintf(w, `var availableLocales = "und`))
  474. for _, loc := range b.locale {
  475. if loc.id != "und" {
  476. p(fmt.Fprintf(w, ",%s", loc.id))
  477. }
  478. }
  479. p(fmt.Fprint(w, "\"\n\n"))
  480. p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
  481. p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
  482. for _, loc := range b.locale {
  483. if loc.id == "und" {
  484. p(t.fprintIndex(w, loc.index.handle, loc.id))
  485. }
  486. }
  487. for _, loc := range b.locale {
  488. if loc.id != "und" {
  489. p(t.fprintIndex(w, loc.index.handle, loc.id))
  490. }
  491. }
  492. p(fmt.Fprint(w, "}\n\n"))
  493. n, _, err = t.fprint(w, "main")
  494. return
  495. }
  496. // reproducibleFromNFKD checks whether the given expansion could be generated
  497. // from an NFKD expansion.
  498. func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
  499. // Length must be equal.
  500. if len(exp) != len(nfkd) {
  501. return false
  502. }
  503. for i, ce := range exp {
  504. // Primary and secondary values should be equal.
  505. if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
  506. return false
  507. }
  508. // Tertiary values should be equal to maxTertiary for third element onwards.
  509. // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
  510. // simply be dropped. Try this out by dropping the following code.
  511. if i >= 2 && ce.w[2] != maxTertiary {
  512. return false
  513. }
  514. if _, err := makeCE(ce); err != nil {
  515. // Simply return false. The error will be caught elsewhere.
  516. return false
  517. }
  518. }
  519. return true
  520. }
  521. func simplify(o *ordering) {
  522. // Runes that are a starter of a contraction should not be removed.
  523. // (To date, there is only Kannada character 0CCA.)
  524. keep := make(map[rune]bool)
  525. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  526. if len(e.runes) > 1 {
  527. keep[e.runes[0]] = true
  528. }
  529. }
  530. // Tag entries for which the runes NFKD decompose to identical values.
  531. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  532. s := e.str
  533. nfkd := norm.NFKD.String(s)
  534. nfd := norm.NFD.String(s)
  535. if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
  536. continue
  537. }
  538. if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
  539. e.decompose = true
  540. }
  541. }
  542. }
  543. // appendExpansion converts the given collation sequence to
  544. // collation elements and adds them to the expansion table.
  545. // It returns an index to the expansion table.
  546. func (b *Builder) appendExpansion(e *entry) int {
  547. t := b.t
  548. i := len(t.ExpandElem)
  549. ce := uint32(len(e.elems))
  550. t.ExpandElem = append(t.ExpandElem, ce)
  551. for _, w := range e.elems {
  552. ce, err := makeCE(w)
  553. if err != nil {
  554. b.error(err)
  555. return -1
  556. }
  557. t.ExpandElem = append(t.ExpandElem, ce)
  558. }
  559. return i
  560. }
  561. // processExpansions extracts data necessary to generate
  562. // the extraction tables.
  563. func (b *Builder) processExpansions(o *ordering) {
  564. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  565. if !e.expansion() {
  566. continue
  567. }
  568. key := fmt.Sprintf("%v", e.elems)
  569. i, ok := b.expIndex[key]
  570. if !ok {
  571. i = b.appendExpansion(e)
  572. b.expIndex[key] = i
  573. }
  574. e.expansionIndex = i
  575. }
  576. }
  577. func (b *Builder) processContractions(o *ordering) {
  578. // Collate contractions per starter rune.
  579. starters := []rune{}
  580. cm := make(map[rune][]*entry)
  581. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  582. if e.contraction() {
  583. if len(e.str) > b.t.MaxContractLen {
  584. b.t.MaxContractLen = len(e.str)
  585. }
  586. r := e.runes[0]
  587. if _, ok := cm[r]; !ok {
  588. starters = append(starters, r)
  589. }
  590. cm[r] = append(cm[r], e)
  591. }
  592. }
  593. // Add entries of single runes that are at a start of a contraction.
  594. for e := o.front(); e != nil; e, _ = e.nextIndexed() {
  595. if !e.contraction() {
  596. r := e.runes[0]
  597. if _, ok := cm[r]; ok {
  598. cm[r] = append(cm[r], e)
  599. }
  600. }
  601. }
  602. // Build the tries for the contractions.
  603. t := b.t
  604. for _, r := range starters {
  605. l := cm[r]
  606. // Compute suffix strings. There are 31 different contraction suffix
  607. // sets for 715 contractions and 82 contraction starter runes as of
  608. // version 6.0.0.
  609. sufx := []string{}
  610. hasSingle := false
  611. for _, e := range l {
  612. if len(e.runes) > 1 {
  613. sufx = append(sufx, string(e.runes[1:]))
  614. } else {
  615. hasSingle = true
  616. }
  617. }
  618. if !hasSingle {
  619. b.error(fmt.Errorf("no single entry for starter rune %U found", r))
  620. continue
  621. }
  622. // Unique the suffix set.
  623. sort.Strings(sufx)
  624. key := strings.Join(sufx, "\n")
  625. handle, ok := b.ctHandle[key]
  626. if !ok {
  627. var err error
  628. handle, err = appendTrie(&t.ContractTries, sufx)
  629. if err != nil {
  630. b.error(err)
  631. }
  632. b.ctHandle[key] = handle
  633. }
  634. // Bucket sort entries in index order.
  635. es := make([]*entry, len(l))
  636. for _, e := range l {
  637. var p, sn int
  638. if len(e.runes) > 1 {
  639. str := []byte(string(e.runes[1:]))
  640. p, sn = lookup(&t.ContractTries, handle, str)
  641. if sn != len(str) {
  642. log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
  643. }
  644. }
  645. if es[p] != nil {
  646. log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
  647. }
  648. es[p] = e
  649. }
  650. // Create collation elements for contractions.
  651. elems := []uint32{}
  652. for _, e := range es {
  653. ce, err := e.encodeBase()
  654. b.errorID(o.id, err)
  655. elems = append(elems, ce)
  656. }
  657. key = fmt.Sprintf("%v", elems)
  658. i, ok := b.ctElem[key]
  659. if !ok {
  660. i = len(t.ContractElem)
  661. b.ctElem[key] = i
  662. t.ContractElem = append(t.ContractElem, elems...)
  663. }
  664. // Store info in entry for starter rune.
  665. es[0].contractionIndex = i
  666. es[0].contractionHandle = handle
  667. }
  668. }