123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702 |
- // Copyright 2012 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package build // import "golang.org/x/text/collate/build"
- import (
- "fmt"
- "io"
- "log"
- "sort"
- "strings"
- "unicode/utf8"
- "golang.org/x/text/internal/colltab"
- "golang.org/x/text/language"
- "golang.org/x/text/unicode/norm"
- )
- // TODO: optimizations:
- // - expandElem is currently 20K. By putting unique colElems in a separate
- // table and having a byte array of indexes into this table, we can reduce
- // the total size to about 7K. By also factoring out the length bytes, we
- // can reduce this to about 6K.
- // - trie valueBlocks are currently 100K. There are a lot of sparse blocks
- // and many consecutive values with the same stride. This can be further
- // compacted.
- // - Compress secondary weights into 8 bits.
- // - Some LDML specs specify a context element. Currently we simply concatenate
- // those. Context can be implemented using the contraction trie. If Builder
- // could analyze and detect when using a context makes sense, there is no
- // need to expose this construct in the API.
- // A Builder builds a root collation table. The user must specify the
- // collation elements for each entry. A common use will be to base the weights
- // on those specified in the allkeys* file as provided by the UCA or CLDR.
- type Builder struct {
- index *trieBuilder
- root ordering
- locale []*Tailoring
- t *table
- err error
- built bool
- minNonVar int // lowest primary recorded for a variable
- varTop int // highest primary recorded for a non-variable
- // indexes used for reusing expansions and contractions
- expIndex map[string]int // positions of expansions keyed by their string representation
- ctHandle map[string]ctHandle // contraction handles keyed by a concatenation of the suffixes
- ctElem map[string]int // contraction elements keyed by their string representation
- }
- // A Tailoring builds a collation table based on another collation table.
- // The table is defined by specifying tailorings to the underlying table.
- // See https://unicode.org/reports/tr35/ for an overview of tailoring
- // collation tables. The CLDR contains pre-defined tailorings for a variety
- // of languages (See https://www.unicode.org/Public/cldr/<version>/core.zip.)
- type Tailoring struct {
- id string
- builder *Builder
- index *ordering
- anchor *entry
- before bool
- }
- // NewBuilder returns a new Builder.
- func NewBuilder() *Builder {
- return &Builder{
- index: newTrieBuilder(),
- root: makeRootOrdering(),
- expIndex: make(map[string]int),
- ctHandle: make(map[string]ctHandle),
- ctElem: make(map[string]int),
- }
- }
- // Tailoring returns a Tailoring for the given locale. One should
- // have completed all calls to Add before calling Tailoring.
- func (b *Builder) Tailoring(loc language.Tag) *Tailoring {
- t := &Tailoring{
- id: loc.String(),
- builder: b,
- index: b.root.clone(),
- }
- t.index.id = t.id
- b.locale = append(b.locale, t)
- return t
- }
- // Add adds an entry to the collation element table, mapping
- // a slice of runes to a sequence of collation elements.
- // A collation element is specified as list of weights: []int{primary, secondary, ...}.
- // The entries are typically obtained from a collation element table
- // as defined in https://www.unicode.org/reports/tr10/#Data_Table_Format.
- // Note that the collation elements specified by colelems are only used
- // as a guide. The actual weights generated by Builder may differ.
- // The argument variables is a list of indices into colelems that should contain
- // a value for each colelem that is a variable. (See the reference above.)
- func (b *Builder) Add(runes []rune, colelems [][]int, variables []int) error {
- str := string(runes)
- elems := make([]rawCE, len(colelems))
- for i, ce := range colelems {
- if len(ce) == 0 {
- break
- }
- elems[i] = makeRawCE(ce, 0)
- if len(ce) == 1 {
- elems[i].w[1] = defaultSecondary
- }
- if len(ce) <= 2 {
- elems[i].w[2] = defaultTertiary
- }
- if len(ce) <= 3 {
- elems[i].w[3] = ce[0]
- }
- }
- for i, ce := range elems {
- p := ce.w[0]
- isvar := false
- for _, j := range variables {
- if i == j {
- isvar = true
- }
- }
- if isvar {
- if p >= b.minNonVar && b.minNonVar > 0 {
- return fmt.Errorf("primary value %X of variable is larger than the smallest non-variable %X", p, b.minNonVar)
- }
- if p > b.varTop {
- b.varTop = p
- }
- } else if p > 1 { // 1 is a special primary value reserved for FFFE
- if p <= b.varTop {
- return fmt.Errorf("primary value %X of non-variable is smaller than the highest variable %X", p, b.varTop)
- }
- if b.minNonVar == 0 || p < b.minNonVar {
- b.minNonVar = p
- }
- }
- }
- elems, err := convertLargeWeights(elems)
- if err != nil {
- return err
- }
- cccs := []uint8{}
- nfd := norm.NFD.String(str)
- for i := range nfd {
- cccs = append(cccs, norm.NFD.PropertiesString(nfd[i:]).CCC())
- }
- if len(cccs) < len(elems) {
- if len(cccs) > 2 {
- return fmt.Errorf("number of decomposed characters should be greater or equal to the number of collation elements for len(colelems) > 3 (%d < %d)", len(cccs), len(elems))
- }
- p := len(elems) - 1
- for ; p > 0 && elems[p].w[0] == 0; p-- {
- elems[p].ccc = cccs[len(cccs)-1]
- }
- for ; p >= 0; p-- {
- elems[p].ccc = cccs[0]
- }
- } else {
- for i := range elems {
- elems[i].ccc = cccs[i]
- }
- }
- // doNorm in collate.go assumes that the following conditions hold.
- if len(elems) > 1 && len(cccs) > 1 && cccs[0] != 0 && cccs[0] != cccs[len(cccs)-1] {
- return fmt.Errorf("incompatible CCC values for expansion %X (%d)", runes, cccs)
- }
- b.root.newEntry(str, elems)
- return nil
- }
- func (t *Tailoring) setAnchor(anchor string) error {
- anchor = norm.NFC.String(anchor)
- a := t.index.find(anchor)
- if a == nil {
- a = t.index.newEntry(anchor, nil)
- a.implicit = true
- a.modified = true
- for _, r := range []rune(anchor) {
- e := t.index.find(string(r))
- e.lock = true
- }
- }
- t.anchor = a
- return nil
- }
- // SetAnchor sets the point after which elements passed in subsequent calls to
- // Insert will be inserted. It is equivalent to the reset directive in an LDML
- // specification. See Insert for an example.
- // SetAnchor supports the following logical reset positions:
- // <first_tertiary_ignorable/>, <last_teriary_ignorable/>, <first_primary_ignorable/>,
- // and <last_non_ignorable/>.
- func (t *Tailoring) SetAnchor(anchor string) error {
- if err := t.setAnchor(anchor); err != nil {
- return err
- }
- t.before = false
- return nil
- }
- // SetAnchorBefore is similar to SetAnchor, except that subsequent calls to
- // Insert will insert entries before the anchor.
- func (t *Tailoring) SetAnchorBefore(anchor string) error {
- if err := t.setAnchor(anchor); err != nil {
- return err
- }
- t.before = true
- return nil
- }
- // Insert sets the ordering of str relative to the entry set by the previous
- // call to SetAnchor or Insert. The argument extend corresponds
- // to the extend elements as defined in LDML. A non-empty value for extend
- // will cause the collation elements corresponding to extend to be appended
- // to the collation elements generated for the entry added by Insert.
- // This has the same net effect as sorting str after the string anchor+extend.
- // See https://www.unicode.org/reports/tr10/#Tailoring_Example for details
- // on parametric tailoring and https://unicode.org/reports/tr35/#Collation_Elements
- // for full details on LDML.
- //
- // Examples: create a tailoring for Swedish, where "ä" is ordered after "z"
- // at the primary sorting level:
- // t := b.Tailoring("se")
- // t.SetAnchor("z")
- // t.Insert(colltab.Primary, "ä", "")
- // Order "ü" after "ue" at the secondary sorting level:
- // t.SetAnchor("ue")
- // t.Insert(colltab.Secondary, "ü","")
- // or
- // t.SetAnchor("u")
- // t.Insert(colltab.Secondary, "ü", "e")
- // Order "q" afer "ab" at the secondary level and "Q" after "q"
- // at the tertiary level:
- // t.SetAnchor("ab")
- // t.Insert(colltab.Secondary, "q", "")
- // t.Insert(colltab.Tertiary, "Q", "")
- // Order "b" before "a":
- // t.SetAnchorBefore("a")
- // t.Insert(colltab.Primary, "b", "")
- // Order "0" after the last primary ignorable:
- // t.SetAnchor("<last_primary_ignorable/>")
- // t.Insert(colltab.Primary, "0", "")
- func (t *Tailoring) Insert(level colltab.Level, str, extend string) error {
- if t.anchor == nil {
- return fmt.Errorf("%s:Insert: no anchor point set for tailoring of %s", t.id, str)
- }
- str = norm.NFC.String(str)
- e := t.index.find(str)
- if e == nil {
- e = t.index.newEntry(str, nil)
- } else if e.logical != noAnchor {
- return fmt.Errorf("%s:Insert: cannot reinsert logical reset position %q", t.id, e.str)
- }
- if e.lock {
- return fmt.Errorf("%s:Insert: cannot reinsert element %q", t.id, e.str)
- }
- a := t.anchor
- // Find the first element after the anchor which differs at a level smaller or
- // equal to the given level. Then insert at this position.
- // See https://unicode.org/reports/tr35/#Collation_Elements, Section 5.14.5 for details.
- e.before = t.before
- if t.before {
- t.before = false
- if a.prev == nil {
- a.insertBefore(e)
- } else {
- for a = a.prev; a.level > level; a = a.prev {
- }
- a.insertAfter(e)
- }
- e.level = level
- } else {
- for ; a.level > level; a = a.next {
- }
- e.level = a.level
- if a != e {
- a.insertAfter(e)
- a.level = level
- } else {
- // We don't set a to prev itself. This has the effect of the entry
- // getting new collation elements that are an increment of itself.
- // This is intentional.
- a.prev.level = level
- }
- }
- e.extend = norm.NFD.String(extend)
- e.exclude = false
- e.modified = true
- e.elems = nil
- t.anchor = e
- return nil
- }
- func (o *ordering) getWeight(e *entry) []rawCE {
- if len(e.elems) == 0 && e.logical == noAnchor {
- if e.implicit {
- for _, r := range e.runes {
- e.elems = append(e.elems, o.getWeight(o.find(string(r)))...)
- }
- } else if e.before {
- count := [colltab.Identity + 1]int{}
- a := e
- for ; a.elems == nil && !a.implicit; a = a.next {
- count[a.level]++
- }
- e.elems = []rawCE{makeRawCE(a.elems[0].w, a.elems[0].ccc)}
- for i := colltab.Primary; i < colltab.Quaternary; i++ {
- if count[i] != 0 {
- e.elems[0].w[i] -= count[i]
- break
- }
- }
- if e.prev != nil {
- o.verifyWeights(e.prev, e, e.prev.level)
- }
- } else {
- prev := e.prev
- e.elems = nextWeight(prev.level, o.getWeight(prev))
- o.verifyWeights(e, e.next, e.level)
- }
- }
- return e.elems
- }
- func (o *ordering) addExtension(e *entry) {
- if ex := o.find(e.extend); ex != nil {
- e.elems = append(e.elems, ex.elems...)
- } else {
- for _, r := range []rune(e.extend) {
- e.elems = append(e.elems, o.find(string(r)).elems...)
- }
- }
- e.extend = ""
- }
- func (o *ordering) verifyWeights(a, b *entry, level colltab.Level) error {
- if level == colltab.Identity || b == nil || b.elems == nil || a.elems == nil {
- return nil
- }
- for i := colltab.Primary; i < level; i++ {
- if a.elems[0].w[i] < b.elems[0].w[i] {
- return nil
- }
- }
- if a.elems[0].w[level] >= b.elems[0].w[level] {
- err := fmt.Errorf("%s:overflow: collation elements of %q (%X) overflows those of %q (%X) at level %d (%X >= %X)", o.id, a.str, a.runes, b.str, b.runes, level, a.elems, b.elems)
- log.Println(err)
- // TODO: return the error instead, or better, fix the conflicting entry by making room.
- }
- return nil
- }
- func (b *Builder) error(e error) {
- if e != nil {
- b.err = e
- }
- }
- func (b *Builder) errorID(locale string, e error) {
- if e != nil {
- b.err = fmt.Errorf("%s:%v", locale, e)
- }
- }
- // patchNorm ensures that NFC and NFD counterparts are consistent.
- func (o *ordering) patchNorm() {
- // Insert the NFD counterparts, if necessary.
- for _, e := range o.ordered {
- nfd := norm.NFD.String(e.str)
- if nfd != e.str {
- if e0 := o.find(nfd); e0 != nil && !e0.modified {
- e0.elems = e.elems
- } else if e.modified && !equalCEArrays(o.genColElems(nfd), e.elems) {
- e := o.newEntry(nfd, e.elems)
- e.modified = true
- }
- }
- }
- // Update unchanged composed forms if one of their parts changed.
- for _, e := range o.ordered {
- nfd := norm.NFD.String(e.str)
- if e.modified || nfd == e.str {
- continue
- }
- if e0 := o.find(nfd); e0 != nil {
- e.elems = e0.elems
- } else {
- e.elems = o.genColElems(nfd)
- if norm.NFD.LastBoundary([]byte(nfd)) == 0 {
- r := []rune(nfd)
- head := string(r[0])
- tail := ""
- for i := 1; i < len(r); i++ {
- s := norm.NFC.String(head + string(r[i]))
- if e0 := o.find(s); e0 != nil && e0.modified {
- head = s
- } else {
- tail += string(r[i])
- }
- }
- e.elems = append(o.genColElems(head), o.genColElems(tail)...)
- }
- }
- }
- // Exclude entries for which the individual runes generate the same collation elements.
- for _, e := range o.ordered {
- if len(e.runes) > 1 && equalCEArrays(o.genColElems(e.str), e.elems) {
- e.exclude = true
- }
- }
- }
- func (b *Builder) buildOrdering(o *ordering) {
- for _, e := range o.ordered {
- o.getWeight(e)
- }
- for _, e := range o.ordered {
- o.addExtension(e)
- }
- o.patchNorm()
- o.sort()
- simplify(o)
- b.processExpansions(o) // requires simplify
- b.processContractions(o) // requires simplify
- t := newNode()
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- if !e.skip() {
- ce, err := e.encode()
- b.errorID(o.id, err)
- t.insert(e.runes[0], ce)
- }
- }
- o.handle = b.index.addTrie(t)
- }
- func (b *Builder) build() (*table, error) {
- if b.built {
- return b.t, b.err
- }
- b.built = true
- b.t = &table{
- Table: colltab.Table{
- MaxContractLen: utf8.UTFMax,
- VariableTop: uint32(b.varTop),
- },
- }
- b.buildOrdering(&b.root)
- b.t.root = b.root.handle
- for _, t := range b.locale {
- b.buildOrdering(t.index)
- if b.err != nil {
- break
- }
- }
- i, err := b.index.generate()
- b.t.trie = *i
- b.t.Index = colltab.Trie{
- Index: i.index,
- Values: i.values,
- Index0: i.index[blockSize*b.t.root.lookupStart:],
- Values0: i.values[blockSize*b.t.root.valueStart:],
- }
- b.error(err)
- return b.t, b.err
- }
- // Build builds the root Collator.
- func (b *Builder) Build() (colltab.Weighter, error) {
- table, err := b.build()
- if err != nil {
- return nil, err
- }
- return table, nil
- }
- // Build builds a Collator for Tailoring t.
- func (t *Tailoring) Build() (colltab.Weighter, error) {
- // TODO: implement.
- return nil, nil
- }
- // Print prints the tables for b and all its Tailorings as a Go file
- // that can be included in the Collate package.
- func (b *Builder) Print(w io.Writer) (n int, err error) {
- p := func(nn int, e error) {
- n += nn
- if err == nil {
- err = e
- }
- }
- t, err := b.build()
- if err != nil {
- return 0, err
- }
- p(fmt.Fprintf(w, `var availableLocales = "und`))
- for _, loc := range b.locale {
- if loc.id != "und" {
- p(fmt.Fprintf(w, ",%s", loc.id))
- }
- }
- p(fmt.Fprint(w, "\"\n\n"))
- p(fmt.Fprintf(w, "const varTop = 0x%x\n\n", b.varTop))
- p(fmt.Fprintln(w, "var locales = [...]tableIndex{"))
- for _, loc := range b.locale {
- if loc.id == "und" {
- p(t.fprintIndex(w, loc.index.handle, loc.id))
- }
- }
- for _, loc := range b.locale {
- if loc.id != "und" {
- p(t.fprintIndex(w, loc.index.handle, loc.id))
- }
- }
- p(fmt.Fprint(w, "}\n\n"))
- n, _, err = t.fprint(w, "main")
- return
- }
- // reproducibleFromNFKD checks whether the given expansion could be generated
- // from an NFKD expansion.
- func reproducibleFromNFKD(e *entry, exp, nfkd []rawCE) bool {
- // Length must be equal.
- if len(exp) != len(nfkd) {
- return false
- }
- for i, ce := range exp {
- // Primary and secondary values should be equal.
- if ce.w[0] != nfkd[i].w[0] || ce.w[1] != nfkd[i].w[1] {
- return false
- }
- // Tertiary values should be equal to maxTertiary for third element onwards.
- // TODO: there seem to be a lot of cases in CLDR (e.g. ㏭ in zh.xml) that can
- // simply be dropped. Try this out by dropping the following code.
- if i >= 2 && ce.w[2] != maxTertiary {
- return false
- }
- if _, err := makeCE(ce); err != nil {
- // Simply return false. The error will be caught elsewhere.
- return false
- }
- }
- return true
- }
- func simplify(o *ordering) {
- // Runes that are a starter of a contraction should not be removed.
- // (To date, there is only Kannada character 0CCA.)
- keep := make(map[rune]bool)
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- if len(e.runes) > 1 {
- keep[e.runes[0]] = true
- }
- }
- // Tag entries for which the runes NFKD decompose to identical values.
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- s := e.str
- nfkd := norm.NFKD.String(s)
- nfd := norm.NFD.String(s)
- if e.decompose || len(e.runes) > 1 || len(e.elems) == 1 || keep[e.runes[0]] || nfkd == nfd {
- continue
- }
- if reproducibleFromNFKD(e, e.elems, o.genColElems(nfkd)) {
- e.decompose = true
- }
- }
- }
- // appendExpansion converts the given collation sequence to
- // collation elements and adds them to the expansion table.
- // It returns an index to the expansion table.
- func (b *Builder) appendExpansion(e *entry) int {
- t := b.t
- i := len(t.ExpandElem)
- ce := uint32(len(e.elems))
- t.ExpandElem = append(t.ExpandElem, ce)
- for _, w := range e.elems {
- ce, err := makeCE(w)
- if err != nil {
- b.error(err)
- return -1
- }
- t.ExpandElem = append(t.ExpandElem, ce)
- }
- return i
- }
- // processExpansions extracts data necessary to generate
- // the extraction tables.
- func (b *Builder) processExpansions(o *ordering) {
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- if !e.expansion() {
- continue
- }
- key := fmt.Sprintf("%v", e.elems)
- i, ok := b.expIndex[key]
- if !ok {
- i = b.appendExpansion(e)
- b.expIndex[key] = i
- }
- e.expansionIndex = i
- }
- }
- func (b *Builder) processContractions(o *ordering) {
- // Collate contractions per starter rune.
- starters := []rune{}
- cm := make(map[rune][]*entry)
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- if e.contraction() {
- if len(e.str) > b.t.MaxContractLen {
- b.t.MaxContractLen = len(e.str)
- }
- r := e.runes[0]
- if _, ok := cm[r]; !ok {
- starters = append(starters, r)
- }
- cm[r] = append(cm[r], e)
- }
- }
- // Add entries of single runes that are at a start of a contraction.
- for e := o.front(); e != nil; e, _ = e.nextIndexed() {
- if !e.contraction() {
- r := e.runes[0]
- if _, ok := cm[r]; ok {
- cm[r] = append(cm[r], e)
- }
- }
- }
- // Build the tries for the contractions.
- t := b.t
- for _, r := range starters {
- l := cm[r]
- // Compute suffix strings. There are 31 different contraction suffix
- // sets for 715 contractions and 82 contraction starter runes as of
- // version 6.0.0.
- sufx := []string{}
- hasSingle := false
- for _, e := range l {
- if len(e.runes) > 1 {
- sufx = append(sufx, string(e.runes[1:]))
- } else {
- hasSingle = true
- }
- }
- if !hasSingle {
- b.error(fmt.Errorf("no single entry for starter rune %U found", r))
- continue
- }
- // Unique the suffix set.
- sort.Strings(sufx)
- key := strings.Join(sufx, "\n")
- handle, ok := b.ctHandle[key]
- if !ok {
- var err error
- handle, err = appendTrie(&t.ContractTries, sufx)
- if err != nil {
- b.error(err)
- }
- b.ctHandle[key] = handle
- }
- // Bucket sort entries in index order.
- es := make([]*entry, len(l))
- for _, e := range l {
- var p, sn int
- if len(e.runes) > 1 {
- str := []byte(string(e.runes[1:]))
- p, sn = lookup(&t.ContractTries, handle, str)
- if sn != len(str) {
- log.Fatalf("%s: processContractions: unexpected length for '%X'; len=%d; want %d", o.id, e.runes, sn, len(str))
- }
- }
- if es[p] != nil {
- log.Fatalf("%s: multiple contractions for position %d for rune %U", o.id, p, e.runes[0])
- }
- es[p] = e
- }
- // Create collation elements for contractions.
- elems := []uint32{}
- for _, e := range es {
- ce, err := e.encodeBase()
- b.errorID(o.id, err)
- elems = append(elems, ce)
- }
- key = fmt.Sprintf("%v", elems)
- i, ok := b.ctElem[key]
- if !ok {
- i = len(t.ContractElem)
- b.ctElem[key] = i
- t.ContractElem = append(t.ContractElem, elems...)
- }
- // Store info in entry for starter rune.
- es[0].contractionIndex = i
- es[0].contractionHandle = handle
- }
- }
|