12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833 |
- // Copyright 2014 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // +build ignore
- // This program generates the trie for casing operations. The Unicode casing
- // algorithm requires the lookup of various properties and mappings for each
- // rune. The table generated by this generator combines several of the most
- // frequently used of these into a single trie so that they can be accessed
- // with a single lookup.
- package main
- import (
- "bytes"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "reflect"
- "strconv"
- "strings"
- "unicode"
- "golang.org/x/text/internal/gen"
- "golang.org/x/text/internal/triegen"
- "golang.org/x/text/internal/ucd"
- "golang.org/x/text/unicode/norm"
- )
- func main() {
- gen.Init()
- genTables()
- genTablesTest()
- gen.Repackage("gen_trieval.go", "trieval.go", "cases")
- }
- // runeInfo contains all information for a rune that we care about for casing
- // operations.
- type runeInfo struct {
- Rune rune
- entry info // trie value for this rune.
- CaseMode info
- // Simple case mappings.
- Simple [1 + maxCaseMode][]rune
- // Special casing
- HasSpecial bool
- Conditional bool
- Special [1 + maxCaseMode][]rune
- // Folding
- FoldSimple rune
- FoldSpecial rune
- FoldFull []rune
- // TODO: FC_NFKC, or equivalent data.
- // Properties
- SoftDotted bool
- CaseIgnorable bool
- Cased bool
- DecomposeGreek bool
- BreakType string
- BreakCat breakCategory
- // We care mostly about 0, Above, and IotaSubscript.
- CCC byte
- }
// breakCategory is the coarse word-break class that the word break property
// of a rune is collapsed onto.
type breakCategory int

// The zero value, breakBreak, is what a rune gets when no other category is
// assigned to it.
const (
	breakBreak = breakCategory(iota)
	breakLetter
	breakMid
)
- // mapping returns the case mapping for the given case type.
- func (r *runeInfo) mapping(c info) string {
- if r.HasSpecial {
- return string(r.Special[c])
- }
- if len(r.Simple[c]) != 0 {
- return string(r.Simple[c])
- }
- return string(r.Rune)
- }
- func parse(file string, f func(p *ucd.Parser)) {
- ucd.Parse(gen.OpenUCDFile(file), f)
- }
- func parseUCD() []runeInfo {
- chars := make([]runeInfo, unicode.MaxRune)
- get := func(r rune) *runeInfo {
- c := &chars[r]
- c.Rune = r
- return c
- }
- parse("UnicodeData.txt", func(p *ucd.Parser) {
- ri := get(p.Rune(0))
- ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
- ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
- ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
- ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
- if p.String(ucd.GeneralCategory) == "Lt" {
- ri.CaseMode = cTitle
- }
- })
- // <code>; <property>
- parse("PropList.txt", func(p *ucd.Parser) {
- if p.String(1) == "Soft_Dotted" {
- chars[p.Rune(0)].SoftDotted = true
- }
- })
- // <code>; <word break type>
- parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
- ri := get(p.Rune(0))
- switch p.String(1) {
- case "Case_Ignorable":
- ri.CaseIgnorable = true
- case "Cased":
- ri.Cased = true
- case "Lowercase":
- ri.CaseMode = cLower
- case "Uppercase":
- ri.CaseMode = cUpper
- }
- })
- // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
- parse("SpecialCasing.txt", func(p *ucd.Parser) {
- // We drop all conditional special casing and deal with them manually in
- // the language-specific case mappers. Rune 0x03A3 is the only one with
- // a conditional formatting that is not language-specific. However,
- // dealing with this letter is tricky, especially in a streaming
- // context, so we deal with it in the Caser for Greek specifically.
- ri := get(p.Rune(0))
- if p.String(4) == "" {
- ri.HasSpecial = true
- ri.Special[cLower] = p.Runes(1)
- ri.Special[cTitle] = p.Runes(2)
- ri.Special[cUpper] = p.Runes(3)
- } else {
- ri.Conditional = true
- }
- })
- // TODO: Use text breaking according to UAX #29.
- // <code>; <word break type>
- parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
- ri := get(p.Rune(0))
- ri.BreakType = p.String(1)
- // We collapse the word breaking properties onto the categories we need.
- switch p.String(1) { // TODO: officially we need to canonicalize.
- case "MidLetter", "MidNumLet", "Single_Quote":
- ri.BreakCat = breakMid
- if !ri.CaseIgnorable {
- // finalSigma relies on the fact that all breakMid runes are
- // also a Case_Ignorable. Revisit this code when this changes.
- log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
- }
- case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
- ri.BreakCat = breakLetter
- }
- })
- // <code>; <type>; <mapping>
- parse("CaseFolding.txt", func(p *ucd.Parser) {
- ri := get(p.Rune(0))
- switch p.String(1) {
- case "C":
- ri.FoldSimple = p.Rune(2)
- ri.FoldFull = p.Runes(2)
- case "S":
- ri.FoldSimple = p.Rune(2)
- case "T":
- ri.FoldSpecial = p.Rune(2)
- case "F":
- ri.FoldFull = p.Runes(2)
- default:
- log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
- }
- })
- return chars
- }
- func genTables() {
- chars := parseUCD()
- verifyProperties(chars)
- t := triegen.NewTrie("case")
- for i := range chars {
- c := &chars[i]
- makeEntry(c)
- t.Insert(rune(i), uint64(c.entry))
- }
- w := gen.NewCodeWriter()
- defer w.WriteVersionedGoFile("tables.go", "cases")
- gen.WriteUnicodeVersion(w)
- // TODO: write CLDR version after adding a mechanism to detect that the
- // tables on which the manually created locale-sensitive casing code is
- // based hasn't changed.
- w.WriteVar("xorData", string(xorData))
- w.WriteVar("exceptions", string(exceptionData))
- sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
- if err != nil {
- log.Fatal(err)
- }
- w.Size += sz
- }
- func makeEntry(ri *runeInfo) {
- if ri.CaseIgnorable {
- if ri.Cased {
- ri.entry = cIgnorableCased
- } else {
- ri.entry = cIgnorableUncased
- }
- } else {
- ri.entry = ri.CaseMode
- }
- // TODO: handle soft-dotted.
- ccc := cccOther
- switch ri.CCC {
- case 0: // Not_Reordered
- ccc = cccZero
- case above: // Above
- ccc = cccAbove
- }
- switch ri.BreakCat {
- case breakBreak:
- ccc = cccBreak
- case breakMid:
- ri.entry |= isMidBit
- }
- ri.entry |= ccc
- if ri.CaseMode == cUncased {
- return
- }
- // Need to do something special.
- if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
- makeException(ri)
- return
- }
- if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
- makeException(ri)
- return
- }
- // Rune is either lowercase or uppercase.
- orig := string(ri.Rune)
- mapped := ""
- if ri.CaseMode == cUpper {
- mapped = ri.mapping(cLower)
- } else {
- mapped = ri.mapping(cUpper)
- }
- if len(orig) != len(mapped) {
- makeException(ri)
- return
- }
- if string(ri.FoldFull) == ri.mapping(cUpper) {
- ri.entry |= inverseFoldBit
- }
- n := len(orig)
- // Create per-byte XOR mask.
- var b []byte
- for i := 0; i < n; i++ {
- b = append(b, orig[i]^mapped[i])
- }
- // Remove leading 0 bytes, but keep at least one byte.
- for ; len(b) > 1 && b[0] == 0; b = b[1:] {
- }
- if len(b) == 1 && b[0]&0xc0 == 0 {
- ri.entry |= info(b[0]) << xorShift
- return
- }
- key := string(b)
- x, ok := xorCache[key]
- if !ok {
- xorData = append(xorData, 0) // for detecting start of sequence
- xorData = append(xorData, b...)
- x = len(xorData) - 1
- xorCache[key] = x
- }
- ri.entry |= info(x<<xorShift) | xorIndexBit
- }
var (
	// xorCache maps an XOR byte pattern to the index in xorData of its last
	// byte, so identical patterns are stored only once.
	xorCache = make(map[string]int)

	// xorData contains byte-wise XOR data for the least significant bytes of a
	// UTF-8 encoded rune. An index points to the last byte. The sequence starts
	// with a zero terminator.
	xorData = []byte{}

	// exceptionData starts with a single zero byte.
	// See the comments in gen_trieval.go re "the exceptions slice".
	exceptionData = []byte{0}
)
- // makeException encodes case mappings that cannot be expressed in a simple
- // XOR diff.
- func makeException(ri *runeInfo) {
- ccc := ri.entry & cccMask
- // Set exception bit and retain case type.
- ri.entry &= 0x0007
- ri.entry |= exceptionBit
- if len(exceptionData) >= 1<<numExceptionBits {
- log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
- }
- // Set the offset in the exceptionData array.
- ri.entry |= info(len(exceptionData) << exceptionShift)
- orig := string(ri.Rune)
- tc := ri.mapping(cTitle)
- uc := ri.mapping(cUpper)
- lc := ri.mapping(cLower)
- ff := string(ri.FoldFull)
- // addString sets the length of a string and adds it to the expansions array.
- addString := func(s string, b *byte) {
- if len(s) == 0 {
- // Zero-length mappings exist, but only for conditional casing,
- // which we are representing outside of this table.
- log.Fatalf("%U: has zero-length mapping.", ri.Rune)
- }
- *b <<= 3
- if s != orig || ri.CaseMode == cLower {
- n := len(s)
- if n > 7 {
- log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
- }
- *b |= byte(n)
- exceptionData = append(exceptionData, s...)
- }
- }
- // byte 0:
- exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
- // byte 1:
- p := len(exceptionData)
- exceptionData = append(exceptionData, 0)
- if len(ff) > 7 { // May be zero-length.
- log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
- }
- exceptionData = append(exceptionData, ff...)
- ct := ri.CaseMode
- if ct != cLower {
- addString(lc, &exceptionData[p])
- }
- if ct != cUpper {
- addString(uc, &exceptionData[p])
- }
- if ct != cTitle {
- addString(tc, &exceptionData[p])
- }
- }
// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	// sparseBlocks holds the (possibly rewritten) values of each stored block.
	sparseBlocks [][]uint16

	// sparseOffsets holds, per stored block, the value of sparseCount at the
	// time the block was stored.
	sparseOffsets []uint16

	// sparseCount is the running total of ranges over all stored blocks.
	sparseCount int
}
- // makeSparse returns the number of elements that compact block would contain
- // as well as the modified values.
- func makeSparse(vals []uint64) ([]uint16, int) {
- // Copy the values.
- values := make([]uint16, len(vals))
- for i, v := range vals {
- values[i] = uint16(v)
- }
- alt := func(i int, v uint16) uint16 {
- if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
- // Convert cLower or cUpper to cXORCase value, which has the form 11x.
- xor := v
- xor &^= 1
- xor |= uint16(i&1) ^ (v & 1)
- xor |= 0x4
- return xor
- }
- return v
- }
- var count int
- var previous uint16
- for i, v := range values {
- if v != 0 {
- // Try if the unmodified value is equal to the previous.
- if v == previous {
- continue
- }
- // Try if the xor-ed value is equal to the previous value.
- a := alt(i, v)
- if a == previous {
- values[i] = a
- continue
- }
- // This is a new value.
- count++
- // Use the xor-ed value if it will be identical to the next value.
- if p := i + 1; p < len(values) && alt(p, values[p]) == a {
- values[i] = a
- v = a
- }
- }
- previous = v
- }
- return values, count
- }
- func (s *sparseCompacter) Size(v []uint64) (int, bool) {
- _, n := makeSparse(v)
- // We limit using this method to having 16 entries.
- if n > 16 {
- return 0, false
- }
- return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
- }
- func (s *sparseCompacter) Store(v []uint64) uint32 {
- h := uint32(len(s.sparseOffsets))
- values, sz := makeSparse(v)
- s.sparseBlocks = append(s.sparseBlocks, values)
- s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
- s.sparseCount += sz
- return h
- }
- func (s *sparseCompacter) Handler() string {
- // The sparse global variable and its lookup method is defined in gen_trieval.go.
- return "sparse.lookup"
- }
- func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
- p := func(format string, args ...interface{}) {
- _, err := fmt.Fprintf(w, format, args...)
- if retErr == nil && err != nil {
- retErr = err
- }
- }
- ls := len(s.sparseBlocks)
- if ls == len(s.sparseOffsets) {
- s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
- }
- p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
- p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
- ns := s.sparseCount
- p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
- p("var sparseValues = [%d]valueRange {", ns)
- for i, values := range s.sparseBlocks {
- p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
- var v uint16
- for i, nv := range values {
- if nv != v {
- if v != 0 {
- p(",hi:%#02x},", 0x80+i-1)
- }
- if nv != 0 {
- p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
- }
- }
- v = nv
- }
- if v != 0 {
- p(",hi:%#02x},", 0x80+len(values)-1)
- }
- }
- p("\n}\n\n")
- return
- }
- // verifyProperties that properties of the runes that are relied upon in the
- // implementation. Each property is marked with an identifier that is referred
- // to in the places where it is used.
- func verifyProperties(chars []runeInfo) {
- for i, c := range chars {
- r := rune(i)
- // Rune properties.
- // A.1: modifier never changes on lowercase. [ltLower]
- if c.CCC > 0 && unicode.ToLower(r) != r {
- log.Fatalf("%U: non-starter changes when lowercased", r)
- }
- // A.2: properties of decompositions starting with I or J. [ltLower]
- d := norm.NFD.PropertiesString(string(r)).Decomposition()
- if len(d) > 0 {
- if d[0] == 'I' || d[0] == 'J' {
- // A.2.1: we expect at least an ASCII character and a modifier.
- if len(d) < 3 {
- log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
- }
- // All subsequent runes are modifiers and all have the same CCC.
- runes := []rune(string(d[1:]))
- ccc := chars[runes[0]].CCC
- for _, mr := range runes[1:] {
- mc := chars[mr]
- // A.2.2: all modifiers have a CCC of Above or less.
- if ccc == 0 || ccc > above {
- log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
- }
- // A.2.3: a sequence of modifiers all have the same CCC.
- if mc.CCC != ccc {
- log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
- }
- // A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
- if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
- log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
- }
- if i += len(string(mr)); i >= len(d) {
- break
- }
- }
- }
- }
- // A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
- if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
- log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
- }
- // A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
- if c.CCC == iotaSubscript && r != 0x0345 {
- log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
- }
- // A.5: soft-dotted runes do not have exceptions.
- if c.SoftDotted && c.entry&exceptionBit != 0 {
- log.Fatalf("%U: soft-dotted has exception", r)
- }
- // A.6: Greek decomposition. [elUpper]
- if unicode.Is(unicode.Greek, r) {
- if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
- runes := []rune(string(b))
- // A.6.1: If a Greek rune decomposes and the first rune of the
- // decomposition is greater than U+00FF, the rune is always
- // great and not a modifier.
- if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
- log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
- }
- // A.6.2: Any follow-up rune in a Greek decomposition is a
- // modifier of which the first should be gobbled in
- // decomposition.
- for _, m := range runes[1:] {
- switch m {
- case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
- default:
- log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
- }
- }
- }
- }
- // Breaking properties.
- // B.1: all runes with CCC > 0 are of break type Extend.
- if c.CCC > 0 && c.BreakType != "Extend" {
- log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
- }
- // B.2: all cased runes with c.CCC == 0 are of break type ALetter.
- if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
- log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
- }
- // B.3: letter category.
- if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
- if c.BreakCat != breakLetter {
- log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
- }
- }
- }
- }
- func genTablesTest() {
- w := &bytes.Buffer{}
- fmt.Fprintln(w, "var (")
- printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
- // We discard the output as we know we have perfect functions. We run them
- // just to verify the properties are correct.
- n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
- n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
- n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
- if n > 0 {
- log.Fatalf("One of the discarded properties does not have a perfect filter.")
- }
- // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
- fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
- parse("SpecialCasing.txt", func(p *ucd.Parser) {
- // Skip conditional entries.
- if p.String(4) != "" {
- return
- }
- r := p.Rune(0)
- fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
- r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
- })
- fmt.Fprint(w, "\t}\n\n")
- // <code>; <type>; <runes>
- table := map[rune]struct{ simple, full, special string }{}
- parse("CaseFolding.txt", func(p *ucd.Parser) {
- r := p.Rune(0)
- t := p.String(1)
- v := string(p.Runes(2))
- if t != "T" && v == string(unicode.ToLower(r)) {
- return
- }
- x := table[r]
- switch t {
- case "C":
- x.full = v
- x.simple = v
- case "S":
- x.simple = v
- case "F":
- x.full = v
- case "T":
- x.special = v
- }
- table[r] = x
- })
- fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
- for r := rune(0); r < 0x10FFFF; r++ {
- x, ok := table[r]
- if !ok {
- continue
- }
- fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
- }
- fmt.Fprint(w, "\t}\n\n")
- // Break property
- notBreak := map[rune]bool{}
- parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
- switch p.String(1) {
- case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
- "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
- notBreak[p.Rune(0)] = true
- }
- })
- fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
- inBreak := false
- for r := rune(0); r <= lastRuneForTesting; r++ {
- if isBreak := !notBreak[r]; isBreak != inBreak {
- if isBreak {
- fmt.Fprintf(w, "\t\t{0x%x, ", r)
- } else {
- fmt.Fprintf(w, "0x%x},\n", r-1)
- }
- inBreak = isBreak
- }
- }
- if inBreak {
- fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
- }
- fmt.Fprint(w, "\t}\n\n")
- // Word break test
- // Filter out all samples that do not contain cased characters.
- cased := map[rune]bool{}
- parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
- if p.String(1) == "Cased" {
- cased[p.Rune(0)] = true
- }
- })
- fmt.Fprintln(w, "\tbreakTest = []string{")
- parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
- c := strings.Split(p.String(0), " ")
- const sep = '|'
- numCased := 0
- test := ""
- for ; len(c) >= 2; c = c[2:] {
- if c[0] == "÷" && test != "" {
- test += string(sep)
- }
- i, err := strconv.ParseUint(c[1], 16, 32)
- r := rune(i)
- if err != nil {
- log.Fatalf("Invalid rune %q.", c[1])
- }
- if r == sep {
- log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
- }
- if cased[r] {
- numCased++
- }
- test += string(r)
- }
- if numCased > 1 {
- fmt.Fprintf(w, "\t\t%q,\n", test)
- }
- })
- fmt.Fprintln(w, "\t}")
- fmt.Fprintln(w, ")")
- gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
- }
- // These functions are just used for verification that their definition have not
- // changed in the Unicode Standard.
- func verifyCased(r rune) bool {
- return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
- }
// verifyLower reports whether r is a lowercase letter or has the
// Other_Lowercase property according to the unicode package.
func verifyLower(r rune) bool {
	if unicode.IsLower(r) {
		return true
	}
	return unicode.Is(unicode.Other_Lowercase, r)
}
// verifyUpper reports whether r is an uppercase letter or has the
// Other_Uppercase property according to the unicode package.
func verifyUpper(r rune) bool {
	if unicode.IsUpper(r) {
		return true
	}
	return unicode.Is(unicode.Other_Uppercase, r)
}
// verifyIgnore is an approximation of the Case_Ignorable property using the
// core unicode package. It is used to reduce the size of the test data.
//
// It reports whether r belongs to one of the general categories Mn, Me, Cf,
// Lm or Sk.
func verifyIgnore(r rune) bool {
	return unicode.In(r,
		unicode.Mn,
		unicode.Me,
		unicode.Cf,
		unicode.Lm,
		unicode.Sk,
	)
}
- // printProperties prints tables of rune properties from the given UCD file.
- // A filter func f can be given to exclude certain values. A rune r will have
- // the indicated property if it is in the generated table or if f(r).
- func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
- verify := map[rune]bool{}
- n := 0
- varNameParts := strings.Split(property, "_")
- varNameParts[0] = strings.ToLower(varNameParts[0])
- fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
- parse(file, func(p *ucd.Parser) {
- if p.String(1) == property {
- r := p.Rune(0)
- verify[r] = true
- if !f(r) {
- n++
- fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
- }
- }
- })
- fmt.Fprint(w, "\t}\n\n")
- // Verify that f is correct, that is, it represents a subset of the property.
- for r := rune(0); r <= lastRuneForTesting; r++ {
- if !verify[r] && f(r) {
- log.Fatalf("Incorrect filter func for property %q.", property)
- }
- }
- return n
- }
// The newCaseTrie, sparseValues and sparseOffsets definitions below are
// placeholders referred to by gen_trieval.go. The real definitions are
// generated by this program and written to tables.go.

// newCaseTrie is a placeholder that ignores its argument and returns 0.
func newCaseTrie(_ int) int {
	return 0
}
- var (
- sparseValues [0]valueRange
- sparseOffsets [0]uint16
- )
|