publics
/
text


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833
							// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// This program generates the trie for casing operations. The Unicode casing
// algorithm requires the lookup of various properties and mappings for each
// rune. The table generated by this generator combines several of the most
// frequently used of these into a single trie so that they can be accessed
// with a single lookup.
package main

import (
	"bytes"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"reflect"
	"strconv"
	"strings"
	"unicode"

	"golang.org/x/text/internal/gen"
	"golang.org/x/text/internal/triegen"
	"golang.org/x/text/internal/ucd"
	"golang.org/x/text/unicode/norm"
)

func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}

// runeInfo contains all information for a rune that we care about for casing
// operations.
type runeInfo struct {
	Rune rune

	entry info // trie value for this rune.

	CaseMode info

	// Simple case mappings.
	Simple [1 + maxCaseMode][]rune

	// Special casing
	HasSpecial  bool
	Conditional bool
	Special     [1 + maxCaseMode][]rune

	// Folding
	FoldSimple  rune
	FoldSpecial rune
	FoldFull    []rune

	// TODO: FC_NFKC, or equivalent data.

	// Properties
	SoftDotted     bool
	CaseIgnorable  bool
	Cased          bool
	DecomposeGreek bool
	BreakType      string
	BreakCat       breakCategory

	// We care mostly about 0, Above, and IotaSubscript.
	CCC byte
}

type breakCategory int

const (
	breakBreak breakCategory = iota
	breakLetter
	breakMid
)

// mapping returns the case mapping for the given case type.
func (r *runeInfo) mapping(c info) string {
	if r.HasSpecial {
		return string(r.Special[c])
	}
	if len(r.Simple[c]) != 0 {
		return string(r.Simple[c])
	}
	return string(r.Rune)
}

func parse(file string, f func(p *ucd.Parser)) {
	ucd.Parse(gen.OpenUCDFile(file), f)
}

func parseUCD() []runeInfo {
	chars := make([]runeInfo, unicode.MaxRune)

	get := func(r rune) *runeInfo {
		c := &chars[r]
		c.Rune = r
		return c
	}

	parse("UnicodeData.txt", func(p *ucd.Parser) {
		ri := get(p.Rune(0))
		ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
		ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
		ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
		ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
		if p.String(ucd.GeneralCategory) == "Lt" {
			ri.CaseMode = cTitle
		}
	})

	// <code>; <property>
	parse("PropList.txt", func(p *ucd.Parser) {
		if p.String(1) == "Soft_Dotted" {
			chars[p.Rune(0)].SoftDotted = true
		}
	})

	// <code>; <word break type>
	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
		ri := get(p.Rune(0))
		switch p.String(1) {
		case "Case_Ignorable":
			ri.CaseIgnorable = true
		case "Cased":
			ri.Cased = true
		case "Lowercase":
			ri.CaseMode = cLower
		case "Uppercase":
			ri.CaseMode = cUpper
		}
	})

	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
	parse("SpecialCasing.txt", func(p *ucd.Parser) {
		// We drop all conditional special casing and deal with them manually in
		// the language-specific case mappers. Rune 0x03A3 is the only one with
		// a conditional formatting that is not language-specific. However,
		// dealing with this letter is tricky, especially in a streaming
		// context, so we deal with it in the Caser for Greek specifically.
		ri := get(p.Rune(0))
		if p.String(4) == "" {
			ri.HasSpecial = true
			ri.Special[cLower] = p.Runes(1)
			ri.Special[cTitle] = p.Runes(2)
			ri.Special[cUpper] = p.Runes(3)
		} else {
			ri.Conditional = true
		}
	})

	// TODO: Use text breaking according to UAX #29.
	// <code>; <word break type>
	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
		ri := get(p.Rune(0))
		ri.BreakType = p.String(1)

		// We collapse the word breaking properties onto the categories we need.
		switch p.String(1) { // TODO: officially we need to canonicalize.
		case "MidLetter", "MidNumLet", "Single_Quote":
			ri.BreakCat = breakMid
			if !ri.CaseIgnorable {
				// finalSigma relies on the fact that all breakMid runes are
				// also a Case_Ignorable. Revisit this code when this changes.
				log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
			}
		case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
			ri.BreakCat = breakLetter
		}
	})

	// <code>; <type>; <mapping>
	parse("CaseFolding.txt", func(p *ucd.Parser) {
		ri := get(p.Rune(0))
		switch p.String(1) {
		case "C":
			ri.FoldSimple = p.Rune(2)
			ri.FoldFull = p.Runes(2)
		case "S":
			ri.FoldSimple = p.Rune(2)
		case "T":
			ri.FoldSpecial = p.Rune(2)
		case "F":
			ri.FoldFull = p.Runes(2)
		default:
			log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
		}
	})

	return chars
}

func genTables() {
	chars := parseUCD()
	verifyProperties(chars)

	t := triegen.NewTrie("case")
	for i := range chars {
		c := &chars[i]
		makeEntry(c)
		t.Insert(rune(i), uint64(c.entry))
	}

	w := gen.NewCodeWriter()
	defer w.WriteVersionedGoFile("tables.go", "cases")

	gen.WriteUnicodeVersion(w)

	// TODO: write CLDR version after adding a mechanism to detect that the
	// tables on which the manually created locale-sensitive casing code is
	// based hasn't changed.

	w.WriteVar("xorData", string(xorData))
	w.WriteVar("exceptions", string(exceptionData))

	sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
	if err != nil {
		log.Fatal(err)
	}
	w.Size += sz
}

func makeEntry(ri *runeInfo) {
	if ri.CaseIgnorable {
		if ri.Cased {
			ri.entry = cIgnorableCased
		} else {
			ri.entry = cIgnorableUncased
		}
	} else {
		ri.entry = ri.CaseMode
	}

	// TODO: handle soft-dotted.

	ccc := cccOther
	switch ri.CCC {
	case 0: // Not_Reordered
		ccc = cccZero
	case above: // Above
		ccc = cccAbove
	}
	switch ri.BreakCat {
	case breakBreak:
		ccc = cccBreak
	case breakMid:
		ri.entry |= isMidBit
	}

	ri.entry |= ccc

	if ri.CaseMode == cUncased {
		return
	}

	// Need to do something special.
	if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
		makeException(ri)
		return
	}
	if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
		makeException(ri)
		return
	}

	// Rune is either lowercase or uppercase.

	orig := string(ri.Rune)
	mapped := ""
	if ri.CaseMode == cUpper {
		mapped = ri.mapping(cLower)
	} else {
		mapped = ri.mapping(cUpper)
	}

	if len(orig) != len(mapped) {
		makeException(ri)
		return
	}

	if string(ri.FoldFull) == ri.mapping(cUpper) {
		ri.entry |= inverseFoldBit
	}

	n := len(orig)

	// Create per-byte XOR mask.
	var b []byte
	for i := 0; i < n; i++ {
		b = append(b, orig[i]^mapped[i])
	}

	// Remove leading 0 bytes, but keep at least one byte.
	for ; len(b) > 1 && b[0] == 0; b = b[1:] {
	}

	if len(b) == 1 && b[0]&0xc0 == 0 {
		ri.entry |= info(b[0]) << xorShift
		return
	}

	key := string(b)
	x, ok := xorCache[key]
	if !ok {
		xorData = append(xorData, 0) // for detecting start of sequence
		xorData = append(xorData, b...)

		x = len(xorData) - 1
		xorCache[key] = x
	}
	ri.entry |= info(x<<xorShift) | xorIndexBit
}

var xorCache = map[string]int{}

// xorData contains byte-wise XOR data for the least significant bytes of a
// UTF-8 encoded rune. An index points to the last byte. The sequence starts
// with a zero terminator.
var xorData = []byte{}

// See the comments in gen_trieval.go re "the exceptions slice".
var exceptionData = []byte{0}

// makeException encodes case mappings that cannot be expressed in a simple
// XOR diff.
func makeException(ri *runeInfo) {
	ccc := ri.entry & cccMask
	// Set exception bit and retain case type.
	ri.entry &= 0x0007
	ri.entry |= exceptionBit

	if len(exceptionData) >= 1<<numExceptionBits {
		log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
	}

	// Set the offset in the exceptionData array.
	ri.entry |= info(len(exceptionData) << exceptionShift)

	orig := string(ri.Rune)
	tc := ri.mapping(cTitle)
	uc := ri.mapping(cUpper)
	lc := ri.mapping(cLower)
	ff := string(ri.FoldFull)

	// addString sets the length of a string and adds it to the expansions array.
	addString := func(s string, b *byte) {
		if len(s) == 0 {
			// Zero-length mappings exist, but only for conditional casing,
			// which we are representing outside of this table.
			log.Fatalf("%U: has zero-length mapping.", ri.Rune)
		}
		*b <<= 3
		if s != orig || ri.CaseMode == cLower {
			n := len(s)
			if n > 7 {
				log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
			}
			*b |= byte(n)
			exceptionData = append(exceptionData, s...)
		}
	}

	// byte 0:
	exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))

	// byte 1:
	p := len(exceptionData)
	exceptionData = append(exceptionData, 0)

	if len(ff) > 7 { // May be zero-length.
		log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
	}
	exceptionData = append(exceptionData, ff...)
	ct := ri.CaseMode
	if ct != cLower {
		addString(lc, &exceptionData[p])
	}
	if ct != cUpper {
		addString(uc, &exceptionData[p])
	}
	if ct != cTitle {
		addString(tc, &exceptionData[p])
	}
}

// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	sparseBlocks  [][]uint16
	sparseOffsets []uint16
	sparseCount   int
}

// makeSparse returns the number of elements that compact block would contain
// as well as the modified values.
func makeSparse(vals []uint64) ([]uint16, int) {
	// Copy the values.
	values := make([]uint16, len(vals))
	for i, v := range vals {
		values[i] = uint16(v)
	}

	alt := func(i int, v uint16) uint16 {
		if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
			// Convert cLower or cUpper to cXORCase value, which has the form 11x.
			xor := v
			xor &^= 1
			xor |= uint16(i&1) ^ (v & 1)
			xor |= 0x4
			return xor
		}
		return v
	}

	var count int
	var previous uint16
	for i, v := range values {
		if v != 0 {
			// Try if the unmodified value is equal to the previous.
			if v == previous {
				continue
			}

			// Try if the xor-ed value is equal to the previous value.
			a := alt(i, v)
			if a == previous {
				values[i] = a
				continue
			}

			// This is a new value.
			count++

			// Use the xor-ed value if it will be identical to the next value.
			if p := i + 1; p < len(values) && alt(p, values[p]) == a {
				values[i] = a
				v = a
			}
		}
		previous = v
	}
	return values, count
}

func (s *sparseCompacter) Size(v []uint64) (int, bool) {
	_, n := makeSparse(v)

	// We limit using this method to having 16 entries.
	if n > 16 {
		return 0, false
	}

	return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
}

func (s *sparseCompacter) Store(v []uint64) uint32 {
	h := uint32(len(s.sparseOffsets))
	values, sz := makeSparse(v)
	s.sparseBlocks = append(s.sparseBlocks, values)
	s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
	s.sparseCount += sz
	return h
}

func (s *sparseCompacter) Handler() string {
	// The sparse global variable and its lookup method is defined in gen_trieval.go.
	return "sparse.lookup"
}

func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
	p := func(format string, args ...interface{}) {
		_, err := fmt.Fprintf(w, format, args...)
		if retErr == nil && err != nil {
			retErr = err
		}
	}

	ls := len(s.sparseBlocks)
	if ls == len(s.sparseOffsets) {
		s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
	}
	p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
	p("var sparseOffsets = %#v\n\n", s.sparseOffsets)

	ns := s.sparseCount
	p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
	p("var sparseValues = [%d]valueRange {", ns)
	for i, values := range s.sparseBlocks {
		p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
		var v uint16
		for i, nv := range values {
			if nv != v {
				if v != 0 {
					p(",hi:%#02x},", 0x80+i-1)
				}
				if nv != 0 {
					p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
				}
			}
			v = nv
		}
		if v != 0 {
			p(",hi:%#02x},", 0x80+len(values)-1)
		}
	}
	p("\n}\n\n")
	return
}

// verifyProperties that properties of the runes that are relied upon in the
// implementation. Each property is marked with an identifier that is referred
// to in the places where it is used.
func verifyProperties(chars []runeInfo) {
	for i, c := range chars {
		r := rune(i)

		// Rune properties.

		// A.1: modifier never changes on lowercase. [ltLower]
		if c.CCC > 0 && unicode.ToLower(r) != r {
			log.Fatalf("%U: non-starter changes when lowercased", r)
		}

		// A.2: properties of decompositions starting with I or J. [ltLower]
		d := norm.NFD.PropertiesString(string(r)).Decomposition()
		if len(d) > 0 {
			if d[0] == 'I' || d[0] == 'J' {
				// A.2.1: we expect at least an ASCII character and a modifier.
				if len(d) < 3 {
					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
				}

				// All subsequent runes are modifiers and all have the same CCC.
				runes := []rune(string(d[1:]))
				ccc := chars[runes[0]].CCC

				for _, mr := range runes[1:] {
					mc := chars[mr]

					// A.2.2: all modifiers have a CCC of Above or less.
					if ccc == 0 || ccc > above {
						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
					}

					// A.2.3: a sequence of modifiers all have the same CCC.
					if mc.CCC != ccc {
						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
					}

					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
					}

					if i += len(string(mr)); i >= len(d) {
						break
					}
				}
			}
		}

		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
		}

		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
		if c.CCC == iotaSubscript && r != 0x0345 {
			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
		}

		// A.5: soft-dotted runes do not have exceptions.
		if c.SoftDotted && c.entry&exceptionBit != 0 {
			log.Fatalf("%U: soft-dotted has exception", r)
		}

		// A.6: Greek decomposition. [elUpper]
		if unicode.Is(unicode.Greek, r) {
			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
				runes := []rune(string(b))
				// A.6.1: If a Greek rune decomposes and the first rune of the
				// decomposition is greater than U+00FF, the rune is always
				// great and not a modifier.
				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
					log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
				}
				// A.6.2: Any follow-up rune in a Greek decomposition is a
				// modifier of which the first should be gobbled in
				// decomposition.
				for _, m := range runes[1:] {
					switch m {
					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
					default:
						log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
					}
				}
			}
		}

		// Breaking properties.

		// B.1: all runes with CCC > 0 are of break type Extend.
		if c.CCC > 0 && c.BreakType != "Extend" {
			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
		}

		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
		}

		// B.3: letter category.
		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
			if c.BreakCat != breakLetter {
				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
			}
		}
	}
}

func genTablesTest() {
	w := &bytes.Buffer{}

	fmt.Fprintln(w, "var (")
	printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)

	// We discard the output as we know we have perfect functions. We run them
	// just to verify the properties are correct.
	n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
	n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
	if n > 0 {
		log.Fatalf("One of the discarded properties does not have a perfect filter.")
	}

	// <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
	fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
	parse("SpecialCasing.txt", func(p *ucd.Parser) {
		// Skip conditional entries.
		if p.String(4) != "" {
			return
		}
		r := p.Rune(0)
		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
			r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
	})
	fmt.Fprint(w, "\t}\n\n")

	// <code>; <type>; <runes>
	table := map[rune]struct{ simple, full, special string }{}
	parse("CaseFolding.txt", func(p *ucd.Parser) {
		r := p.Rune(0)
		t := p.String(1)
		v := string(p.Runes(2))
		if t != "T" && v == string(unicode.ToLower(r)) {
			return
		}
		x := table[r]
		switch t {
		case "C":
			x.full = v
			x.simple = v
		case "S":
			x.simple = v
		case "F":
			x.full = v
		case "T":
			x.special = v
		}
		table[r] = x
	})
	fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
	for r := rune(0); r < 0x10FFFF; r++ {
		x, ok := table[r]
		if !ok {
			continue
		}
		fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
	}
	fmt.Fprint(w, "\t}\n\n")

	// Break property
	notBreak := map[rune]bool{}
	parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
		switch p.String(1) {
		case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
			"ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
			notBreak[p.Rune(0)] = true
		}
	})

	fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
	inBreak := false
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if isBreak := !notBreak[r]; isBreak != inBreak {
			if isBreak {
				fmt.Fprintf(w, "\t\t{0x%x, ", r)
			} else {
				fmt.Fprintf(w, "0x%x},\n", r-1)
			}
			inBreak = isBreak
		}
	}
	if inBreak {
		fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
	}
	fmt.Fprint(w, "\t}\n\n")

	// Word break test
	// Filter out all samples that do not contain cased characters.
	cased := map[rune]bool{}
	parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
		if p.String(1) == "Cased" {
			cased[p.Rune(0)] = true
		}
	})

	fmt.Fprintln(w, "\tbreakTest = []string{")
	parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
		c := strings.Split(p.String(0), " ")

		const sep = '|'
		numCased := 0
		test := ""
		for ; len(c) >= 2; c = c[2:] {
			if c[0] == "÷" && test != "" {
				test += string(sep)
			}
			i, err := strconv.ParseUint(c[1], 16, 32)
			r := rune(i)
			if err != nil {
				log.Fatalf("Invalid rune %q.", c[1])
			}
			if r == sep {
				log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
			}
			if cased[r] {
				numCased++
			}
			test += string(r)
		}
		if numCased > 1 {
			fmt.Fprintf(w, "\t\t%q,\n", test)
		}
	})
	fmt.Fprintln(w, "\t}")

	fmt.Fprintln(w, ")")

	gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
}

// These functions are just used for verification that their definition have not
// changed in the Unicode Standard.

func verifyCased(r rune) bool {
	return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
}

func verifyLower(r rune) bool {
	return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
}

func verifyUpper(r rune) bool {
	return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
}

// verifyIgnore is an approximation of the Case_Ignorable property using the
// core unicode package. It is used to reduce the size of the test data.
func verifyIgnore(r rune) bool {
	props := []*unicode.RangeTable{
		unicode.Mn,
		unicode.Me,
		unicode.Cf,
		unicode.Lm,
		unicode.Sk,
	}
	for _, p := range props {
		if unicode.Is(p, r) {
			return true
		}
	}
	return false
}

// printProperties prints tables of rune properties from the given UCD file.
// A filter func f can be given to exclude certain values. A rune r will have
// the indicated property if it is in the generated table or if f(r).
func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
	verify := map[rune]bool{}
	n := 0
	varNameParts := strings.Split(property, "_")
	varNameParts[0] = strings.ToLower(varNameParts[0])
	fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
	parse(file, func(p *ucd.Parser) {
		if p.String(1) == property {
			r := p.Rune(0)
			verify[r] = true
			if !f(r) {
				n++
				fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
			}
		}
	})
	fmt.Fprint(w, "\t}\n\n")

	// Verify that f is correct, that is, it represents a subset of the property.
	for r := rune(0); r <= lastRuneForTesting; r++ {
		if !verify[r] && f(r) {
			log.Fatalf("Incorrect filter func for property %q.", property)
		}
	}
	return n
}

// The newCaseTrie, sparseValues and sparseOffsets definitions below are
// placeholders referred to by gen_trieval.go. The real definitions are
// generated by this program and written to tables.go.

func newCaseTrie(int) int { return 0 }

var (
	sparseValues  [0]valueRange
	sparseOffsets [0]uint16
)