gen.go 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. // This program generates the trie for casing operations. The Unicode casing
  7. // algorithm requires the lookup of various properties and mappings for each
  8. // rune. The table generated by this generator combines several of the most
  9. // frequently used of these into a single trie so that they can be accessed
  10. // with a single lookup.
  11. package main
  12. import (
  13. "bytes"
  14. "fmt"
  15. "io"
  16. "io/ioutil"
  17. "log"
  18. "reflect"
  19. "strconv"
  20. "strings"
  21. "unicode"
  22. "golang.org/x/text/internal/gen"
  23. "golang.org/x/text/internal/triegen"
  24. "golang.org/x/text/internal/ucd"
  25. "golang.org/x/text/unicode/norm"
  26. )
// main drives the generator. It calls gen.Init, writes the casing tables
// (genTables) and the test tables (genTablesTest), and finally calls
// gen.Repackage with gen_trieval.go, trieval.go and the package name "cases".
func main() {
	gen.Init()
	genTables()
	genTablesTest()
	gen.Repackage("gen_trieval.go", "trieval.go", "cases")
}
  33. // runeInfo contains all information for a rune that we care about for casing
  34. // operations.
  35. type runeInfo struct {
  36. Rune rune
  37. entry info // trie value for this rune.
  38. CaseMode info
  39. // Simple case mappings.
  40. Simple [1 + maxCaseMode][]rune
  41. // Special casing
  42. HasSpecial bool
  43. Conditional bool
  44. Special [1 + maxCaseMode][]rune
  45. // Folding
  46. FoldSimple rune
  47. FoldSpecial rune
  48. FoldFull []rune
  49. // TODO: FC_NFKC, or equivalent data.
  50. // Properties
  51. SoftDotted bool
  52. CaseIgnorable bool
  53. Cased bool
  54. DecomposeGreek bool
  55. BreakType string
  56. BreakCat breakCategory
  57. // We care mostly about 0, Above, and IotaSubscript.
  58. CCC byte
  59. }
  60. type breakCategory int
  61. const (
  62. breakBreak breakCategory = iota
  63. breakLetter
  64. breakMid
  65. )
  66. // mapping returns the case mapping for the given case type.
  67. func (r *runeInfo) mapping(c info) string {
  68. if r.HasSpecial {
  69. return string(r.Special[c])
  70. }
  71. if len(r.Simple[c]) != 0 {
  72. return string(r.Simple[c])
  73. }
  74. return string(r.Rune)
  75. }
  76. func parse(file string, f func(p *ucd.Parser)) {
  77. ucd.Parse(gen.OpenUCDFile(file), f)
  78. }
  79. func parseUCD() []runeInfo {
  80. chars := make([]runeInfo, unicode.MaxRune)
  81. get := func(r rune) *runeInfo {
  82. c := &chars[r]
  83. c.Rune = r
  84. return c
  85. }
  86. parse("UnicodeData.txt", func(p *ucd.Parser) {
  87. ri := get(p.Rune(0))
  88. ri.CCC = byte(p.Int(ucd.CanonicalCombiningClass))
  89. ri.Simple[cLower] = p.Runes(ucd.SimpleLowercaseMapping)
  90. ri.Simple[cUpper] = p.Runes(ucd.SimpleUppercaseMapping)
  91. ri.Simple[cTitle] = p.Runes(ucd.SimpleTitlecaseMapping)
  92. if p.String(ucd.GeneralCategory) == "Lt" {
  93. ri.CaseMode = cTitle
  94. }
  95. })
  96. // <code>; <property>
  97. parse("PropList.txt", func(p *ucd.Parser) {
  98. if p.String(1) == "Soft_Dotted" {
  99. chars[p.Rune(0)].SoftDotted = true
  100. }
  101. })
  102. // <code>; <word break type>
  103. parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
  104. ri := get(p.Rune(0))
  105. switch p.String(1) {
  106. case "Case_Ignorable":
  107. ri.CaseIgnorable = true
  108. case "Cased":
  109. ri.Cased = true
  110. case "Lowercase":
  111. ri.CaseMode = cLower
  112. case "Uppercase":
  113. ri.CaseMode = cUpper
  114. }
  115. })
  116. // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
  117. parse("SpecialCasing.txt", func(p *ucd.Parser) {
  118. // We drop all conditional special casing and deal with them manually in
  119. // the language-specific case mappers. Rune 0x03A3 is the only one with
  120. // a conditional formatting that is not language-specific. However,
  121. // dealing with this letter is tricky, especially in a streaming
  122. // context, so we deal with it in the Caser for Greek specifically.
  123. ri := get(p.Rune(0))
  124. if p.String(4) == "" {
  125. ri.HasSpecial = true
  126. ri.Special[cLower] = p.Runes(1)
  127. ri.Special[cTitle] = p.Runes(2)
  128. ri.Special[cUpper] = p.Runes(3)
  129. } else {
  130. ri.Conditional = true
  131. }
  132. })
  133. // TODO: Use text breaking according to UAX #29.
  134. // <code>; <word break type>
  135. parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
  136. ri := get(p.Rune(0))
  137. ri.BreakType = p.String(1)
  138. // We collapse the word breaking properties onto the categories we need.
  139. switch p.String(1) { // TODO: officially we need to canonicalize.
  140. case "MidLetter", "MidNumLet", "Single_Quote":
  141. ri.BreakCat = breakMid
  142. if !ri.CaseIgnorable {
  143. // finalSigma relies on the fact that all breakMid runes are
  144. // also a Case_Ignorable. Revisit this code when this changes.
  145. log.Fatalf("Rune %U, which has a break category mid, is not a case ignorable", ri)
  146. }
  147. case "ALetter", "Hebrew_Letter", "Numeric", "Extend", "ExtendNumLet", "Format", "ZWJ":
  148. ri.BreakCat = breakLetter
  149. }
  150. })
  151. // <code>; <type>; <mapping>
  152. parse("CaseFolding.txt", func(p *ucd.Parser) {
  153. ri := get(p.Rune(0))
  154. switch p.String(1) {
  155. case "C":
  156. ri.FoldSimple = p.Rune(2)
  157. ri.FoldFull = p.Runes(2)
  158. case "S":
  159. ri.FoldSimple = p.Rune(2)
  160. case "T":
  161. ri.FoldSpecial = p.Rune(2)
  162. case "F":
  163. ri.FoldFull = p.Runes(2)
  164. default:
  165. log.Fatalf("%U: unknown type: %s", p.Rune(0), p.String(1))
  166. }
  167. })
  168. return chars
  169. }
  170. func genTables() {
  171. chars := parseUCD()
  172. verifyProperties(chars)
  173. t := triegen.NewTrie("case")
  174. for i := range chars {
  175. c := &chars[i]
  176. makeEntry(c)
  177. t.Insert(rune(i), uint64(c.entry))
  178. }
  179. w := gen.NewCodeWriter()
  180. defer w.WriteVersionedGoFile("tables.go", "cases")
  181. gen.WriteUnicodeVersion(w)
  182. // TODO: write CLDR version after adding a mechanism to detect that the
  183. // tables on which the manually created locale-sensitive casing code is
  184. // based hasn't changed.
  185. w.WriteVar("xorData", string(xorData))
  186. w.WriteVar("exceptions", string(exceptionData))
  187. sz, err := t.Gen(w, triegen.Compact(&sparseCompacter{}))
  188. if err != nil {
  189. log.Fatal(err)
  190. }
  191. w.Size += sz
  192. }
  193. func makeEntry(ri *runeInfo) {
  194. if ri.CaseIgnorable {
  195. if ri.Cased {
  196. ri.entry = cIgnorableCased
  197. } else {
  198. ri.entry = cIgnorableUncased
  199. }
  200. } else {
  201. ri.entry = ri.CaseMode
  202. }
  203. // TODO: handle soft-dotted.
  204. ccc := cccOther
  205. switch ri.CCC {
  206. case 0: // Not_Reordered
  207. ccc = cccZero
  208. case above: // Above
  209. ccc = cccAbove
  210. }
  211. switch ri.BreakCat {
  212. case breakBreak:
  213. ccc = cccBreak
  214. case breakMid:
  215. ri.entry |= isMidBit
  216. }
  217. ri.entry |= ccc
  218. if ri.CaseMode == cUncased {
  219. return
  220. }
  221. // Need to do something special.
  222. if ri.CaseMode == cTitle || ri.HasSpecial || ri.mapping(cTitle) != ri.mapping(cUpper) {
  223. makeException(ri)
  224. return
  225. }
  226. if f := string(ri.FoldFull); len(f) > 0 && f != ri.mapping(cUpper) && f != ri.mapping(cLower) {
  227. makeException(ri)
  228. return
  229. }
  230. // Rune is either lowercase or uppercase.
  231. orig := string(ri.Rune)
  232. mapped := ""
  233. if ri.CaseMode == cUpper {
  234. mapped = ri.mapping(cLower)
  235. } else {
  236. mapped = ri.mapping(cUpper)
  237. }
  238. if len(orig) != len(mapped) {
  239. makeException(ri)
  240. return
  241. }
  242. if string(ri.FoldFull) == ri.mapping(cUpper) {
  243. ri.entry |= inverseFoldBit
  244. }
  245. n := len(orig)
  246. // Create per-byte XOR mask.
  247. var b []byte
  248. for i := 0; i < n; i++ {
  249. b = append(b, orig[i]^mapped[i])
  250. }
  251. // Remove leading 0 bytes, but keep at least one byte.
  252. for ; len(b) > 1 && b[0] == 0; b = b[1:] {
  253. }
  254. if len(b) == 1 && b[0]&0xc0 == 0 {
  255. ri.entry |= info(b[0]) << xorShift
  256. return
  257. }
  258. key := string(b)
  259. x, ok := xorCache[key]
  260. if !ok {
  261. xorData = append(xorData, 0) // for detecting start of sequence
  262. xorData = append(xorData, b...)
  263. x = len(xorData) - 1
  264. xorCache[key] = x
  265. }
  266. ri.entry |= info(x<<xorShift) | xorIndexBit
  267. }
  268. var xorCache = map[string]int{}
  269. // xorData contains byte-wise XOR data for the least significant bytes of a
  270. // UTF-8 encoded rune. An index points to the last byte. The sequence starts
  271. // with a zero terminator.
  272. var xorData = []byte{}
  273. // See the comments in gen_trieval.go re "the exceptions slice".
  274. var exceptionData = []byte{0}
  275. // makeException encodes case mappings that cannot be expressed in a simple
  276. // XOR diff.
  277. func makeException(ri *runeInfo) {
  278. ccc := ri.entry & cccMask
  279. // Set exception bit and retain case type.
  280. ri.entry &= 0x0007
  281. ri.entry |= exceptionBit
  282. if len(exceptionData) >= 1<<numExceptionBits {
  283. log.Fatalf("%U:exceptionData too large %#x > %d bits", ri.Rune, len(exceptionData), numExceptionBits)
  284. }
  285. // Set the offset in the exceptionData array.
  286. ri.entry |= info(len(exceptionData) << exceptionShift)
  287. orig := string(ri.Rune)
  288. tc := ri.mapping(cTitle)
  289. uc := ri.mapping(cUpper)
  290. lc := ri.mapping(cLower)
  291. ff := string(ri.FoldFull)
  292. // addString sets the length of a string and adds it to the expansions array.
  293. addString := func(s string, b *byte) {
  294. if len(s) == 0 {
  295. // Zero-length mappings exist, but only for conditional casing,
  296. // which we are representing outside of this table.
  297. log.Fatalf("%U: has zero-length mapping.", ri.Rune)
  298. }
  299. *b <<= 3
  300. if s != orig || ri.CaseMode == cLower {
  301. n := len(s)
  302. if n > 7 {
  303. log.Fatalf("%U: mapping larger than 7 (%d)", ri.Rune, n)
  304. }
  305. *b |= byte(n)
  306. exceptionData = append(exceptionData, s...)
  307. }
  308. }
  309. // byte 0:
  310. exceptionData = append(exceptionData, byte(ccc)|byte(len(ff)))
  311. // byte 1:
  312. p := len(exceptionData)
  313. exceptionData = append(exceptionData, 0)
  314. if len(ff) > 7 { // May be zero-length.
  315. log.Fatalf("%U: fold string larger than 7 (%d)", ri.Rune, len(ff))
  316. }
  317. exceptionData = append(exceptionData, ff...)
  318. ct := ri.CaseMode
  319. if ct != cLower {
  320. addString(lc, &exceptionData[p])
  321. }
  322. if ct != cUpper {
  323. addString(uc, &exceptionData[p])
  324. }
  325. if ct != cTitle {
  326. addString(tc, &exceptionData[p])
  327. }
  328. }
// sparseCompacter is a trie value block Compacter. There are many cases where
// successive runes alternate between lower- and upper-case. This Compacter
// exploits this by adding a special case type where the case value is obtained
// from or-ing it with the least-significant bit of the rune, creating large
// ranges of equal case values that compress well.
type sparseCompacter struct {
	// sparseBlocks holds one slice of (possibly rewritten) values per call to
	// Store, as returned by makeSparse.
	sparseBlocks [][]uint16
	// sparseOffsets holds, per stored block, the value of sparseCount at the
	// time the block was stored. Print appends one final offset.
	sparseOffsets []uint16
	// sparseCount is the running total of the range counts reported by
	// makeSparse for all stored blocks.
	sparseCount int
}
  339. // makeSparse returns the number of elements that compact block would contain
  340. // as well as the modified values.
  341. func makeSparse(vals []uint64) ([]uint16, int) {
  342. // Copy the values.
  343. values := make([]uint16, len(vals))
  344. for i, v := range vals {
  345. values[i] = uint16(v)
  346. }
  347. alt := func(i int, v uint16) uint16 {
  348. if cm := info(v & fullCasedMask); cm == cUpper || cm == cLower {
  349. // Convert cLower or cUpper to cXORCase value, which has the form 11x.
  350. xor := v
  351. xor &^= 1
  352. xor |= uint16(i&1) ^ (v & 1)
  353. xor |= 0x4
  354. return xor
  355. }
  356. return v
  357. }
  358. var count int
  359. var previous uint16
  360. for i, v := range values {
  361. if v != 0 {
  362. // Try if the unmodified value is equal to the previous.
  363. if v == previous {
  364. continue
  365. }
  366. // Try if the xor-ed value is equal to the previous value.
  367. a := alt(i, v)
  368. if a == previous {
  369. values[i] = a
  370. continue
  371. }
  372. // This is a new value.
  373. count++
  374. // Use the xor-ed value if it will be identical to the next value.
  375. if p := i + 1; p < len(values) && alt(p, values[p]) == a {
  376. values[i] = a
  377. v = a
  378. }
  379. }
  380. previous = v
  381. }
  382. return values, count
  383. }
  384. func (s *sparseCompacter) Size(v []uint64) (int, bool) {
  385. _, n := makeSparse(v)
  386. // We limit using this method to having 16 entries.
  387. if n > 16 {
  388. return 0, false
  389. }
  390. return 2 + int(reflect.TypeOf(valueRange{}).Size())*n, true
  391. }
  392. func (s *sparseCompacter) Store(v []uint64) uint32 {
  393. h := uint32(len(s.sparseOffsets))
  394. values, sz := makeSparse(v)
  395. s.sparseBlocks = append(s.sparseBlocks, values)
  396. s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
  397. s.sparseCount += sz
  398. return h
  399. }
  400. func (s *sparseCompacter) Handler() string {
  401. // The sparse global variable and its lookup method is defined in gen_trieval.go.
  402. return "sparse.lookup"
  403. }
  404. func (s *sparseCompacter) Print(w io.Writer) (retErr error) {
  405. p := func(format string, args ...interface{}) {
  406. _, err := fmt.Fprintf(w, format, args...)
  407. if retErr == nil && err != nil {
  408. retErr = err
  409. }
  410. }
  411. ls := len(s.sparseBlocks)
  412. if ls == len(s.sparseOffsets) {
  413. s.sparseOffsets = append(s.sparseOffsets, uint16(s.sparseCount))
  414. }
  415. p("// sparseOffsets: %d entries, %d bytes\n", ls+1, (ls+1)*2)
  416. p("var sparseOffsets = %#v\n\n", s.sparseOffsets)
  417. ns := s.sparseCount
  418. p("// sparseValues: %d entries, %d bytes\n", ns, ns*4)
  419. p("var sparseValues = [%d]valueRange {", ns)
  420. for i, values := range s.sparseBlocks {
  421. p("\n// Block %#x, offset %#x", i, s.sparseOffsets[i])
  422. var v uint16
  423. for i, nv := range values {
  424. if nv != v {
  425. if v != 0 {
  426. p(",hi:%#02x},", 0x80+i-1)
  427. }
  428. if nv != 0 {
  429. p("\n{value:%#04x,lo:%#02x", nv, 0x80+i)
  430. }
  431. }
  432. v = nv
  433. }
  434. if v != 0 {
  435. p(",hi:%#02x},", 0x80+len(values)-1)
  436. }
  437. }
  438. p("\n}\n\n")
  439. return
  440. }
// verifyProperties checks the properties of the runes that are relied upon in
// the implementation. Each property is marked with an identifier that is
// referred to in the places where it is used. Any violation terminates the
// program through log.Fatalf.
func verifyProperties(chars []runeInfo) {
	for i, c := range chars {
		r := rune(i)

		// Rune properties.

		// A.1: modifier never changes on lowercase. [ltLower]
		if c.CCC > 0 && unicode.ToLower(r) != r {
			log.Fatalf("%U: non-starter changes when lowercased", r)
		}

		// A.2: properties of decompositions starting with I or J. [ltLower]
		d := norm.NFD.PropertiesString(string(r)).Decomposition()
		if len(d) > 0 {
			if d[0] == 'I' || d[0] == 'J' {
				// A.2.1: we expect at least an ASCII character and a modifier.
				if len(d) < 3 {
					log.Fatalf("%U: length of decomposition was %d; want >= 3", r, len(d))
				}

				// All subsequent runes are modifiers and all have the same CCC.
				// runes holds the modifiers only; ccc is the class of the
				// first one, and the loop below compares the follow-up
				// modifiers against it.
				runes := []rune(string(d[1:]))
				ccc := chars[runes[0]].CCC

				for _, mr := range runes[1:] {
					mc := chars[mr]

					// A.2.2: all modifiers have a CCC of Above or less.
					// NOTE(review): this tests ccc, the class of the first
					// modifier, but sits inside the loop over the follow-up
					// modifiers, so it only runs for decompositions with at
					// least two modifiers.
					if ccc == 0 || ccc > above {
						log.Fatalf("%U: CCC of successive rune (%U) was %d; want (0,230]", r, mr, ccc)
					}

					// A.2.3: a sequence of modifiers all have the same CCC.
					if mc.CCC != ccc {
						log.Fatalf("%U: CCC of follow-up modifier (%U) was %d; want %d", r, mr, mc.CCC, ccc)
					}

					// A.2.4: for each trailing r, r in [0x300, 0x311] <=> CCC == Above.
					if (ccc == above) != (0x300 <= mr && mr <= 0x311) {
						log.Fatalf("%U: modifier %U in [U+0300, U+0311] != ccc(%U) == 230", r, mr, mr)
					}

					// NOTE(review): i is the index variable of the outer
					// range loop, i.e. the rune value, not a byte offset into
					// d. It is not read again in this iteration after this
					// statement. It looks like this condition holds on the
					// first pass for any rune that reaches this point, ending
					// the loop after one follow-up modifier — confirm intent.
					if i += len(string(mr)); i >= len(d) {
						break
					}
				}
			}
		}

		// A.3: no U+0307 in decomposition of Soft-Dotted rune. [ltUpper]
		if unicode.Is(unicode.Soft_Dotted, r) && strings.Contains(string(d), "\u0307") {
			log.Fatalf("%U: decomposition of soft-dotted rune may not contain U+0307", r)
		}

		// A.4: only rune U+0345 may be of CCC Iota_Subscript. [elUpper]
		if c.CCC == iotaSubscript && r != 0x0345 {
			log.Fatalf("%U: only rune U+0345 may have CCC Iota_Subscript", r)
		}

		// A.5: soft-dotted runes do not have exceptions.
		// NOTE(review): genTables calls verifyProperties before it calls
		// makeEntry, and parseUCD does not assign entry, so c.entry appears
		// to be zero for every rune here — confirm whether this check can
		// ever fire.
		if c.SoftDotted && c.entry&exceptionBit != 0 {
			log.Fatalf("%U: soft-dotted has exception", r)
		}

		// A.6: Greek decomposition. [elUpper]
		if unicode.Is(unicode.Greek, r) {
			if b := norm.NFD.PropertiesString(string(r)).Decomposition(); b != nil {
				runes := []rune(string(b))
				// A.6.1: If a Greek rune decomposes and the first rune of the
				// decomposition is greater than U+00FF, the rune is always
				// Greek and not a modifier.
				if f := runes[0]; unicode.IsMark(f) || f > 0xFF && !unicode.Is(unicode.Greek, f) {
					log.Fatalf("%U: expected first rune of Greek decomposition to be letter, found %U", r, f)
				}
				// A.6.2: Any follow-up rune in a Greek decomposition is a
				// modifier of which the first should be gobbled in
				// decomposition.
				for _, m := range runes[1:] {
					switch m {
					case 0x0313, 0x0314, 0x0301, 0x0300, 0x0306, 0x0342, 0x0308, 0x0304, 0x345:
					default:
						log.Fatalf("%U: modifier %U is outside of expected Greek modifier set", r, m)
					}
				}
			}
		}

		// Breaking properties.

		// B.1: all runes with CCC > 0 are of break type Extend.
		if c.CCC > 0 && c.BreakType != "Extend" {
			log.Fatalf("%U: CCC == %d, but got break type %s; want Extend", r, c.CCC, c.BreakType)
		}

		// B.2: all cased runes with c.CCC == 0 are of break type ALetter.
		if c.CCC == 0 && c.Cased && c.BreakType != "ALetter" {
			log.Fatalf("%U: cased, but got break type %s; want ALetter", r, c.BreakType)
		}

		// B.3: letter category.
		if c.CCC == 0 && c.BreakCat != breakBreak && !c.CaseIgnorable {
			if c.BreakCat != breakLetter {
				log.Fatalf("%U: check for letter break type gave %d; want %d", r, c.BreakCat, breakLetter)
			}
		}
	}
}
  534. func genTablesTest() {
  535. w := &bytes.Buffer{}
  536. fmt.Fprintln(w, "var (")
  537. printProperties(w, "DerivedCoreProperties.txt", "Case_Ignorable", verifyIgnore)
  538. // We discard the output as we know we have perfect functions. We run them
  539. // just to verify the properties are correct.
  540. n := printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Cased", verifyCased)
  541. n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Lowercase", verifyLower)
  542. n += printProperties(ioutil.Discard, "DerivedCoreProperties.txt", "Uppercase", verifyUpper)
  543. if n > 0 {
  544. log.Fatalf("One of the discarded properties does not have a perfect filter.")
  545. }
  546. // <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)?
  547. fmt.Fprintln(w, "\tspecial = map[rune]struct{ toLower, toTitle, toUpper string }{")
  548. parse("SpecialCasing.txt", func(p *ucd.Parser) {
  549. // Skip conditional entries.
  550. if p.String(4) != "" {
  551. return
  552. }
  553. r := p.Rune(0)
  554. fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n",
  555. r, string(p.Runes(1)), string(p.Runes(2)), string(p.Runes(3)))
  556. })
  557. fmt.Fprint(w, "\t}\n\n")
  558. // <code>; <type>; <runes>
  559. table := map[rune]struct{ simple, full, special string }{}
  560. parse("CaseFolding.txt", func(p *ucd.Parser) {
  561. r := p.Rune(0)
  562. t := p.String(1)
  563. v := string(p.Runes(2))
  564. if t != "T" && v == string(unicode.ToLower(r)) {
  565. return
  566. }
  567. x := table[r]
  568. switch t {
  569. case "C":
  570. x.full = v
  571. x.simple = v
  572. case "S":
  573. x.simple = v
  574. case "F":
  575. x.full = v
  576. case "T":
  577. x.special = v
  578. }
  579. table[r] = x
  580. })
  581. fmt.Fprintln(w, "\tfoldMap = map[rune]struct{ simple, full, special string }{")
  582. for r := rune(0); r < 0x10FFFF; r++ {
  583. x, ok := table[r]
  584. if !ok {
  585. continue
  586. }
  587. fmt.Fprintf(w, "\t\t0x%04x: {%q, %q, %q},\n", r, x.simple, x.full, x.special)
  588. }
  589. fmt.Fprint(w, "\t}\n\n")
  590. // Break property
  591. notBreak := map[rune]bool{}
  592. parse("auxiliary/WordBreakProperty.txt", func(p *ucd.Parser) {
  593. switch p.String(1) {
  594. case "Extend", "Format", "MidLetter", "MidNumLet", "Single_Quote",
  595. "ALetter", "Hebrew_Letter", "Numeric", "ExtendNumLet", "ZWJ":
  596. notBreak[p.Rune(0)] = true
  597. }
  598. })
  599. fmt.Fprintln(w, "\tbreakProp = []struct{ lo, hi rune }{")
  600. inBreak := false
  601. for r := rune(0); r <= lastRuneForTesting; r++ {
  602. if isBreak := !notBreak[r]; isBreak != inBreak {
  603. if isBreak {
  604. fmt.Fprintf(w, "\t\t{0x%x, ", r)
  605. } else {
  606. fmt.Fprintf(w, "0x%x},\n", r-1)
  607. }
  608. inBreak = isBreak
  609. }
  610. }
  611. if inBreak {
  612. fmt.Fprintf(w, "0x%x},\n", lastRuneForTesting)
  613. }
  614. fmt.Fprint(w, "\t}\n\n")
  615. // Word break test
  616. // Filter out all samples that do not contain cased characters.
  617. cased := map[rune]bool{}
  618. parse("DerivedCoreProperties.txt", func(p *ucd.Parser) {
  619. if p.String(1) == "Cased" {
  620. cased[p.Rune(0)] = true
  621. }
  622. })
  623. fmt.Fprintln(w, "\tbreakTest = []string{")
  624. parse("auxiliary/WordBreakTest.txt", func(p *ucd.Parser) {
  625. c := strings.Split(p.String(0), " ")
  626. const sep = '|'
  627. numCased := 0
  628. test := ""
  629. for ; len(c) >= 2; c = c[2:] {
  630. if c[0] == "÷" && test != "" {
  631. test += string(sep)
  632. }
  633. i, err := strconv.ParseUint(c[1], 16, 32)
  634. r := rune(i)
  635. if err != nil {
  636. log.Fatalf("Invalid rune %q.", c[1])
  637. }
  638. if r == sep {
  639. log.Fatalf("Separator %q not allowed in test data. Pick another one.", sep)
  640. }
  641. if cased[r] {
  642. numCased++
  643. }
  644. test += string(r)
  645. }
  646. if numCased > 1 {
  647. fmt.Fprintf(w, "\t\t%q,\n", test)
  648. }
  649. })
  650. fmt.Fprintln(w, "\t}")
  651. fmt.Fprintln(w, ")")
  652. gen.WriteVersionedGoFile("tables_test.go", "cases", w.Bytes())
  653. }
  654. // These functions are just used for verification that their definition have not
  655. // changed in the Unicode Standard.
  656. func verifyCased(r rune) bool {
  657. return verifyLower(r) || verifyUpper(r) || unicode.IsTitle(r)
  658. }
  659. func verifyLower(r rune) bool {
  660. return unicode.IsLower(r) || unicode.Is(unicode.Other_Lowercase, r)
  661. }
  662. func verifyUpper(r rune) bool {
  663. return unicode.IsUpper(r) || unicode.Is(unicode.Other_Uppercase, r)
  664. }
  665. // verifyIgnore is an approximation of the Case_Ignorable property using the
  666. // core unicode package. It is used to reduce the size of the test data.
  667. func verifyIgnore(r rune) bool {
  668. props := []*unicode.RangeTable{
  669. unicode.Mn,
  670. unicode.Me,
  671. unicode.Cf,
  672. unicode.Lm,
  673. unicode.Sk,
  674. }
  675. for _, p := range props {
  676. if unicode.Is(p, r) {
  677. return true
  678. }
  679. }
  680. return false
  681. }
  682. // printProperties prints tables of rune properties from the given UCD file.
  683. // A filter func f can be given to exclude certain values. A rune r will have
  684. // the indicated property if it is in the generated table or if f(r).
  685. func printProperties(w io.Writer, file, property string, f func(r rune) bool) int {
  686. verify := map[rune]bool{}
  687. n := 0
  688. varNameParts := strings.Split(property, "_")
  689. varNameParts[0] = strings.ToLower(varNameParts[0])
  690. fmt.Fprintf(w, "\t%s = map[rune]bool{\n", strings.Join(varNameParts, ""))
  691. parse(file, func(p *ucd.Parser) {
  692. if p.String(1) == property {
  693. r := p.Rune(0)
  694. verify[r] = true
  695. if !f(r) {
  696. n++
  697. fmt.Fprintf(w, "\t\t0x%.4x: true,\n", r)
  698. }
  699. }
  700. })
  701. fmt.Fprint(w, "\t}\n\n")
  702. // Verify that f is correct, that is, it represents a subset of the property.
  703. for r := rune(0); r <= lastRuneForTesting; r++ {
  704. if !verify[r] && f(r) {
  705. log.Fatalf("Incorrect filter func for property %q.", property)
  706. }
  707. }
  708. return n
  709. }
  710. // The newCaseTrie, sparseValues and sparseOffsets definitions below are
  711. // placeholders referred to by gen_trieval.go. The real definitions are
  712. // generated by this program and written to tables.go.
  713. func newCaseTrie(int) int { return 0 }
  714. var (
  715. sparseValues [0]valueRange
  716. sparseOffsets [0]uint16
  717. )