maketables.go 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. // Normalization table generator.
  6. // Data read from the web.
  7. // See forminfo.go for a description of the trie values associated with each rune.
  8. package main
  9. import (
  10. "bytes"
  11. "encoding/binary"
  12. "flag"
  13. "fmt"
  14. "io"
  15. "log"
  16. "sort"
  17. "strconv"
  18. "strings"
  19. "golang.org/x/text/internal/gen"
  20. "golang.org/x/text/internal/triegen"
  21. "golang.org/x/text/internal/ucd"
  22. )
  23. func main() {
  24. gen.Init()
  25. loadUnicodeData()
  26. compactCCC()
  27. loadCompositionExclusions()
  28. completeCharFields(FCanonical)
  29. completeCharFields(FCompatibility)
  30. computeNonStarterCounts()
  31. verifyComputed()
  32. printChars()
  33. testDerived()
  34. printTestdata()
  35. makeTables()
  36. }
  37. var (
  38. tablelist = flag.String("tables",
  39. "all",
  40. "comma-separated list of which tables to generate; "+
  41. "can be 'decomp', 'recomp', 'info' and 'all'")
  42. test = flag.Bool("test",
  43. false,
  44. "test existing tables against DerivedNormalizationProps and generate test data for regression testing")
  45. verbose = flag.Bool("verbose",
  46. false,
  47. "write data to stdout as it is parsed")
  48. )
  49. const MaxChar = 0x10FFFF // anything above this shouldn't exist
  50. // Quick Check properties of runes allow us to quickly
  51. // determine whether a rune may occur in a normal form.
  52. // For a given normal form, a rune may be guaranteed to occur
  53. // verbatim (QC=Yes), may or may not combine with another
  54. // rune (QC=Maybe), or may not occur (QC=No).
  55. type QCResult int
  56. const (
  57. QCUnknown QCResult = iota
  58. QCYes
  59. QCNo
  60. QCMaybe
  61. )
  62. func (r QCResult) String() string {
  63. switch r {
  64. case QCYes:
  65. return "Yes"
  66. case QCNo:
  67. return "No"
  68. case QCMaybe:
  69. return "Maybe"
  70. }
  71. return "***UNKNOWN***"
  72. }
  73. const (
  74. FCanonical = iota // NFC or NFD
  75. FCompatibility // NFKC or NFKD
  76. FNumberOfFormTypes
  77. )
  78. const (
  79. MComposed = iota // NFC or NFKC
  80. MDecomposed // NFD or NFKD
  81. MNumberOfModes
  82. )
  83. // This contains only the properties we're interested in.
  84. type Char struct {
  85. name string
  86. codePoint rune // if zero, this index is not a valid code point.
  87. ccc uint8 // canonical combining class
  88. origCCC uint8
  89. excludeInComp bool // from CompositionExclusions.txt
  90. compatDecomp bool // it has a compatibility expansion
  91. nTrailingNonStarters uint8
  92. nLeadingNonStarters uint8 // must be equal to trailing if non-zero
  93. forms [FNumberOfFormTypes]FormInfo // For FCanonical and FCompatibility
  94. state State
  95. }
  96. var chars = make([]Char, MaxChar+1)
  97. var cccMap = make(map[uint8]uint8)
  98. func (c Char) String() string {
  99. buf := new(bytes.Buffer)
  100. fmt.Fprintf(buf, "%U [%s]:\n", c.codePoint, c.name)
  101. fmt.Fprintf(buf, " ccc: %v\n", c.ccc)
  102. fmt.Fprintf(buf, " excludeInComp: %v\n", c.excludeInComp)
  103. fmt.Fprintf(buf, " compatDecomp: %v\n", c.compatDecomp)
  104. fmt.Fprintf(buf, " state: %v\n", c.state)
  105. fmt.Fprintf(buf, " NFC:\n")
  106. fmt.Fprint(buf, c.forms[FCanonical])
  107. fmt.Fprintf(buf, " NFKC:\n")
  108. fmt.Fprint(buf, c.forms[FCompatibility])
  109. return buf.String()
  110. }
  111. // In UnicodeData.txt, some ranges are marked like this:
  112. // 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  113. // 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  114. // parseCharacter keeps a state variable indicating the weirdness.
  115. type State int
  116. const (
  117. SNormal State = iota // known to be zero for the type
  118. SFirst
  119. SLast
  120. SMissing
  121. )
  122. var lastChar = rune('\u0000')
  123. func (c Char) isValid() bool {
  124. return c.codePoint != 0 && c.state != SMissing
  125. }
  126. type FormInfo struct {
  127. quickCheck [MNumberOfModes]QCResult // index: MComposed or MDecomposed
  128. verified [MNumberOfModes]bool // index: MComposed or MDecomposed
  129. combinesForward bool // May combine with rune on the right
  130. combinesBackward bool // May combine with rune on the left
  131. isOneWay bool // Never appears in result
  132. inDecomp bool // Some decompositions result in this char.
  133. decomp Decomposition
  134. expandedDecomp Decomposition
  135. }
  136. func (f FormInfo) String() string {
  137. buf := bytes.NewBuffer(make([]byte, 0))
  138. fmt.Fprintf(buf, " quickCheck[C]: %v\n", f.quickCheck[MComposed])
  139. fmt.Fprintf(buf, " quickCheck[D]: %v\n", f.quickCheck[MDecomposed])
  140. fmt.Fprintf(buf, " cmbForward: %v\n", f.combinesForward)
  141. fmt.Fprintf(buf, " cmbBackward: %v\n", f.combinesBackward)
  142. fmt.Fprintf(buf, " isOneWay: %v\n", f.isOneWay)
  143. fmt.Fprintf(buf, " inDecomp: %v\n", f.inDecomp)
  144. fmt.Fprintf(buf, " decomposition: %X\n", f.decomp)
  145. fmt.Fprintf(buf, " expandedDecomp: %X\n", f.expandedDecomp)
  146. return buf.String()
  147. }
  148. type Decomposition []rune
  149. func parseDecomposition(s string, skipfirst bool) (a []rune, err error) {
  150. decomp := strings.Split(s, " ")
  151. if len(decomp) > 0 && skipfirst {
  152. decomp = decomp[1:]
  153. }
  154. for _, d := range decomp {
  155. point, err := strconv.ParseUint(d, 16, 64)
  156. if err != nil {
  157. return a, err
  158. }
  159. a = append(a, rune(point))
  160. }
  161. return a, nil
  162. }
  163. func loadUnicodeData() {
  164. f := gen.OpenUCDFile("UnicodeData.txt")
  165. defer f.Close()
  166. p := ucd.New(f)
  167. for p.Next() {
  168. r := p.Rune(ucd.CodePoint)
  169. char := &chars[r]
  170. char.ccc = uint8(p.Uint(ucd.CanonicalCombiningClass))
  171. decmap := p.String(ucd.DecompMapping)
  172. exp, err := parseDecomposition(decmap, false)
  173. isCompat := false
  174. if err != nil {
  175. if len(decmap) > 0 {
  176. exp, err = parseDecomposition(decmap, true)
  177. if err != nil {
  178. log.Fatalf(`%U: bad decomp |%v|: "%s"`, r, decmap, err)
  179. }
  180. isCompat = true
  181. }
  182. }
  183. char.name = p.String(ucd.Name)
  184. char.codePoint = r
  185. char.forms[FCompatibility].decomp = exp
  186. if !isCompat {
  187. char.forms[FCanonical].decomp = exp
  188. } else {
  189. char.compatDecomp = true
  190. }
  191. if len(decmap) > 0 {
  192. char.forms[FCompatibility].decomp = exp
  193. }
  194. }
  195. if err := p.Err(); err != nil {
  196. log.Fatal(err)
  197. }
  198. }
  199. // compactCCC converts the sparse set of CCC values to a continguous one,
  200. // reducing the number of bits needed from 8 to 6.
  201. func compactCCC() {
  202. m := make(map[uint8]uint8)
  203. for i := range chars {
  204. c := &chars[i]
  205. m[c.ccc] = 0
  206. }
  207. cccs := []int{}
  208. for v, _ := range m {
  209. cccs = append(cccs, int(v))
  210. }
  211. sort.Ints(cccs)
  212. for i, c := range cccs {
  213. cccMap[uint8(i)] = uint8(c)
  214. m[uint8(c)] = uint8(i)
  215. }
  216. for i := range chars {
  217. c := &chars[i]
  218. c.origCCC = c.ccc
  219. c.ccc = m[c.ccc]
  220. }
  221. if len(m) >= 1<<6 {
  222. log.Fatalf("too many difference CCC values: %d >= 64", len(m))
  223. }
  224. }
  225. // CompositionExclusions.txt has form:
  226. // 0958 # ...
  227. // See https://unicode.org/reports/tr44/ for full explanation
  228. func loadCompositionExclusions() {
  229. f := gen.OpenUCDFile("CompositionExclusions.txt")
  230. defer f.Close()
  231. p := ucd.New(f)
  232. for p.Next() {
  233. c := &chars[p.Rune(0)]
  234. if c.excludeInComp {
  235. log.Fatalf("%U: Duplicate entry in exclusions.", c.codePoint)
  236. }
  237. c.excludeInComp = true
  238. }
  239. if e := p.Err(); e != nil {
  240. log.Fatal(e)
  241. }
  242. }
  243. // hasCompatDecomp returns true if any of the recursive
  244. // decompositions contains a compatibility expansion.
  245. // In this case, the character may not occur in NFK*.
  246. func hasCompatDecomp(r rune) bool {
  247. c := &chars[r]
  248. if c.compatDecomp {
  249. return true
  250. }
  251. for _, d := range c.forms[FCompatibility].decomp {
  252. if hasCompatDecomp(d) {
  253. return true
  254. }
  255. }
  256. return false
  257. }
  258. // Hangul related constants.
  259. const (
  260. HangulBase = 0xAC00
  261. HangulEnd = 0xD7A4 // hangulBase + Jamo combinations (19 * 21 * 28)
  262. JamoLBase = 0x1100
  263. JamoLEnd = 0x1113
  264. JamoVBase = 0x1161
  265. JamoVEnd = 0x1176
  266. JamoTBase = 0x11A8
  267. JamoTEnd = 0x11C3
  268. JamoLVTCount = 19 * 21 * 28
  269. JamoTCount = 28
  270. )
  271. func isHangul(r rune) bool {
  272. return HangulBase <= r && r < HangulEnd
  273. }
  274. func isHangulWithoutJamoT(r rune) bool {
  275. if !isHangul(r) {
  276. return false
  277. }
  278. r -= HangulBase
  279. return r < JamoLVTCount && r%JamoTCount == 0
  280. }
  281. func ccc(r rune) uint8 {
  282. return chars[r].ccc
  283. }
  284. // Insert a rune in a buffer, ordered by Canonical Combining Class.
  285. func insertOrdered(b Decomposition, r rune) Decomposition {
  286. n := len(b)
  287. b = append(b, 0)
  288. cc := ccc(r)
  289. if cc > 0 {
  290. // Use bubble sort.
  291. for ; n > 0; n-- {
  292. if ccc(b[n-1]) <= cc {
  293. break
  294. }
  295. b[n] = b[n-1]
  296. }
  297. }
  298. b[n] = r
  299. return b
  300. }
  301. // Recursively decompose.
  302. func decomposeRecursive(form int, r rune, d Decomposition) Decomposition {
  303. dcomp := chars[r].forms[form].decomp
  304. if len(dcomp) == 0 {
  305. return insertOrdered(d, r)
  306. }
  307. for _, c := range dcomp {
  308. d = decomposeRecursive(form, c, d)
  309. }
  310. return d
  311. }
  312. func completeCharFields(form int) {
  313. // Phase 0: pre-expand decomposition.
  314. for i := range chars {
  315. f := &chars[i].forms[form]
  316. if len(f.decomp) == 0 {
  317. continue
  318. }
  319. exp := make(Decomposition, 0)
  320. for _, c := range f.decomp {
  321. exp = decomposeRecursive(form, c, exp)
  322. }
  323. f.expandedDecomp = exp
  324. }
  325. // Phase 1: composition exclusion, mark decomposition.
  326. for i := range chars {
  327. c := &chars[i]
  328. f := &c.forms[form]
  329. // Marks script-specific exclusions and version restricted.
  330. f.isOneWay = c.excludeInComp
  331. // Singletons
  332. f.isOneWay = f.isOneWay || len(f.decomp) == 1
  333. // Non-starter decompositions
  334. if len(f.decomp) > 1 {
  335. chk := c.ccc != 0 || chars[f.decomp[0]].ccc != 0
  336. f.isOneWay = f.isOneWay || chk
  337. }
  338. // Runes that decompose into more than two runes.
  339. f.isOneWay = f.isOneWay || len(f.decomp) > 2
  340. if form == FCompatibility {
  341. f.isOneWay = f.isOneWay || hasCompatDecomp(c.codePoint)
  342. }
  343. for _, r := range f.decomp {
  344. chars[r].forms[form].inDecomp = true
  345. }
  346. }
  347. // Phase 2: forward and backward combining.
  348. for i := range chars {
  349. c := &chars[i]
  350. f := &c.forms[form]
  351. if !f.isOneWay && len(f.decomp) == 2 {
  352. f0 := &chars[f.decomp[0]].forms[form]
  353. f1 := &chars[f.decomp[1]].forms[form]
  354. if !f0.isOneWay {
  355. f0.combinesForward = true
  356. }
  357. if !f1.isOneWay {
  358. f1.combinesBackward = true
  359. }
  360. }
  361. if isHangulWithoutJamoT(rune(i)) {
  362. f.combinesForward = true
  363. }
  364. }
  365. // Phase 3: quick check values.
  366. for i := range chars {
  367. c := &chars[i]
  368. f := &c.forms[form]
  369. switch {
  370. case len(f.decomp) > 0:
  371. f.quickCheck[MDecomposed] = QCNo
  372. case isHangul(rune(i)):
  373. f.quickCheck[MDecomposed] = QCNo
  374. default:
  375. f.quickCheck[MDecomposed] = QCYes
  376. }
  377. switch {
  378. case f.isOneWay:
  379. f.quickCheck[MComposed] = QCNo
  380. case (i & 0xffff00) == JamoLBase:
  381. f.quickCheck[MComposed] = QCYes
  382. if JamoLBase <= i && i < JamoLEnd {
  383. f.combinesForward = true
  384. }
  385. if JamoVBase <= i && i < JamoVEnd {
  386. f.quickCheck[MComposed] = QCMaybe
  387. f.combinesBackward = true
  388. f.combinesForward = true
  389. }
  390. if JamoTBase <= i && i < JamoTEnd {
  391. f.quickCheck[MComposed] = QCMaybe
  392. f.combinesBackward = true
  393. }
  394. case !f.combinesBackward:
  395. f.quickCheck[MComposed] = QCYes
  396. default:
  397. f.quickCheck[MComposed] = QCMaybe
  398. }
  399. }
  400. }
  401. func computeNonStarterCounts() {
  402. // Phase 4: leading and trailing non-starter count
  403. for i := range chars {
  404. c := &chars[i]
  405. runes := []rune{rune(i)}
  406. // We always use FCompatibility so that the CGJ insertion points do not
  407. // change for repeated normalizations with different forms.
  408. if exp := c.forms[FCompatibility].expandedDecomp; len(exp) > 0 {
  409. runes = exp
  410. }
  411. // We consider runes that combine backwards to be non-starters for the
  412. // purpose of Stream-Safe Text Processing.
  413. for _, r := range runes {
  414. if cr := &chars[r]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  415. break
  416. }
  417. c.nLeadingNonStarters++
  418. }
  419. for i := len(runes) - 1; i >= 0; i-- {
  420. if cr := &chars[runes[i]]; cr.ccc == 0 && !cr.forms[FCompatibility].combinesBackward {
  421. break
  422. }
  423. c.nTrailingNonStarters++
  424. }
  425. if c.nTrailingNonStarters > 3 {
  426. log.Fatalf("%U: Decomposition with more than 3 (%d) trailing modifiers (%U)", i, c.nTrailingNonStarters, runes)
  427. }
  428. if isHangul(rune(i)) {
  429. c.nTrailingNonStarters = 2
  430. if isHangulWithoutJamoT(rune(i)) {
  431. c.nTrailingNonStarters = 1
  432. }
  433. }
  434. if l, t := c.nLeadingNonStarters, c.nTrailingNonStarters; l > 0 && l != t {
  435. log.Fatalf("%U: number of leading and trailing non-starters should be equal (%d vs %d)", i, l, t)
  436. }
  437. if t := c.nTrailingNonStarters; t > 3 {
  438. log.Fatalf("%U: number of trailing non-starters is %d > 3", t)
  439. }
  440. }
  441. }
  442. func printBytes(w io.Writer, b []byte, name string) {
  443. fmt.Fprintf(w, "// %s: %d bytes\n", name, len(b))
  444. fmt.Fprintf(w, "var %s = [...]byte {", name)
  445. for i, c := range b {
  446. switch {
  447. case i%64 == 0:
  448. fmt.Fprintf(w, "\n// Bytes %x - %x\n", i, i+63)
  449. case i%8 == 0:
  450. fmt.Fprintf(w, "\n")
  451. }
  452. fmt.Fprintf(w, "0x%.2X, ", c)
  453. }
  454. fmt.Fprint(w, "\n}\n\n")
  455. }
  456. // See forminfo.go for format.
  457. func makeEntry(f *FormInfo, c *Char) uint16 {
  458. e := uint16(0)
  459. if r := c.codePoint; HangulBase <= r && r < HangulEnd {
  460. e |= 0x40
  461. }
  462. if f.combinesForward {
  463. e |= 0x20
  464. }
  465. if f.quickCheck[MDecomposed] == QCNo {
  466. e |= 0x4
  467. }
  468. switch f.quickCheck[MComposed] {
  469. case QCYes:
  470. case QCNo:
  471. e |= 0x10
  472. case QCMaybe:
  473. e |= 0x18
  474. default:
  475. log.Fatalf("Illegal quickcheck value %v.", f.quickCheck[MComposed])
  476. }
  477. e |= uint16(c.nTrailingNonStarters)
  478. return e
  479. }
  480. // decompSet keeps track of unique decompositions, grouped by whether
  481. // the decomposition is followed by a trailing and/or leading CCC.
  482. type decompSet [7]map[string]bool
  483. const (
  484. normalDecomp = iota
  485. firstMulti
  486. firstCCC
  487. endMulti
  488. firstLeadingCCC
  489. firstCCCZeroExcept
  490. firstStarterWithNLead
  491. lastDecomp
  492. )
  493. var cname = []string{"firstMulti", "firstCCC", "endMulti", "firstLeadingCCC", "firstCCCZeroExcept", "firstStarterWithNLead", "lastDecomp"}
  494. func makeDecompSet() decompSet {
  495. m := decompSet{}
  496. for i := range m {
  497. m[i] = make(map[string]bool)
  498. }
  499. return m
  500. }
  501. func (m *decompSet) insert(key int, s string) {
  502. m[key][s] = true
  503. }
  504. func printCharInfoTables(w io.Writer) int {
  505. mkstr := func(r rune, f *FormInfo) (int, string) {
  506. d := f.expandedDecomp
  507. s := string([]rune(d))
  508. if max := 1 << 6; len(s) >= max {
  509. const msg = "%U: too many bytes in decomposition: %d >= %d"
  510. log.Fatalf(msg, r, len(s), max)
  511. }
  512. head := uint8(len(s))
  513. if f.quickCheck[MComposed] != QCYes {
  514. head |= 0x40
  515. }
  516. if f.combinesForward {
  517. head |= 0x80
  518. }
  519. s = string([]byte{head}) + s
  520. lccc := ccc(d[0])
  521. tccc := ccc(d[len(d)-1])
  522. cc := ccc(r)
  523. if cc != 0 && lccc == 0 && tccc == 0 {
  524. log.Fatalf("%U: trailing and leading ccc are 0 for non-zero ccc %d", r, cc)
  525. }
  526. if tccc < lccc && lccc != 0 {
  527. const msg = "%U: lccc (%d) must be <= tcc (%d)"
  528. log.Fatalf(msg, r, lccc, tccc)
  529. }
  530. index := normalDecomp
  531. nTrail := chars[r].nTrailingNonStarters
  532. nLead := chars[r].nLeadingNonStarters
  533. if tccc > 0 || lccc > 0 || nTrail > 0 {
  534. tccc <<= 2
  535. tccc |= nTrail
  536. s += string([]byte{tccc})
  537. index = endMulti
  538. for _, r := range d[1:] {
  539. if ccc(r) == 0 {
  540. index = firstCCC
  541. }
  542. }
  543. if lccc > 0 || nLead > 0 {
  544. s += string([]byte{lccc})
  545. if index == firstCCC {
  546. log.Fatalf("%U: multi-segment decomposition not supported for decompositions with leading CCC != 0", r)
  547. }
  548. index = firstLeadingCCC
  549. }
  550. if cc != lccc {
  551. if cc != 0 {
  552. log.Fatalf("%U: for lccc != ccc, expected ccc to be 0; was %d", r, cc)
  553. }
  554. index = firstCCCZeroExcept
  555. }
  556. } else if len(d) > 1 {
  557. index = firstMulti
  558. }
  559. return index, s
  560. }
  561. decompSet := makeDecompSet()
  562. const nLeadStr = "\x00\x01" // 0-byte length and tccc with nTrail.
  563. decompSet.insert(firstStarterWithNLead, nLeadStr)
  564. // Store the uniqued decompositions in a byte buffer,
  565. // preceded by their byte length.
  566. for _, c := range chars {
  567. for _, f := range c.forms {
  568. if len(f.expandedDecomp) == 0 {
  569. continue
  570. }
  571. if f.combinesBackward {
  572. log.Fatalf("%U: combinesBackward and decompose", c.codePoint)
  573. }
  574. index, s := mkstr(c.codePoint, &f)
  575. decompSet.insert(index, s)
  576. }
  577. }
  578. decompositions := bytes.NewBuffer(make([]byte, 0, 10000))
  579. size := 0
  580. positionMap := make(map[string]uint16)
  581. decompositions.WriteString("\000")
  582. fmt.Fprintln(w, "const (")
  583. for i, m := range decompSet {
  584. sa := []string{}
  585. for s := range m {
  586. sa = append(sa, s)
  587. }
  588. sort.Strings(sa)
  589. for _, s := range sa {
  590. p := decompositions.Len()
  591. decompositions.WriteString(s)
  592. positionMap[s] = uint16(p)
  593. }
  594. if cname[i] != "" {
  595. fmt.Fprintf(w, "%s = 0x%X\n", cname[i], decompositions.Len())
  596. }
  597. }
  598. fmt.Fprintln(w, "maxDecomp = 0x8000")
  599. fmt.Fprintln(w, ")")
  600. b := decompositions.Bytes()
  601. printBytes(w, b, "decomps")
  602. size += len(b)
  603. varnames := []string{"nfc", "nfkc"}
  604. for i := 0; i < FNumberOfFormTypes; i++ {
  605. trie := triegen.NewTrie(varnames[i])
  606. for r, c := range chars {
  607. f := c.forms[i]
  608. d := f.expandedDecomp
  609. if len(d) != 0 {
  610. _, key := mkstr(c.codePoint, &f)
  611. trie.Insert(rune(r), uint64(positionMap[key]))
  612. if c.ccc != ccc(d[0]) {
  613. // We assume the lead ccc of a decomposition !=0 in this case.
  614. if ccc(d[0]) == 0 {
  615. log.Fatalf("Expected leading CCC to be non-zero; ccc is %d", c.ccc)
  616. }
  617. }
  618. } else if c.nLeadingNonStarters > 0 && len(f.expandedDecomp) == 0 && c.ccc == 0 && !f.combinesBackward {
  619. // Handle cases where it can't be detected that the nLead should be equal
  620. // to nTrail.
  621. trie.Insert(c.codePoint, uint64(positionMap[nLeadStr]))
  622. } else if v := makeEntry(&f, &c)<<8 | uint16(c.ccc); v != 0 {
  623. trie.Insert(c.codePoint, uint64(0x8000|v))
  624. }
  625. }
  626. sz, err := trie.Gen(w, triegen.Compact(&normCompacter{name: varnames[i]}))
  627. if err != nil {
  628. log.Fatal(err)
  629. }
  630. size += sz
  631. }
  632. return size
  633. }
  634. func contains(sa []string, s string) bool {
  635. for _, a := range sa {
  636. if a == s {
  637. return true
  638. }
  639. }
  640. return false
  641. }
  642. func makeTables() {
  643. w := &bytes.Buffer{}
  644. size := 0
  645. if *tablelist == "" {
  646. return
  647. }
  648. list := strings.Split(*tablelist, ",")
  649. if *tablelist == "all" {
  650. list = []string{"recomp", "info"}
  651. }
  652. // Compute maximum decomposition size.
  653. max := 0
  654. for _, c := range chars {
  655. if n := len(string(c.forms[FCompatibility].expandedDecomp)); n > max {
  656. max = n
  657. }
  658. }
  659. fmt.Fprintln(w, `import "sync"`)
  660. fmt.Fprintln(w)
  661. fmt.Fprintln(w, "const (")
  662. fmt.Fprintln(w, "\t// Version is the Unicode edition from which the tables are derived.")
  663. fmt.Fprintf(w, "\tVersion = %q\n", gen.UnicodeVersion())
  664. fmt.Fprintln(w)
  665. fmt.Fprintln(w, "\t// MaxTransformChunkSize indicates the maximum number of bytes that Transform")
  666. fmt.Fprintln(w, "\t// may need to write atomically for any Form. Making a destination buffer at")
  667. fmt.Fprintln(w, "\t// least this size ensures that Transform can always make progress and that")
  668. fmt.Fprintln(w, "\t// the user does not need to grow the buffer on an ErrShortDst.")
  669. fmt.Fprintf(w, "\tMaxTransformChunkSize = %d+maxNonStarters*4\n", len(string(0x034F))+max)
  670. fmt.Fprintln(w, ")\n")
  671. // Print the CCC remap table.
  672. size += len(cccMap)
  673. fmt.Fprintf(w, "var ccc = [%d]uint8{", len(cccMap))
  674. for i := 0; i < len(cccMap); i++ {
  675. if i%8 == 0 {
  676. fmt.Fprintln(w)
  677. }
  678. fmt.Fprintf(w, "%3d, ", cccMap[uint8(i)])
  679. }
  680. fmt.Fprintln(w, "\n}\n")
  681. if contains(list, "info") {
  682. size += printCharInfoTables(w)
  683. }
  684. if contains(list, "recomp") {
  685. // Note that we use 32 bit keys, instead of 64 bit.
  686. // This clips the bits of three entries, but we know
  687. // this won't cause a collision. The compiler will catch
  688. // any changes made to UnicodeData.txt that introduces
  689. // a collision.
  690. // Note that the recomposition map for NFC and NFKC
  691. // are identical.
  692. // Recomposition map
  693. nrentries := 0
  694. for _, c := range chars {
  695. f := c.forms[FCanonical]
  696. if !f.isOneWay && len(f.decomp) > 0 {
  697. nrentries++
  698. }
  699. }
  700. sz := nrentries * 8
  701. size += sz
  702. fmt.Fprintf(w, "// recompMap: %d bytes (entries only)\n", sz)
  703. fmt.Fprintln(w, "var recompMap map[uint32]rune")
  704. fmt.Fprintln(w, "var recompMapOnce sync.Once\n")
  705. fmt.Fprintln(w, `const recompMapPacked = "" +`)
  706. var buf [8]byte
  707. for i, c := range chars {
  708. f := c.forms[FCanonical]
  709. d := f.decomp
  710. if !f.isOneWay && len(d) > 0 {
  711. key := uint32(uint16(d[0]))<<16 + uint32(uint16(d[1]))
  712. binary.BigEndian.PutUint32(buf[:4], key)
  713. binary.BigEndian.PutUint32(buf[4:], uint32(i))
  714. fmt.Fprintf(w, "\t\t%q + // 0x%.8X: 0x%.8X\n", string(buf[:]), key, uint32(i))
  715. }
  716. }
  717. // hack so we don't have to special case the trailing plus sign
  718. fmt.Fprintf(w, ` ""`)
  719. fmt.Fprintln(w)
  720. }
  721. fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
  722. gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes())
  723. }
  724. func printChars() {
  725. if *verbose {
  726. for _, c := range chars {
  727. if !c.isValid() || c.state == SMissing {
  728. continue
  729. }
  730. fmt.Println(c)
  731. }
  732. }
  733. }
  734. // verifyComputed does various consistency tests.
  735. func verifyComputed() {
  736. for i, c := range chars {
  737. for _, f := range c.forms {
  738. isNo := (f.quickCheck[MDecomposed] == QCNo)
  739. if (len(f.decomp) > 0) != isNo && !isHangul(rune(i)) {
  740. log.Fatalf("%U: NF*D QC must be No if rune decomposes", i)
  741. }
  742. isMaybe := f.quickCheck[MComposed] == QCMaybe
  743. if f.combinesBackward != isMaybe {
  744. log.Fatalf("%U: NF*C QC must be Maybe if combinesBackward", i)
  745. }
  746. if len(f.decomp) > 0 && f.combinesForward && isMaybe {
  747. log.Fatalf("%U: NF*C QC must be Yes or No if combinesForward and decomposes", i)
  748. }
  749. if len(f.expandedDecomp) != 0 {
  750. continue
  751. }
  752. if a, b := c.nLeadingNonStarters > 0, (c.ccc > 0 || f.combinesBackward); a != b {
  753. // We accept these runes to be treated differently (it only affects
  754. // segment breaking in iteration, most likely on improper use), but
  755. // reconsider if more characters are added.
  756. // U+FF9E HALFWIDTH KATAKANA VOICED SOUND MARK;Lm;0;L;<narrow> 3099;;;;N;;;;;
  757. // U+FF9F HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK;Lm;0;L;<narrow> 309A;;;;N;;;;;
  758. // U+3133 HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<compat> 11AA;;;;N;HANGUL LETTER GIYEOG SIOS;;;;
  759. // U+318E HANGUL LETTER ARAEAE;Lo;0;L;<compat> 11A1;;;;N;HANGUL LETTER ALAE AE;;;;
  760. // U+FFA3 HALFWIDTH HANGUL LETTER KIYEOK-SIOS;Lo;0;L;<narrow> 3133;;;;N;HALFWIDTH HANGUL LETTER GIYEOG SIOS;;;;
  761. // U+FFDC HALFWIDTH HANGUL LETTER I;Lo;0;L;<narrow> 3163;;;;N;;;;;
  762. if i != 0xFF9E && i != 0xFF9F && !(0x3133 <= i && i <= 0x318E) && !(0xFFA3 <= i && i <= 0xFFDC) {
  763. log.Fatalf("%U: nLead was %v; want %v", i, a, b)
  764. }
  765. }
  766. }
  767. nfc := c.forms[FCanonical]
  768. nfkc := c.forms[FCompatibility]
  769. if nfc.combinesBackward != nfkc.combinesBackward {
  770. log.Fatalf("%U: Cannot combine combinesBackward\n", c.codePoint)
  771. }
  772. }
  773. }
  774. // Use values in DerivedNormalizationProps.txt to compare against the
  775. // values we computed.
  776. // DerivedNormalizationProps.txt has form:
  777. // 00C0..00C5 ; NFD_QC; N # ...
  778. // 0374 ; NFD_QC; N # ...
  779. // See https://unicode.org/reports/tr44/ for full explanation
  780. func testDerived() {
  781. f := gen.OpenUCDFile("DerivedNormalizationProps.txt")
  782. defer f.Close()
  783. p := ucd.New(f)
  784. for p.Next() {
  785. r := p.Rune(0)
  786. c := &chars[r]
  787. var ftype, mode int
  788. qt := p.String(1)
  789. switch qt {
  790. case "NFC_QC":
  791. ftype, mode = FCanonical, MComposed
  792. case "NFD_QC":
  793. ftype, mode = FCanonical, MDecomposed
  794. case "NFKC_QC":
  795. ftype, mode = FCompatibility, MComposed
  796. case "NFKD_QC":
  797. ftype, mode = FCompatibility, MDecomposed
  798. default:
  799. continue
  800. }
  801. var qr QCResult
  802. switch p.String(2) {
  803. case "Y":
  804. qr = QCYes
  805. case "N":
  806. qr = QCNo
  807. case "M":
  808. qr = QCMaybe
  809. default:
  810. log.Fatalf(`Unexpected quick check value "%s"`, p.String(2))
  811. }
  812. if got := c.forms[ftype].quickCheck[mode]; got != qr {
  813. log.Printf("%U: FAILED %s (was %v need %v)\n", r, qt, got, qr)
  814. }
  815. c.forms[ftype].verified[mode] = true
  816. }
  817. if err := p.Err(); err != nil {
  818. log.Fatal(err)
  819. }
  820. // Any unspecified value must be QCYes. Verify this.
  821. for i, c := range chars {
  822. for j, fd := range c.forms {
  823. for k, qr := range fd.quickCheck {
  824. if !fd.verified[k] && qr != QCYes {
  825. m := "%U: FAIL F:%d M:%d (was %v need Yes) %s\n"
  826. log.Printf(m, i, j, k, qr, c.name)
  827. }
  828. }
  829. }
  830. }
  831. }
  832. var testHeader = `const (
  833. Yes = iota
  834. No
  835. Maybe
  836. )
  837. type formData struct {
  838. qc uint8
  839. combinesForward bool
  840. decomposition string
  841. }
  842. type runeData struct {
  843. r rune
  844. ccc uint8
  845. nLead uint8
  846. nTrail uint8
  847. f [2]formData // 0: canonical; 1: compatibility
  848. }
  849. func f(qc uint8, cf bool, dec string) [2]formData {
  850. return [2]formData{{qc, cf, dec}, {qc, cf, dec}}
  851. }
  852. func g(qc, qck uint8, cf, cfk bool, d, dk string) [2]formData {
  853. return [2]formData{{qc, cf, d}, {qck, cfk, dk}}
  854. }
  855. var testData = []runeData{
  856. `
  857. func printTestdata() {
  858. type lastInfo struct {
  859. ccc uint8
  860. nLead uint8
  861. nTrail uint8
  862. f string
  863. }
  864. last := lastInfo{}
  865. w := &bytes.Buffer{}
  866. fmt.Fprintf(w, testHeader)
  867. for r, c := range chars {
  868. f := c.forms[FCanonical]
  869. qc, cf, d := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  870. f = c.forms[FCompatibility]
  871. qck, cfk, dk := f.quickCheck[MComposed], f.combinesForward, string(f.expandedDecomp)
  872. s := ""
  873. if d == dk && qc == qck && cf == cfk {
  874. s = fmt.Sprintf("f(%s, %v, %q)", qc, cf, d)
  875. } else {
  876. s = fmt.Sprintf("g(%s, %s, %v, %v, %q, %q)", qc, qck, cf, cfk, d, dk)
  877. }
  878. current := lastInfo{c.ccc, c.nLeadingNonStarters, c.nTrailingNonStarters, s}
  879. if last != current {
  880. fmt.Fprintf(w, "\t{0x%x, %d, %d, %d, %s},\n", r, c.origCCC, c.nLeadingNonStarters, c.nTrailingNonStarters, s)
  881. last = current
  882. }
  883. }
  884. fmt.Fprintln(w, "}")
  885. gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes())
  886. }