gen.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. // Language tag table generator.
  7. // Data read from the web.
  8. package main
  9. import (
  10. "flag"
  11. "fmt"
  12. "io"
  13. "log"
  14. "sort"
  15. "strconv"
  16. "strings"
  17. "golang.org/x/text/internal/gen"
  18. "golang.org/x/text/internal/language"
  19. "golang.org/x/text/unicode/cldr"
  20. )
  21. var (
  22. test = flag.Bool("test",
  23. false,
  24. "test existing tables; can be used to compare web data with package data.")
  25. outputFile = flag.String("output",
  26. "tables.go",
  27. "output file for generated tables")
  28. )
  29. func main() {
  30. gen.Init()
  31. w := gen.NewCodeWriter()
  32. defer w.WriteGoFile("tables.go", "language")
  33. b := newBuilder(w)
  34. gen.WriteCLDRVersion(w)
  35. b.writeConstants()
  36. b.writeMatchData()
  37. }
// builder carries the state shared by the table-writing methods: the output
// writer and the decoded CLDR data. It is created by newBuilder.
type builder struct {
	w    *gen.CodeWriter        // receives the generated code
	hw   io.Writer              // MultiWriter for w and w.Hash
	data *cldr.CLDR             // result of decoding the CLDR core zip
	supp *cldr.SupplementalData // data.Supplemental(), stored once by newBuilder
}
  44. func (b *builder) langIndex(s string) uint16 {
  45. return uint16(language.MustParseBase(s))
  46. }
  47. func (b *builder) regionIndex(s string) int {
  48. return int(language.MustParseRegion(s))
  49. }
  50. func (b *builder) scriptIndex(s string) int {
  51. return int(language.MustParseScript(s))
  52. }
  53. func newBuilder(w *gen.CodeWriter) *builder {
  54. r := gen.OpenCLDRCoreZip()
  55. defer r.Close()
  56. d := &cldr.Decoder{}
  57. data, err := d.DecodeZip(r)
  58. if err != nil {
  59. log.Fatal(err)
  60. }
  61. b := builder{
  62. w: w,
  63. hw: io.MultiWriter(w, w.Hash),
  64. data: data,
  65. supp: data.Supplemental(),
  66. }
  67. return &b
  68. }
  69. // writeConsts computes f(v) for all v in values and writes the results
  70. // as constants named _v to a single constant block.
  71. func (b *builder) writeConsts(f func(string) int, values ...string) {
  72. fmt.Fprintln(b.w, "const (")
  73. for _, v := range values {
  74. fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
  75. }
  76. fmt.Fprintln(b.w, ")")
  77. }
  78. // TODO: region inclusion data will probably not be used in future matchers.
  79. var langConsts = []string{
  80. "de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
  81. }
  82. var scriptConsts = []string{
  83. "Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
  84. "Zzzz",
  85. }
  86. var regionConsts = []string{
  87. "001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
  88. "ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
  89. }
  90. func (b *builder) writeConstants() {
  91. b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
  92. b.writeConsts(b.regionIndex, regionConsts...)
  93. b.writeConsts(b.scriptIndex, scriptConsts...)
  94. }
// mutualIntelligibility is one entry of the matchLang table: a pair of base
// languages together with the distance between them.
//
// writeMatchData passes a value of this type to b.w.WriteType, so the
// declaration presumably also determines the type emitted into the generated
// file — keep field names, types and order stable unless that is intended.
type mutualIntelligibility struct {
	want, have uint16 // language IDs (from langIndex) of the desired and supported language
	distance   uint8  // distance attribute of the CLDR languageMatch entry
	oneway     bool   // true if the entry's oneway attribute is "true"
}
// scriptIntelligibility is one entry of the matchScript table: a desired
// language/script pair that can be served by a supported language/script
// pair at the given distance.
//
// writeMatchData passes a value of this type to b.w.WriteType, so the
// declaration presumably also determines the type emitted into the generated
// file — keep field names, types and order stable unless that is intended.
type scriptIntelligibility struct {
	// Language IDs (from langIndex) of the desired and supported side.
	wantLang, haveLang uint16
	// Script IDs (from scriptIndex, narrowed to uint8) of the desired and
	// supported side.
	wantScript, haveScript uint8
	// Distance attribute of the CLDR languageMatch entry.
	distance uint8
	// Always oneway: writeMatchData appends a second, swapped entry for
	// CLDR matches that are not marked oneway.
}
// regionIntelligibility is one entry of the matchRegion table: a distance
// that applies to a language (optionally restricted to a script) and a region
// group.
//
// writeMatchData passes a value of this type to b.w.WriteType, so the
// declaration presumably also determines the type emitted into the generated
// file — keep field names, types and order stable unless that is intended.
type regionIntelligibility struct {
	lang     uint16 // compact language id
	script   uint8  // 0 means any
	group    uint8  // 0 means any; if bit 7 is set it means inverse
	distance uint8  // distance attribute of the CLDR languageMatch entry
	// Always twoway.
}
  113. // writeMatchData writes tables with languages and scripts for which there is
  114. // mutual intelligibility. The data is based on CLDR's languageMatching data.
  115. // Note that we use a different algorithm than the one defined by CLDR and that
  116. // we slightly modify the data. For example, we convert scores to confidence levels.
  117. // We also drop all region-related data as we use a different algorithm to
  118. // determine region equivalence.
  119. func (b *builder) writeMatchData() {
  120. lm := b.supp.LanguageMatching.LanguageMatches
  121. cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
  122. regionHierarchy := map[string][]string{}
  123. for _, g := range b.supp.TerritoryContainment.Group {
  124. regions := strings.Split(g.Contains, " ")
  125. regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
  126. }
  127. // Regions start at 1, so the slice must be one larger than the number of
  128. // regions.
  129. regionToGroups := make([]uint8, language.NumRegions+1)
  130. idToIndex := map[string]uint8{}
  131. for i, mv := range lm[0].MatchVariable {
  132. if i > 6 {
  133. log.Fatalf("Too many groups: %d", i)
  134. }
  135. idToIndex[mv.Id] = uint8(i + 1)
  136. // TODO: also handle '-'
  137. for _, r := range strings.Split(mv.Value, "+") {
  138. todo := []string{r}
  139. for k := 0; k < len(todo); k++ {
  140. r := todo[k]
  141. regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
  142. todo = append(todo, regionHierarchy[r]...)
  143. }
  144. }
  145. }
  146. b.w.WriteVar("regionToGroups", regionToGroups)
  147. // maps language id to in- and out-of-group region.
  148. paradigmLocales := [][3]uint16{}
  149. locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
  150. for i := 0; i < len(locales); i += 2 {
  151. x := [3]uint16{}
  152. for j := 0; j < 2; j++ {
  153. pc := strings.SplitN(locales[i+j], "-", 2)
  154. x[0] = b.langIndex(pc[0])
  155. if len(pc) == 2 {
  156. x[1+j] = uint16(b.regionIndex(pc[1]))
  157. }
  158. }
  159. paradigmLocales = append(paradigmLocales, x)
  160. }
  161. b.w.WriteVar("paradigmLocales", paradigmLocales)
  162. b.w.WriteType(mutualIntelligibility{})
  163. b.w.WriteType(scriptIntelligibility{})
  164. b.w.WriteType(regionIntelligibility{})
  165. matchLang := []mutualIntelligibility{}
  166. matchScript := []scriptIntelligibility{}
  167. matchRegion := []regionIntelligibility{}
  168. // Convert the languageMatch entries in lists keyed by desired language.
  169. for _, m := range lm[0].LanguageMatch {
  170. // Different versions of CLDR use different separators.
  171. desired := strings.Replace(m.Desired, "-", "_", -1)
  172. supported := strings.Replace(m.Supported, "-", "_", -1)
  173. d := strings.Split(desired, "_")
  174. s := strings.Split(supported, "_")
  175. if len(d) != len(s) {
  176. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  177. continue
  178. }
  179. distance, _ := strconv.ParseInt(m.Distance, 10, 8)
  180. switch len(d) {
  181. case 2:
  182. if desired == supported && desired == "*_*" {
  183. continue
  184. }
  185. // language-script pair.
  186. matchScript = append(matchScript, scriptIntelligibility{
  187. wantLang: uint16(b.langIndex(d[0])),
  188. haveLang: uint16(b.langIndex(s[0])),
  189. wantScript: uint8(b.scriptIndex(d[1])),
  190. haveScript: uint8(b.scriptIndex(s[1])),
  191. distance: uint8(distance),
  192. })
  193. if m.Oneway != "true" {
  194. matchScript = append(matchScript, scriptIntelligibility{
  195. wantLang: uint16(b.langIndex(s[0])),
  196. haveLang: uint16(b.langIndex(d[0])),
  197. wantScript: uint8(b.scriptIndex(s[1])),
  198. haveScript: uint8(b.scriptIndex(d[1])),
  199. distance: uint8(distance),
  200. })
  201. }
  202. case 1:
  203. if desired == supported && desired == "*" {
  204. continue
  205. }
  206. if distance == 1 {
  207. // nb == no is already handled by macro mapping. Check there
  208. // really is only this case.
  209. if d[0] != "no" || s[0] != "nb" {
  210. log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
  211. }
  212. continue
  213. }
  214. // TODO: consider dropping oneway field and just doubling the entry.
  215. matchLang = append(matchLang, mutualIntelligibility{
  216. want: uint16(b.langIndex(d[0])),
  217. have: uint16(b.langIndex(s[0])),
  218. distance: uint8(distance),
  219. oneway: m.Oneway == "true",
  220. })
  221. case 3:
  222. if desired == supported && desired == "*_*_*" {
  223. continue
  224. }
  225. if desired != supported {
  226. // This is now supported by CLDR, but only one case, which
  227. // should already be covered by paradigm locales. For instance,
  228. // test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
  229. // testdata/CLDRLocaleMatcherTest.txt tests this.
  230. if supported != "en_*_GB" {
  231. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  232. }
  233. continue
  234. }
  235. ri := regionIntelligibility{
  236. lang: b.langIndex(d[0]),
  237. distance: uint8(distance),
  238. }
  239. if d[1] != "*" {
  240. ri.script = uint8(b.scriptIndex(d[1]))
  241. }
  242. switch {
  243. case d[2] == "*":
  244. ri.group = 0x80 // not contained in anything
  245. case strings.HasPrefix(d[2], "$!"):
  246. ri.group = 0x80
  247. d[2] = "$" + d[2][len("$!"):]
  248. fallthrough
  249. case strings.HasPrefix(d[2], "$"):
  250. ri.group |= idToIndex[d[2]]
  251. }
  252. matchRegion = append(matchRegion, ri)
  253. default:
  254. log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
  255. }
  256. }
  257. sort.SliceStable(matchLang, func(i, j int) bool {
  258. return matchLang[i].distance < matchLang[j].distance
  259. })
  260. b.w.WriteComment(`
  261. matchLang holds pairs of langIDs of base languages that are typically
  262. mutually intelligible. Each pair is associated with a confidence and
  263. whether the intelligibility goes one or both ways.`)
  264. b.w.WriteVar("matchLang", matchLang)
  265. b.w.WriteComment(`
  266. matchScript holds pairs of scriptIDs where readers of one script
  267. can typically also read the other. Each is associated with a confidence.`)
  268. sort.SliceStable(matchScript, func(i, j int) bool {
  269. return matchScript[i].distance < matchScript[j].distance
  270. })
  271. b.w.WriteVar("matchScript", matchScript)
  272. sort.SliceStable(matchRegion, func(i, j int) bool {
  273. return matchRegion[i].distance < matchRegion[j].distance
  274. })
  275. b.w.WriteVar("matchRegion", matchRegion)
  276. }