gen.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. package main
  7. import (
  8. "bytes"
  9. "encoding/json"
  10. "fmt"
  11. "log"
  12. "strings"
  13. "golang.org/x/text/internal/gen"
  14. )
  15. type group struct {
  16. Encodings []struct {
  17. Labels []string
  18. Name string
  19. }
  20. }
  21. func main() {
  22. gen.Init()
  23. r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
  24. var groups []group
  25. if err := json.NewDecoder(r).Decode(&groups); err != nil {
  26. log.Fatalf("Error reading encodings.json: %v", err)
  27. }
  28. w := &bytes.Buffer{}
  29. fmt.Fprintln(w, "type htmlEncoding byte")
  30. fmt.Fprintln(w, "const (")
  31. for i, g := range groups {
  32. for _, e := range g.Encodings {
  33. key := strings.ToLower(e.Name)
  34. name := consts[key]
  35. if name == "" {
  36. log.Fatalf("No const defined for %s.", key)
  37. }
  38. if i == 0 {
  39. fmt.Fprintf(w, "%s htmlEncoding = iota\n", name)
  40. } else {
  41. fmt.Fprintf(w, "%s\n", name)
  42. }
  43. }
  44. }
  45. fmt.Fprintln(w, "numEncodings")
  46. fmt.Fprint(w, ")\n\n")
  47. fmt.Fprintln(w, "var canonical = [numEncodings]string{")
  48. for _, g := range groups {
  49. for _, e := range g.Encodings {
  50. fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name))
  51. }
  52. }
  53. fmt.Fprint(w, "}\n\n")
  54. fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{")
  55. for _, g := range groups {
  56. for _, e := range g.Encodings {
  57. for _, l := range e.Labels {
  58. key := strings.ToLower(e.Name)
  59. name := consts[key]
  60. fmt.Fprintf(w, "%q: %s,\n", l, name)
  61. }
  62. }
  63. }
  64. fmt.Fprint(w, "}\n\n")
  65. var tags []string
  66. fmt.Fprintln(w, "var localeMap = []htmlEncoding{")
  67. for _, loc := range locales {
  68. tags = append(tags, loc.tag)
  69. fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag)
  70. }
  71. fmt.Fprint(w, "}\n\n")
  72. fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " "))
  73. gen.WriteGoFile("tables.go", "htmlindex", w.Bytes())
  74. }
  75. // consts maps canonical encoding name to internal constant.
  76. var consts = map[string]string{
  77. "utf-8": "utf8",
  78. "ibm866": "ibm866",
  79. "iso-8859-2": "iso8859_2",
  80. "iso-8859-3": "iso8859_3",
  81. "iso-8859-4": "iso8859_4",
  82. "iso-8859-5": "iso8859_5",
  83. "iso-8859-6": "iso8859_6",
  84. "iso-8859-7": "iso8859_7",
  85. "iso-8859-8": "iso8859_8",
  86. "iso-8859-8-i": "iso8859_8I",
  87. "iso-8859-10": "iso8859_10",
  88. "iso-8859-13": "iso8859_13",
  89. "iso-8859-14": "iso8859_14",
  90. "iso-8859-15": "iso8859_15",
  91. "iso-8859-16": "iso8859_16",
  92. "koi8-r": "koi8r",
  93. "koi8-u": "koi8u",
  94. "macintosh": "macintosh",
  95. "windows-874": "windows874",
  96. "windows-1250": "windows1250",
  97. "windows-1251": "windows1251",
  98. "windows-1252": "windows1252",
  99. "windows-1253": "windows1253",
  100. "windows-1254": "windows1254",
  101. "windows-1255": "windows1255",
  102. "windows-1256": "windows1256",
  103. "windows-1257": "windows1257",
  104. "windows-1258": "windows1258",
  105. "x-mac-cyrillic": "macintoshCyrillic",
  106. "gbk": "gbk",
  107. "gb18030": "gb18030",
  108. // "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
  109. "big5": "big5",
  110. "euc-jp": "eucjp",
  111. "iso-2022-jp": "iso2022jp",
  112. "shift_jis": "shiftJIS",
  113. "euc-kr": "euckr",
  114. "replacement": "replacement",
  115. "utf-16be": "utf16be",
  116. "utf-16le": "utf16le",
  117. "x-user-defined": "xUserDefined",
  118. }
  119. // locales is taken from
  120. // https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
  121. var locales = []struct{ tag, name string }{
  122. // The default value. Explicitly state latin to benefit from the exact
  123. // script option, while still making 1252 the default encoding for languages
  124. // written in Latin script.
  125. {"und_Latn", "windows-1252"},
  126. {"ar", "windows-1256"},
  127. {"ba", "windows-1251"},
  128. {"be", "windows-1251"},
  129. {"bg", "windows-1251"},
  130. {"cs", "windows-1250"},
  131. {"el", "iso-8859-7"},
  132. {"et", "windows-1257"},
  133. {"fa", "windows-1256"},
  134. {"he", "windows-1255"},
  135. {"hr", "windows-1250"},
  136. {"hu", "iso-8859-2"},
  137. {"ja", "shift_jis"},
  138. {"kk", "windows-1251"},
  139. {"ko", "euc-kr"},
  140. {"ku", "windows-1254"},
  141. {"ky", "windows-1251"},
  142. {"lt", "windows-1257"},
  143. {"lv", "windows-1257"},
  144. {"mk", "windows-1251"},
  145. {"pl", "iso-8859-2"},
  146. {"ru", "windows-1251"},
  147. {"sah", "windows-1251"},
  148. {"sk", "windows-1250"},
  149. {"sl", "iso-8859-2"},
  150. {"sr", "windows-1251"},
  151. {"tg", "windows-1251"},
  152. {"th", "windows-874"},
  153. {"tr", "windows-1254"},
  154. {"tt", "windows-1251"},
  155. {"uk", "windows-1251"},
  156. {"vi", "windows-1258"},
  157. {"zh-hans", "gb18030"},
  158. {"zh-hant", "big5"},
  159. }