maketables.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build ignore
  5. // +build ignore
  6. package main
  7. import (
  8. "bufio"
  9. "fmt"
  10. "log"
  11. "net/http"
  12. "sort"
  13. "strings"
  14. "unicode/utf8"
  15. "golang.org/x/text/encoding"
  16. "golang.org/x/text/internal/gen"
  17. )
  18. const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
  19. "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
  20. ` !"#$%&'()*+,-./0123456789:;<=>?` +
  21. `@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
  22. "`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
  23. var encodings = []struct {
  24. name string
  25. mib string
  26. comment string
  27. varName string
  28. replacement byte
  29. mapping string
  30. }{
  31. {
  32. "IBM Code Page 037",
  33. "IBM037",
  34. "",
  35. "CodePage037",
  36. 0x3f,
  37. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
  38. },
  39. {
  40. "IBM Code Page 437",
  41. "PC8CodePage437",
  42. "",
  43. "CodePage437",
  44. encoding.ASCIISub,
  45. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
  46. },
  47. {
  48. "IBM Code Page 850",
  49. "PC850Multilingual",
  50. "",
  51. "CodePage850",
  52. encoding.ASCIISub,
  53. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
  54. },
  55. {
  56. "IBM Code Page 852",
  57. "PCp852",
  58. "",
  59. "CodePage852",
  60. encoding.ASCIISub,
  61. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
  62. },
  63. {
  64. "IBM Code Page 855",
  65. "IBM855",
  66. "",
  67. "CodePage855",
  68. encoding.ASCIISub,
  69. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
  70. },
  71. {
  72. "Windows Code Page 858", // PC latin1 with Euro
  73. "IBM00858",
  74. "",
  75. "CodePage858",
  76. encoding.ASCIISub,
  77. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
  78. },
  79. {
  80. "IBM Code Page 860",
  81. "IBM860",
  82. "",
  83. "CodePage860",
  84. encoding.ASCIISub,
  85. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
  86. },
  87. {
  88. "IBM Code Page 862",
  89. "PC862LatinHebrew",
  90. "",
  91. "CodePage862",
  92. encoding.ASCIISub,
  93. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
  94. },
  95. {
  96. "IBM Code Page 863",
  97. "IBM863",
  98. "",
  99. "CodePage863",
  100. encoding.ASCIISub,
  101. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
  102. },
  103. {
  104. "IBM Code Page 865",
  105. "IBM865",
  106. "",
  107. "CodePage865",
  108. encoding.ASCIISub,
  109. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
  110. },
  111. {
  112. "IBM Code Page 866",
  113. "IBM866",
  114. "",
  115. "CodePage866",
  116. encoding.ASCIISub,
  117. "http://encoding.spec.whatwg.org/index-ibm866.txt",
  118. },
  119. {
  120. "IBM Code Page 1047",
  121. "IBM1047",
  122. "",
  123. "CodePage1047",
  124. 0x3f,
  125. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
  126. },
  127. {
  128. "IBM Code Page 1140",
  129. "IBM01140",
  130. "",
  131. "CodePage1140",
  132. 0x3f,
  133. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
  134. },
  135. {
  136. "ISO 8859-1",
  137. "ISOLatin1",
  138. "",
  139. "ISO8859_1",
  140. encoding.ASCIISub,
  141. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
  142. },
  143. {
  144. "ISO 8859-2",
  145. "ISOLatin2",
  146. "",
  147. "ISO8859_2",
  148. encoding.ASCIISub,
  149. "http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
  150. },
  151. {
  152. "ISO 8859-3",
  153. "ISOLatin3",
  154. "",
  155. "ISO8859_3",
  156. encoding.ASCIISub,
  157. "http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
  158. },
  159. {
  160. "ISO 8859-4",
  161. "ISOLatin4",
  162. "",
  163. "ISO8859_4",
  164. encoding.ASCIISub,
  165. "http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
  166. },
  167. {
  168. "ISO 8859-5",
  169. "ISOLatinCyrillic",
  170. "",
  171. "ISO8859_5",
  172. encoding.ASCIISub,
  173. "http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
  174. },
  175. {
  176. "ISO 8859-6",
  177. "ISOLatinArabic",
  178. "",
  179. "ISO8859_6,ISO8859_6E,ISO8859_6I",
  180. encoding.ASCIISub,
  181. "http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
  182. },
  183. {
  184. "ISO 8859-7",
  185. "ISOLatinGreek",
  186. "",
  187. "ISO8859_7",
  188. encoding.ASCIISub,
  189. "http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
  190. },
  191. {
  192. "ISO 8859-8",
  193. "ISOLatinHebrew",
  194. "",
  195. "ISO8859_8,ISO8859_8E,ISO8859_8I",
  196. encoding.ASCIISub,
  197. "http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
  198. },
  199. {
  200. "ISO 8859-9",
  201. "ISOLatin5",
  202. "",
  203. "ISO8859_9",
  204. encoding.ASCIISub,
  205. "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
  206. },
  207. {
  208. "ISO 8859-10",
  209. "ISOLatin6",
  210. "",
  211. "ISO8859_10",
  212. encoding.ASCIISub,
  213. "http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
  214. },
  215. {
  216. "ISO 8859-13",
  217. "ISO885913",
  218. "",
  219. "ISO8859_13",
  220. encoding.ASCIISub,
  221. "http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
  222. },
  223. {
  224. "ISO 8859-14",
  225. "ISO885914",
  226. "",
  227. "ISO8859_14",
  228. encoding.ASCIISub,
  229. "http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
  230. },
  231. {
  232. "ISO 8859-15",
  233. "ISO885915",
  234. "",
  235. "ISO8859_15",
  236. encoding.ASCIISub,
  237. "http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
  238. },
  239. {
  240. "ISO 8859-16",
  241. "ISO885916",
  242. "",
  243. "ISO8859_16",
  244. encoding.ASCIISub,
  245. "http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
  246. },
  247. {
  248. "KOI8-R",
  249. "KOI8R",
  250. "",
  251. "KOI8R",
  252. encoding.ASCIISub,
  253. "http://encoding.spec.whatwg.org/index-koi8-r.txt",
  254. },
  255. {
  256. "KOI8-U",
  257. "KOI8U",
  258. "",
  259. "KOI8U",
  260. encoding.ASCIISub,
  261. "http://encoding.spec.whatwg.org/index-koi8-u.txt",
  262. },
  263. {
  264. "Macintosh",
  265. "Macintosh",
  266. "",
  267. "Macintosh",
  268. encoding.ASCIISub,
  269. "http://encoding.spec.whatwg.org/index-macintosh.txt",
  270. },
  271. {
  272. "Macintosh Cyrillic",
  273. "MacintoshCyrillic",
  274. "",
  275. "MacintoshCyrillic",
  276. encoding.ASCIISub,
  277. "http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
  278. },
  279. {
  280. "Windows 874",
  281. "Windows874",
  282. "",
  283. "Windows874",
  284. encoding.ASCIISub,
  285. "http://encoding.spec.whatwg.org/index-windows-874.txt",
  286. },
  287. {
  288. "Windows 1250",
  289. "Windows1250",
  290. "",
  291. "Windows1250",
  292. encoding.ASCIISub,
  293. "http://encoding.spec.whatwg.org/index-windows-1250.txt",
  294. },
  295. {
  296. "Windows 1251",
  297. "Windows1251",
  298. "",
  299. "Windows1251",
  300. encoding.ASCIISub,
  301. "http://encoding.spec.whatwg.org/index-windows-1251.txt",
  302. },
  303. {
  304. "Windows 1252",
  305. "Windows1252",
  306. "",
  307. "Windows1252",
  308. encoding.ASCIISub,
  309. "http://encoding.spec.whatwg.org/index-windows-1252.txt",
  310. },
  311. {
  312. "Windows 1253",
  313. "Windows1253",
  314. "",
  315. "Windows1253",
  316. encoding.ASCIISub,
  317. "http://encoding.spec.whatwg.org/index-windows-1253.txt",
  318. },
  319. {
  320. "Windows 1254",
  321. "Windows1254",
  322. "",
  323. "Windows1254",
  324. encoding.ASCIISub,
  325. "http://encoding.spec.whatwg.org/index-windows-1254.txt",
  326. },
  327. {
  328. "Windows 1255",
  329. "Windows1255",
  330. "",
  331. "Windows1255",
  332. encoding.ASCIISub,
  333. "http://encoding.spec.whatwg.org/index-windows-1255.txt",
  334. },
  335. {
  336. "Windows 1256",
  337. "Windows1256",
  338. "",
  339. "Windows1256",
  340. encoding.ASCIISub,
  341. "http://encoding.spec.whatwg.org/index-windows-1256.txt",
  342. },
  343. {
  344. "Windows 1257",
  345. "Windows1257",
  346. "",
  347. "Windows1257",
  348. encoding.ASCIISub,
  349. "http://encoding.spec.whatwg.org/index-windows-1257.txt",
  350. },
  351. {
  352. "Windows 1258",
  353. "Windows1258",
  354. "",
  355. "Windows1258",
  356. encoding.ASCIISub,
  357. "http://encoding.spec.whatwg.org/index-windows-1258.txt",
  358. },
  359. {
  360. "X-User-Defined",
  361. "XUserDefined",
  362. "It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
  363. "XUserDefined",
  364. encoding.ASCIISub,
  365. ascii +
  366. "\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
  367. "\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
  368. "\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
  369. "\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
  370. "\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
  371. "\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
  372. "\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
  373. "\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
  374. "\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
  375. "\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
  376. "\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
  377. "\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
  378. "\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
  379. "\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
  380. "\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
  381. "\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
  382. },
  383. }
  384. func getWHATWG(url string) string {
  385. res, err := http.Get(url)
  386. if err != nil {
  387. log.Fatalf("%q: Get: %v", url, err)
  388. }
  389. defer res.Body.Close()
  390. mapping := make([]rune, 128)
  391. for i := range mapping {
  392. mapping[i] = '\ufffd'
  393. }
  394. scanner := bufio.NewScanner(res.Body)
  395. for scanner.Scan() {
  396. s := strings.TrimSpace(scanner.Text())
  397. if s == "" || s[0] == '#' {
  398. continue
  399. }
  400. x, y := 0, 0
  401. if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
  402. log.Fatalf("could not parse %q", s)
  403. }
  404. if x < 0 || 128 <= x {
  405. log.Fatalf("code %d is out of range", x)
  406. }
  407. if 0x80 <= y && y < 0xa0 {
  408. // We diverge from the WHATWG spec by mapping control characters
  409. // in the range [0x80, 0xa0) to U+FFFD.
  410. continue
  411. }
  412. mapping[x] = rune(y)
  413. }
  414. return ascii + string(mapping)
  415. }
  416. func getUCM(url string) string {
  417. res, err := http.Get(url)
  418. if err != nil {
  419. log.Fatalf("%q: Get: %v", url, err)
  420. }
  421. defer res.Body.Close()
  422. mapping := make([]rune, 256)
  423. for i := range mapping {
  424. mapping[i] = '\ufffd'
  425. }
  426. charsFound := 0
  427. scanner := bufio.NewScanner(res.Body)
  428. for scanner.Scan() {
  429. s := strings.TrimSpace(scanner.Text())
  430. if s == "" || s[0] == '#' {
  431. continue
  432. }
  433. var c byte
  434. var r rune
  435. if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
  436. continue
  437. }
  438. mapping[c] = r
  439. charsFound++
  440. }
  441. if charsFound < 200 {
  442. log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
  443. }
  444. return string(mapping)
  445. }
  446. func main() {
  447. mibs := map[string]bool{}
  448. all := []string{}
  449. w := gen.NewCodeWriter()
  450. defer w.WriteGoFile("tables.go", "charmap")
  451. printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
  452. printf("import (\n")
  453. printf("\t\"golang.org/x/text/encoding\"\n")
  454. printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
  455. printf(")\n\n")
  456. for _, e := range encodings {
  457. varNames := strings.Split(e.varName, ",")
  458. all = append(all, varNames...)
  459. varName := varNames[0]
  460. switch {
  461. case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
  462. e.mapping = getWHATWG(e.mapping)
  463. case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
  464. e.mapping = getUCM(e.mapping)
  465. }
  466. asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
  467. if asciiSuperset {
  468. low = 0x80
  469. }
  470. lvn := 1
  471. if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
  472. lvn = 3
  473. }
  474. lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
  475. printf("// %s is the %s encoding.\n", varName, e.name)
  476. if e.comment != "" {
  477. printf("//\n// %s\n", e.comment)
  478. }
  479. printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
  480. varName, lowerVarName, lowerVarName, e.name)
  481. if mibs[e.mib] {
  482. log.Fatalf("MIB type %q declared multiple times.", e.mib)
  483. }
  484. printf("mib: identifier.%s,\n", e.mib)
  485. printf("asciiSuperset: %t,\n", asciiSuperset)
  486. printf("low: 0x%02x,\n", low)
  487. printf("replacement: 0x%02x,\n", e.replacement)
  488. printf("decode: [256]utf8Enc{\n")
  489. i, backMapping := 0, map[rune]byte{}
  490. for _, c := range e.mapping {
  491. if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
  492. backMapping[c] = byte(i)
  493. }
  494. var buf [8]byte
  495. n := utf8.EncodeRune(buf[:], c)
  496. if n > 3 {
  497. panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
  498. }
  499. printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
  500. if i%2 == 1 {
  501. printf("\n")
  502. }
  503. i++
  504. }
  505. printf("},\n")
  506. printf("encode: [256]uint32{\n")
  507. encode := make([]uint32, 0, 256)
  508. for c, i := range backMapping {
  509. encode = append(encode, uint32(i)<<24|uint32(c))
  510. }
  511. sort.Sort(byRune(encode))
  512. for len(encode) < cap(encode) {
  513. encode = append(encode, encode[len(encode)-1])
  514. }
  515. for i, enc := range encode {
  516. printf("0x%08x,", enc)
  517. if i%8 == 7 {
  518. printf("\n")
  519. }
  520. }
  521. printf("},\n}\n")
  522. // Add an estimate of the size of a single Charmap{} struct value, which
  523. // includes two 256 elem arrays of 4 bytes and some extra fields, which
  524. // align to 3 uint64s on 64-bit architectures.
  525. w.Size += 2*4*256 + 3*8
  526. }
  527. // TODO: add proper line breaking.
  528. printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
  529. }
  530. type byRune []uint32
  531. func (b byRune) Len() int { return len(b) }
  532. func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
  533. func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }