gen.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. package main
  6. // This program generates table.go and table_test.go.
  7. // Invoke as:
  8. //
  9. // go run gen.go -version "xxx" >table.go
  10. // go run gen.go -version "xxx" -test >table_test.go
  11. //
  12. // The version is derived from information found at
  13. // http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat
  14. // which is linked from http://publicsuffix.org/list/.
  15. //
  16. // To fetch a particular hg revision, such as 05b11a8d1ace, pass
  17. // -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat"
  18. import (
  19. "bufio"
  20. "bytes"
  21. "flag"
  22. "fmt"
  23. "go/format"
  24. "io"
  25. "net/http"
  26. "os"
  27. "regexp"
  28. "sort"
  29. "strings"
  30. "code.google.com/p/go.net/idna"
  31. )
  32. const (
  33. nodesBitsChildren = 9
  34. nodesBitsICANN = 1
  35. nodesBitsTextOffset = 15
  36. nodesBitsTextLength = 6
  37. childrenBitsWildcard = 1
  38. childrenBitsNodeType = 2
  39. childrenBitsHi = 14
  40. childrenBitsLo = 14
  41. )
  42. var (
  43. maxChildren int
  44. maxTextOffset int
  45. maxTextLength int
  46. maxHi uint32
  47. maxLo uint32
  48. )
  49. func max(a, b int) int {
  50. if a < b {
  51. return b
  52. }
  53. return a
  54. }
  55. func u32max(a, b uint32) uint32 {
  56. if a < b {
  57. return b
  58. }
  59. return a
  60. }
  61. const (
  62. nodeTypeNormal = 0
  63. nodeTypeException = 1
  64. nodeTypeParentOnly = 2
  65. numNodeType = 3
  66. )
  67. func nodeTypeStr(n int) string {
  68. switch n {
  69. case nodeTypeNormal:
  70. return "+"
  71. case nodeTypeException:
  72. return "!"
  73. case nodeTypeParentOnly:
  74. return "o"
  75. }
  76. panic("unreachable")
  77. }
  78. var (
  79. labelEncoding = map[string]uint32{}
  80. labelsList = []string{}
  81. labelsMap = map[string]bool{}
  82. rules = []string{}
  83. // validSuffix is used to check that the entries in the public suffix list
  84. // are in canonical form (after Punycode encoding). Specifically, capital
  85. // letters are not allowed.
  86. validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
  87. crush = flag.Bool("crush", true, "make the generated node text as small as possible")
  88. subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
  89. url = flag.String("url",
  90. "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1",
  91. "URL of the publicsuffix.org list. If empty, stdin is read instead")
  92. v = flag.Bool("v", false, "verbose output (to stderr)")
  93. version = flag.String("version", "", "the effective_tld_names.dat version")
  94. test = flag.Bool("test", false, "generate table_test.go")
  95. )
  96. func main() {
  97. if err := main1(); err != nil {
  98. fmt.Fprintln(os.Stderr, err)
  99. os.Exit(1)
  100. }
  101. }
  102. func main1() error {
  103. flag.Parse()
  104. if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
  105. return fmt.Errorf("not enough bits to encode the nodes table")
  106. }
  107. if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
  108. return fmt.Errorf("not enough bits to encode the children table")
  109. }
  110. if *version == "" {
  111. return fmt.Errorf("-version was not specified")
  112. }
  113. var r io.Reader = os.Stdin
  114. if *url != "" {
  115. res, err := http.Get(*url)
  116. if err != nil {
  117. return err
  118. }
  119. if res.StatusCode != http.StatusOK {
  120. return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
  121. }
  122. r = res.Body
  123. defer res.Body.Close()
  124. }
  125. var root node
  126. icann := false
  127. buf := new(bytes.Buffer)
  128. br := bufio.NewReader(r)
  129. for {
  130. s, err := br.ReadString('\n')
  131. if err != nil {
  132. if err == io.EOF {
  133. break
  134. }
  135. return err
  136. }
  137. s = strings.TrimSpace(s)
  138. if strings.Contains(s, "BEGIN ICANN DOMAINS") {
  139. icann = true
  140. continue
  141. }
  142. if strings.Contains(s, "END ICANN DOMAINS") {
  143. icann = false
  144. continue
  145. }
  146. if s == "" || strings.HasPrefix(s, "//") {
  147. continue
  148. }
  149. s, err = idna.ToASCII(s)
  150. if err != nil {
  151. return err
  152. }
  153. if !validSuffix.MatchString(s) {
  154. return fmt.Errorf("bad publicsuffix.org list data: %q", s)
  155. }
  156. if *subset {
  157. switch {
  158. case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
  159. case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
  160. case s == "ao" || strings.HasSuffix(s, ".ao"):
  161. case s == "ar" || strings.HasSuffix(s, ".ar"):
  162. case s == "arpa" || strings.HasSuffix(s, ".arpa"):
  163. case s == "cy" || strings.HasSuffix(s, ".cy"):
  164. case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
  165. case s == "jp":
  166. case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
  167. case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
  168. case s == "om" || strings.HasSuffix(s, ".om"):
  169. case s == "uk" || strings.HasSuffix(s, ".uk"):
  170. case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
  171. case s == "tw" || strings.HasSuffix(s, ".tw"):
  172. case s == "zw" || strings.HasSuffix(s, ".zw"):
  173. case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
  174. // xn--p1ai is Russian-Cyrillic "рф".
  175. default:
  176. continue
  177. }
  178. }
  179. rules = append(rules, s)
  180. nt, wildcard := nodeTypeNormal, false
  181. switch {
  182. case strings.HasPrefix(s, "*."):
  183. s, nt = s[2:], nodeTypeParentOnly
  184. wildcard = true
  185. case strings.HasPrefix(s, "!"):
  186. s, nt = s[1:], nodeTypeException
  187. }
  188. labels := strings.Split(s, ".")
  189. for n, i := &root, len(labels)-1; i >= 0; i-- {
  190. label := labels[i]
  191. n = n.child(label)
  192. if i == 0 {
  193. if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
  194. n.nodeType = nt
  195. }
  196. n.icann = n.icann && icann
  197. n.wildcard = n.wildcard || wildcard
  198. }
  199. labelsMap[label] = true
  200. }
  201. }
  202. labelsList = make([]string, 0, len(labelsMap))
  203. for label := range labelsMap {
  204. labelsList = append(labelsList, label)
  205. }
  206. sort.Strings(labelsList)
  207. p := printReal
  208. if *test {
  209. p = printTest
  210. }
  211. if err := p(buf, &root); err != nil {
  212. return err
  213. }
  214. b, err := format.Source(buf.Bytes())
  215. if err != nil {
  216. return err
  217. }
  218. _, err = os.Stdout.Write(b)
  219. return err
  220. }
  221. func printTest(w io.Writer, n *node) error {
  222. fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
  223. fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
  224. for _, rule := range rules {
  225. fmt.Fprintf(w, "%q,\n", rule)
  226. }
  227. fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
  228. if err := n.walk(w, printNodeLabel); err != nil {
  229. return err
  230. }
  231. fmt.Fprintf(w, "}\n")
  232. return nil
  233. }
  234. func printReal(w io.Writer, n *node) error {
  235. const header = `// generated by go run gen.go; DO NOT EDIT
  236. package publicsuffix
  237. const version = %q
  238. const (
  239. nodesBitsChildren = %d
  240. nodesBitsICANN = %d
  241. nodesBitsTextOffset = %d
  242. nodesBitsTextLength = %d
  243. childrenBitsWildcard = %d
  244. childrenBitsNodeType = %d
  245. childrenBitsHi = %d
  246. childrenBitsLo = %d
  247. )
  248. const (
  249. nodeTypeNormal = %d
  250. nodeTypeException = %d
  251. nodeTypeParentOnly = %d
  252. )
  253. // numTLD is the number of top level domains.
  254. const numTLD = %d
  255. `
  256. fmt.Fprintf(w, header, *version,
  257. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
  258. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
  259. nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
  260. text := makeText()
  261. if text == "" {
  262. return fmt.Errorf("internal error: makeText returned no text")
  263. }
  264. for _, label := range labelsList {
  265. offset, length := strings.Index(text, label), len(label)
  266. if offset < 0 {
  267. return fmt.Errorf("internal error: could not find %q in text %q", label, text)
  268. }
  269. maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
  270. if offset >= 1<<nodesBitsTextOffset || length >= 1<<nodesBitsTextLength {
  271. return fmt.Errorf("text offset/length is too large: %d/%d", offset, length)
  272. }
  273. labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
  274. }
  275. fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
  276. for len(text) > 0 {
  277. n, plus := len(text), ""
  278. if n > 64 {
  279. n, plus = 64, " +"
  280. }
  281. fmt.Fprintf(w, "%q%s\n", text[:n], plus)
  282. text = text[n:]
  283. }
  284. n.walk(w, assignIndexes)
  285. fmt.Fprintf(w, `
  286. // nodes is the list of nodes. Each node is represented as a uint32, which
  287. // encodes the node's children, wildcard bit and node type (as an index into
  288. // the children array), ICANN bit and text.
  289. //
  290. // In the //-comment after each node's data, the nodes indexes of the children
  291. // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
  292. // nodeType is printed as + for normal, ! for exception, and o for parent-only
  293. // nodes that have children but don't match a domain label in their own right.
  294. // An I denotes an ICANN domain.
  295. //
  296. // The layout within the uint32, from MSB to LSB, is:
  297. // [%2d bits] unused
  298. // [%2d bits] children index
  299. // [%2d bits] ICANN bit
  300. // [%2d bits] text index
  301. // [%2d bits] text length
  302. var nodes = [...]uint32{
  303. `,
  304. 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
  305. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
  306. if err := n.walk(w, printNode); err != nil {
  307. return err
  308. }
  309. fmt.Fprintf(w, `}
  310. // children is the list of nodes' children, the parent's wildcard bit and the
  311. // parent's node type. If a node has no children then their children index
  312. // will be in the range [0, 6), depending on the wildcard bit and node type.
  313. //
  314. // The layout within the uint32, from MSB to LSB, is:
  315. // [%2d bits] unused
  316. // [%2d bits] wildcard bit
  317. // [%2d bits] node type
  318. // [%2d bits] high nodes index (exclusive) of children
  319. // [%2d bits] low nodes index (inclusive) of children
  320. var children=[...]uint32{
  321. `,
  322. 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
  323. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
  324. for i, c := range childrenEncoding {
  325. s := "---------------"
  326. lo := c & (1<<childrenBitsLo - 1)
  327. hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
  328. if lo != hi {
  329. s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
  330. }
  331. nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
  332. wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
  333. fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
  334. c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
  335. }
  336. fmt.Fprintf(w, "}\n\n")
  337. fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
  338. fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
  339. fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
  340. fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
  341. fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
  342. return nil
  343. }
  344. type node struct {
  345. label string
  346. nodeType int
  347. icann bool
  348. wildcard bool
  349. // nodesIndex and childrenIndex are the index of this node in the nodes
  350. // and the index of its children offset/length in the children arrays.
  351. nodesIndex, childrenIndex int
  352. // firstChild is the index of this node's first child, or zero if this
  353. // node has no children.
  354. firstChild int
  355. // children are the node's children, in strictly increasing node label order.
  356. children []*node
  357. }
  358. func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
  359. if err := f(w, n); err != nil {
  360. return err
  361. }
  362. for _, c := range n.children {
  363. if err := c.walk(w, f); err != nil {
  364. return err
  365. }
  366. }
  367. return nil
  368. }
  369. // child returns the child of n with the given label. The child is created if
  370. // it did not exist beforehand.
  371. func (n *node) child(label string) *node {
  372. for _, c := range n.children {
  373. if c.label == label {
  374. return c
  375. }
  376. }
  377. c := &node{
  378. label: label,
  379. nodeType: nodeTypeParentOnly,
  380. icann: true,
  381. }
  382. n.children = append(n.children, c)
  383. sort.Sort(byLabel(n.children))
  384. return c
  385. }
  386. type byLabel []*node
  387. func (b byLabel) Len() int { return len(b) }
  388. func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
  389. func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
  390. var nextNodesIndex int
  391. // childrenEncoding are the encoded entries in the generated children array.
  392. // All these pre-defined entries have no children.
  393. var childrenEncoding = []uint32{
  394. 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
  395. 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
  396. 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
  397. 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
  398. 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
  399. 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
  400. }
  401. var firstCallToAssignIndexes = true
  402. func assignIndexes(w io.Writer, n *node) error {
  403. if len(n.children) != 0 {
  404. // Assign nodesIndex.
  405. n.firstChild = nextNodesIndex
  406. for _, c := range n.children {
  407. c.nodesIndex = nextNodesIndex
  408. nextNodesIndex++
  409. }
  410. // The root node's children is implicit.
  411. if firstCallToAssignIndexes {
  412. firstCallToAssignIndexes = false
  413. return nil
  414. }
  415. // Assign childrenIndex.
  416. maxChildren = max(maxChildren, len(childrenEncoding))
  417. if len(childrenEncoding) >= 1<<nodesBitsChildren {
  418. return fmt.Errorf("children table is too large")
  419. }
  420. n.childrenIndex = len(childrenEncoding)
  421. lo := uint32(n.firstChild)
  422. hi := lo + uint32(len(n.children))
  423. maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
  424. if lo >= 1<<childrenBitsLo || hi >= 1<<childrenBitsHi {
  425. return fmt.Errorf("children lo/hi is too large: %d/%d", lo, hi)
  426. }
  427. enc := hi<<childrenBitsLo | lo
  428. enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
  429. if n.wildcard {
  430. enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
  431. }
  432. childrenEncoding = append(childrenEncoding, enc)
  433. } else {
  434. n.childrenIndex = n.nodeType
  435. if n.wildcard {
  436. n.childrenIndex += numNodeType
  437. }
  438. }
  439. return nil
  440. }
  441. func printNode(w io.Writer, n *node) error {
  442. for _, c := range n.children {
  443. s := "---------------"
  444. if len(c.children) != 0 {
  445. s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
  446. }
  447. encoding := labelEncoding[c.label]
  448. if c.icann {
  449. encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
  450. }
  451. encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
  452. fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
  453. encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
  454. nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
  455. )
  456. }
  457. return nil
  458. }
  459. func printNodeLabel(w io.Writer, n *node) error {
  460. for _, c := range n.children {
  461. fmt.Fprintf(w, "%q,\n", c.label)
  462. }
  463. return nil
  464. }
  465. func icannStr(icann bool) string {
  466. if icann {
  467. return "I"
  468. }
  469. return " "
  470. }
  471. func wildcardStr(wildcard bool) string {
  472. if wildcard {
  473. return "*"
  474. }
  475. return " "
  476. }
  477. // makeText combines all the strings in labelsList to form one giant string.
  478. // If the crush flag is true, then overlapping strings will be merged: "arpa"
  479. // and "parliament" could yield "arparliament".
  480. func makeText() string {
  481. if !*crush {
  482. return strings.Join(labelsList, "")
  483. }
  484. beforeLength := 0
  485. for _, s := range labelsList {
  486. beforeLength += len(s)
  487. }
  488. // Make a copy of labelsList.
  489. ss := append(make([]string, 0, len(labelsList)), labelsList...)
  490. // Remove strings that are substrings of other strings.
  491. for changed := true; changed; {
  492. changed = false
  493. for i, s := range ss {
  494. if s == "" {
  495. continue
  496. }
  497. for j, t := range ss {
  498. if i != j && t != "" && strings.Contains(s, t) {
  499. changed = true
  500. ss[j] = ""
  501. }
  502. }
  503. }
  504. }
  505. // Remove the empty strings.
  506. sort.Strings(ss)
  507. for len(ss) > 0 && ss[0] == "" {
  508. ss = ss[1:]
  509. }
  510. // Join strings where one suffix matches another prefix.
  511. for {
  512. // Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
  513. // maximizing overlap length k.
  514. besti := -1
  515. bestj := -1
  516. bestk := 0
  517. for i, s := range ss {
  518. if s == "" {
  519. continue
  520. }
  521. for j, t := range ss {
  522. if i == j {
  523. continue
  524. }
  525. for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
  526. if s[len(s)-k:] == t[:k] {
  527. besti = i
  528. bestj = j
  529. bestk = k
  530. }
  531. }
  532. }
  533. }
  534. if bestk > 0 {
  535. if *v {
  536. fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
  537. bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
  538. }
  539. ss[besti] += ss[bestj][bestk:]
  540. ss[bestj] = ""
  541. continue
  542. }
  543. break
  544. }
  545. text := strings.Join(ss, "")
  546. if *v {
  547. fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
  548. }
  549. return text
  550. }