|
@@ -0,0 +1,439 @@
|
|
|
|
|
+// Copyright 2012 The Go Authors. All rights reserved.
|
|
|
|
|
+// Use of this source code is governed by a BSD-style
|
|
|
|
|
+// license that can be found in the LICENSE file.
|
|
|
|
|
+
|
|
|
|
|
+// +build ignore
|
|
|
|
|
+
|
|
|
|
|
+package main
|
|
|
|
|
+
|
|
|
|
|
+// This program generates table.go and table_test.go.
|
|
|
|
|
+// Invoke as:
|
|
|
|
|
+//
|
|
|
|
|
+// go run gen.go -version "xxx" >table.go
|
|
|
|
|
+// go run gen.go -version "xxx" -test >table_test.go
|
|
|
|
|
+//
|
|
|
|
|
+// The version is derived from information found at
|
|
|
|
|
+// http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat
|
|
|
|
|
+// which is linked from http://publicsuffix.org/list/.
|
|
|
|
|
+//
|
|
|
|
|
+// To fetch a particular hg revision, such as 05b11a8d1ace, pass
|
|
|
|
|
+// -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat"
|
|
|
|
|
+
|
|
|
|
|
+// TODO(nigeltao): decide what to do with non-ASCII entries.
|
|
|
|
|
+
|
|
|
|
|
+import (
|
|
|
|
|
+ "bufio"
|
|
|
|
|
+ "bytes"
|
|
|
|
|
+ "flag"
|
|
|
|
|
+ "fmt"
|
|
|
|
|
+ "go/format"
|
|
|
|
|
+ "io"
|
|
|
|
|
+ "net/http"
|
|
|
|
|
+ "os"
|
|
|
|
|
+ "sort"
|
|
|
|
|
+ "strings"
|
|
|
|
|
+)
|
|
|
|
|
+
|
|
|
|
|
// Node types. printReal writes these numeric values into the header of the
// generated table, so they are spelled out explicitly.
const (
	nodeTypeNormal     = 0
	nodeTypeException  = 1
	nodeTypeParentOnly = 2
)

// nodeTypeString returns the one-character marker used for a node type in
// the comments of the generated table: "+" for normal, "!" for exception and
// "o" for parent-only. It panics with "unreachable" for any other value.
func nodeTypeString(n int) string {
	markers := [...]string{
		nodeTypeNormal:     "+",
		nodeTypeException:  "!",
		nodeTypeParentOnly: "o",
	}
	if n < 0 || n >= len(markers) {
		panic("unreachable")
	}
	return markers[n]
}
|
|
|
|
|
+
|
|
|
|
|
// Generator state shared between the parsing and the printing phases.
var (
	// labelEncoding maps a label to its packed position in the combined
	// text: offset<<8 | length.
	labelEncoding = make(map[string]uint32)
	// labelsList holds every distinct label, sorted.
	labelsList = make([]string, 0)
	// labelsMap is the set of labels seen while parsing the rules.
	labelsMap = make(map[string]bool)
	// rules holds the accepted rule lines, in input order.
	rules = make([]string, 0)
)

// Command-line flags.
var (
	crush = flag.Bool(
		"crush",
		true,
		"make the generated node text as small as possible",
	)
	subset = flag.Bool(
		"subset",
		false,
		"generate only a subset of the full table, for debugging",
	)
	url = flag.String(
		"url",
		"http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1",
		"URL of the publicsuffix.org list. If empty, stdin is read instead",
	)
	v = flag.Bool(
		"v",
		false,
		"verbose output (to stderr)",
	)
	version = flag.String(
		"version",
		"",
		"the effective_tld_names.dat version",
	)
	test = flag.Bool(
		"test",
		false,
		"generate table_test.go",
	)
)
|
|
|
|
|
+
|
|
|
|
|
+func main() {
|
|
|
|
|
+ if err := main1(); err != nil {
|
|
|
|
|
+ fmt.Fprintln(os.Stderr, err)
|
|
|
|
|
+ os.Exit(1)
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func main1() error {
|
|
|
|
|
+ flag.Parse()
|
|
|
|
|
+ if *version == "" {
|
|
|
|
|
+ return fmt.Errorf("-version was not specified")
|
|
|
|
|
+ }
|
|
|
|
|
+ var r io.Reader = os.Stdin
|
|
|
|
|
+ if *url != "" {
|
|
|
|
|
+ res, err := http.Get(*url)
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ if res.StatusCode != http.StatusOK {
|
|
|
|
|
+ return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
|
|
|
|
|
+ }
|
|
|
|
|
+ r = res.Body
|
|
|
|
|
+ defer res.Body.Close()
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ var root node
|
|
|
|
|
+ buf := new(bytes.Buffer)
|
|
|
|
|
+ br := bufio.NewReader(r)
|
|
|
|
|
+ for {
|
|
|
|
|
+ s, err := br.ReadString('\n')
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ if err == io.EOF {
|
|
|
|
|
+ break
|
|
|
|
|
+ }
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ s = strings.TrimSpace(s)
|
|
|
|
|
+ if s == "" || strings.HasPrefix(s, "//") || !isASCII(s) {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if *subset {
|
|
|
|
|
+ switch {
|
|
|
|
|
+ case s == "ao" || strings.HasSuffix(s, ".ao"):
|
|
|
|
|
+ case s == "ar" || strings.HasSuffix(s, ".ar"):
|
|
|
|
|
+ case s == "arpa" || strings.HasSuffix(s, ".arpa"):
|
|
|
|
|
+ case s == "jp":
|
|
|
|
|
+ case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
|
|
|
|
|
+ case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
|
|
|
|
|
+ case s == "uk" || strings.HasSuffix(s, ".uk"):
|
|
|
|
|
+ case s == "zw" || strings.HasSuffix(s, ".zw"):
|
|
|
|
|
+ default:
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ rules = append(rules, s)
|
|
|
|
|
+
|
|
|
|
|
+ nt, wildcard := nodeTypeNormal, false
|
|
|
|
|
+ switch {
|
|
|
|
|
+ case strings.HasPrefix(s, "*."):
|
|
|
|
|
+ s, nt = s[2:], nodeTypeParentOnly
|
|
|
|
|
+ wildcard = true
|
|
|
|
|
+ case strings.HasPrefix(s, "!"):
|
|
|
|
|
+ s, nt = s[1:], nodeTypeException
|
|
|
|
|
+ }
|
|
|
|
|
+ labels := strings.Split(s, ".")
|
|
|
|
|
+ for n, i := &root, len(labels)-1; i >= 0; i-- {
|
|
|
|
|
+ label := labels[i]
|
|
|
|
|
+ n = n.child(label)
|
|
|
|
|
+ if i == 0 {
|
|
|
|
|
+ if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
|
|
|
|
|
+ n.nodeType = nt
|
|
|
|
|
+ }
|
|
|
|
|
+ n.wildcard = n.wildcard || wildcard
|
|
|
|
|
+ }
|
|
|
|
|
+ labelsMap[label] = true
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ labelsList = make([]string, 0, len(labelsMap))
|
|
|
|
|
+ for label := range labelsMap {
|
|
|
|
|
+ labelsList = append(labelsList, label)
|
|
|
|
|
+ }
|
|
|
|
|
+ sort.Strings(labelsList)
|
|
|
|
|
+
|
|
|
|
|
+ p := printReal
|
|
|
|
|
+ if *test {
|
|
|
|
|
+ p = printTest
|
|
|
|
|
+ }
|
|
|
|
|
+ if err := p(buf, &root); err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ b, err := format.Source(buf.Bytes())
|
|
|
|
|
+ if err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ _, err = os.Stdout.Write(b)
|
|
|
|
|
+ return err
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
// isASCII reports whether every byte of s lies in the range [32, 127].
// The empty string satisfies this trivially.
func isASCII(s string) bool {
	for _, b := range []byte(s) {
		if b < 32 || b > 127 {
			return false
		}
	}
	return true
}
|
|
|
|
|
+
|
|
|
|
|
+func printTest(w io.Writer, n *node) error {
|
|
|
|
|
+ fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
|
|
|
|
|
+ fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
|
|
|
|
|
+ for _, rule := range rules {
|
|
|
|
|
+ fmt.Fprintf(w, "%q,\n", rule)
|
|
|
|
|
+ }
|
|
|
|
|
+ fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
|
|
|
|
|
+ if err := n.walk(w, printNodeLabel); err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ fmt.Fprintf(w, "}\n")
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func printReal(w io.Writer, n *node) error {
|
|
|
|
|
+ const header = `// generated by go run gen.go; DO NOT EDIT
|
|
|
|
|
+
|
|
|
|
|
+package publicsuffix
|
|
|
|
|
+
|
|
|
|
|
+const version = %q
|
|
|
|
|
+
|
|
|
|
|
+const (
|
|
|
|
|
+ nodeTypeNormal = %d
|
|
|
|
|
+ nodeTypeException = %d
|
|
|
|
|
+ nodeTypeParentOnly = %d
|
|
|
|
|
+)
|
|
|
|
|
+
|
|
|
|
|
+// numTLD is the number of top level domains.
|
|
|
|
|
+const numTLD = %d
|
|
|
|
|
+
|
|
|
|
|
+`
|
|
|
|
|
+ fmt.Fprintf(w, header, *version, nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
|
|
|
|
|
+
|
|
|
|
|
+ text := makeText()
|
|
|
|
|
+ if text == "" {
|
|
|
|
|
+ return fmt.Errorf("internal error: makeText returned no text")
|
|
|
|
|
+ }
|
|
|
|
|
+ for _, label := range labelsList {
|
|
|
|
|
+ offset, length := strings.Index(text, label), len(label)
|
|
|
|
|
+ if offset < 0 {
|
|
|
|
|
+ return fmt.Errorf("internal error: could not find %q in text %q", label, text)
|
|
|
|
|
+ }
|
|
|
|
|
+ if offset >= 1<<24 || length >= 1<<8 {
|
|
|
|
|
+ return fmt.Errorf("text offset/length is too large: %d/%d", offset, length)
|
|
|
|
|
+ }
|
|
|
|
|
+ labelEncoding[label] = uint32(offset)<<8 | uint32(length)
|
|
|
|
|
+ }
|
|
|
|
|
+ fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
|
|
|
|
|
+ for len(text) > 0 {
|
|
|
|
|
+ n, plus := len(text), ""
|
|
|
|
|
+ if n > 64 {
|
|
|
|
|
+ n, plus = 64, " +"
|
|
|
|
|
+ }
|
|
|
|
|
+ fmt.Fprintf(w, "%q%s\n", text[:n], plus)
|
|
|
|
|
+ text = text[n:]
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ n.walk(w, assignNodeIndexes)
|
|
|
|
|
+
|
|
|
|
|
+ fmt.Fprintf(w, `
|
|
|
|
|
+
|
|
|
|
|
+// Nodes is the list of nodes. Each node is encoded as two uint32 values.
|
|
|
|
|
+//
|
|
|
|
|
+// The first uint32 encodes the node's children, nodeType, and a wildcard bit.
|
|
|
|
|
+// In the //-comment after each node's data, the indexes of the children are
|
|
|
|
|
+// formatted as (0x1234-0x1256). The nodeType is printed as + for normal, ! for
|
|
|
|
|
+// exception, and o for parent-only nodes that have children but don't match a
|
|
|
|
|
+// domain in their own right. The * denotes the wildcard bit. The layout within
|
|
|
|
|
+// the uint32, from MSB to LSB, is:
|
|
|
|
|
+// [2] nodeType [1] wildcard [13] number of children [16] first child.
|
|
|
|
|
+// If a node has no children then the low 29 bits are zero.
|
|
|
|
|
+//
|
|
|
|
|
+// The second uint32 encodes the node's text. The layout is:
|
|
|
|
|
+// [24] text offset [8] text length.
|
|
|
|
|
+//
|
|
|
|
|
+// TODO(nigeltao): this table has a lot of zeroes, for childless nodes. It
|
|
|
|
|
+// would be tight, but it should be possible to use only 32 bits per node
|
|
|
|
|
+// instead of 64, with an offset into a parent-child table. A back-of-the-
|
|
|
|
|
+// envelope calculation suggests that at 6000 rows (of which 90%% are leaves),
|
|
|
|
|
+// this could save an extra 20KiB of data.
|
|
|
|
|
+var nodes = [...][2]uint32{
|
|
|
|
|
+`)
|
|
|
|
|
+ if err := n.walk(w, printNode); err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ fmt.Fprintf(w, "}\n")
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
// node is one label in the tree of public suffix rules.
type node struct {
	// label is the domain label this node stands for.
	label string
	// nodeType is one of the nodeTypeXxx values.
	nodeType int
	// wildcard records whether a "*." rule was seen for this node.
	wildcard bool

	// index is this node's position in the generated nodes array.
	index int
	// firstChild is the position of this node's first child in the nodes
	// array. It is zero for a node without children.
	firstChild int

	// children holds the child nodes, ordered by strictly increasing label.
	children []*node
}
|
|
|
|
|
+
|
|
|
|
|
+func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
|
|
|
|
|
+ if err := f(w, n); err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ for _, c := range n.children {
|
|
|
|
|
+ if err := c.walk(w, f); err != nil {
|
|
|
|
|
+ return err
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// child returns the child of n with the given label. The child is created if
|
|
|
|
|
+// it did not exist beforehand.
|
|
|
|
|
+func (n *node) child(label string) *node {
|
|
|
|
|
+ for _, c := range n.children {
|
|
|
|
|
+ if c.label == label {
|
|
|
|
|
+ return c
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ c := &node{
|
|
|
|
|
+ label: label,
|
|
|
|
|
+ nodeType: nodeTypeParentOnly,
|
|
|
|
|
+ }
|
|
|
|
|
+ n.children = append(n.children, c)
|
|
|
|
|
+ sort.Sort(byLabel(n.children))
|
|
|
|
|
+ return c
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+type byLabel []*node
|
|
|
|
|
+
|
|
|
|
|
+func (b byLabel) Len() int { return len(b) }
|
|
|
|
|
+func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
|
|
|
|
|
+func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
|
|
|
|
|
+
|
|
|
|
|
+var nextNodeIndex int
|
|
|
|
|
+
|
|
|
|
|
+func assignNodeIndexes(w io.Writer, n *node) error {
|
|
|
|
|
+ if len(n.children) != 0 {
|
|
|
|
|
+ n.firstChild = nextNodeIndex
|
|
|
|
|
+ for _, c := range n.children {
|
|
|
|
|
+ c.index = nextNodeIndex
|
|
|
|
|
+ nextNodeIndex++
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func printNode(w io.Writer, n *node) error {
|
|
|
|
|
+ for _, c := range n.children {
|
|
|
|
|
+ s := "-------------"
|
|
|
|
|
+ if len(c.children) != 0 {
|
|
|
|
|
+ s = fmt.Sprintf("0x%04x-0x%04x", c.firstChild, c.firstChild+len(c.children))
|
|
|
|
|
+ }
|
|
|
|
|
+ wildcardBit, wildcardStr := uint32(0), ' '
|
|
|
|
|
+ if c.wildcard {
|
|
|
|
|
+ wildcardBit, wildcardStr = 1<<29, '*'
|
|
|
|
|
+ }
|
|
|
|
|
+ if c.firstChild >= 1<<16 || len(c.children) >= 1<<13 {
|
|
|
|
|
+ return fmt.Errorf("nodes offset/length is too large: %d/%d", c.firstChild, len(c.children))
|
|
|
|
|
+ }
|
|
|
|
|
+ encoding := uint32(c.nodeType<<30) | wildcardBit | uint32(len(c.children)<<16) | uint32(c.firstChild)
|
|
|
|
|
+ fmt.Fprintf(w, "{0x%08x, 0x%08x}, // 0x%04x (%s) %s%c %s\n",
|
|
|
|
|
+ encoding, labelEncoding[c.label], c.index, s,
|
|
|
|
|
+ nodeTypeString(c.nodeType), wildcardStr, c.label,
|
|
|
|
|
+ )
|
|
|
|
|
+ }
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+func printNodeLabel(w io.Writer, n *node) error {
|
|
|
|
|
+ for _, c := range n.children {
|
|
|
|
|
+ fmt.Fprintf(w, "%q,\n", c.label)
|
|
|
|
|
+ }
|
|
|
|
|
+ return nil
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// makeText combines all the strings in labelsList to form one giant string.
|
|
|
|
|
+// If the crush flag is true, then overlapping strings will be merged: "arpa"
|
|
|
|
|
+// and "parliament" could yield "arparliament".
|
|
|
|
|
+func makeText() string {
|
|
|
|
|
+ if !*crush {
|
|
|
|
|
+ return strings.Join(labelsList, "")
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ beforeLength := 0
|
|
|
|
|
+ for _, s := range labelsList {
|
|
|
|
|
+ beforeLength += len(s)
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Make a copy of labelsList.
|
|
|
|
|
+ ss := append(make([]string, 0, len(labelsList)), labelsList...)
|
|
|
|
|
+
|
|
|
|
|
+ // Remove strings that are substrings of other strings.
|
|
|
|
|
+ for changed := true; changed; {
|
|
|
|
|
+ changed = false
|
|
|
|
|
+ for i, s := range ss {
|
|
|
|
|
+ if s == "" {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ for j, t := range ss {
|
|
|
|
|
+ if i != j && t != "" && strings.Contains(s, t) {
|
|
|
|
|
+ changed = true
|
|
|
|
|
+ ss[j] = ""
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Remove the empty strings.
|
|
|
|
|
+ sort.Strings(ss)
|
|
|
|
|
+ for len(ss) > 0 && ss[0] == "" {
|
|
|
|
|
+ ss = ss[1:]
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // Join strings where one suffix matches another prefix.
|
|
|
|
|
+ for {
|
|
|
|
|
+ // Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
|
|
|
|
|
+ // maximizing overlap length k.
|
|
|
|
|
+ besti := -1
|
|
|
|
|
+ bestj := -1
|
|
|
|
|
+ bestk := 0
|
|
|
|
|
+ for i, s := range ss {
|
|
|
|
|
+ if s == "" {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ for j, t := range ss {
|
|
|
|
|
+ if i == j {
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
|
|
|
|
|
+ if s[len(s)-k:] == t[:k] {
|
|
|
|
|
+ besti = i
|
|
|
|
|
+ bestj = j
|
|
|
|
|
+ bestk = k
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ if bestk > 0 {
|
|
|
|
|
+ if *v {
|
|
|
|
|
+ fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
|
|
|
|
|
+ bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
|
|
|
|
|
+ }
|
|
|
|
|
+ ss[besti] += ss[bestj][bestk:]
|
|
|
|
|
+ ss[bestj] = ""
|
|
|
|
|
+ continue
|
|
|
|
|
+ }
|
|
|
|
|
+ break
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ text := strings.Join(ss, "")
|
|
|
|
|
+ if *v {
|
|
|
|
|
+ fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
|
|
|
|
|
+ }
|
|
|
|
|
+ return text
|
|
|
|
|
+}
|