Browse Source

go.net/publicsuffix: new package.

The tables were generated by:

go run gen.go -subset -version "subset of publicsuffix.org's effective_tld_names.dat, hg revision 05b11a8d1ace (2012-11-09)"       >table.go

go run gen.go -subset -version "subset of publicsuffix.org's effective_tld_names.dat, hg revision 05b11a8d1ace (2012-11-09)" -test >table_test.go

The input data is subsetted so that code review is easier while still
covering the interesting * and ! rules. A follow-up changelist will
check in the unfiltered public suffix list.

Update golang/go#1960.

R=rsc, dr.volker.dobler
CC=golang-dev
https://golang.org/cl/6912045
Nigel Tao 13 years ago
parent
commit
67a3048087
5 changed files with 1082 additions and 0 deletions
  1. 439 0
      publicsuffix/gen.go
  2. 97 0
      publicsuffix/list.go
  3. 271 0
      publicsuffix/list_test.go
  4. 119 0
      publicsuffix/table.go
  5. 156 0
      publicsuffix/table_test.go

+ 439 - 0
publicsuffix/gen.go

@@ -0,0 +1,439 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+// This program generates table.go and table_test.go.
+// Invoke as:
+//
+//	go run gen.go -version "xxx"       >table.go
+//	go run gen.go -version "xxx" -test >table_test.go
+//
+// The version is derived from information found at
+// http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat
+// which is linked from http://publicsuffix.org/list/.
+//
+// To fetch a particular hg revision, such as 05b11a8d1ace, pass
+// -url "http://hg.mozilla.org/mozilla-central/raw-file/05b11a8d1ace/netwerk/dns/effective_tld_names.dat"
+
+// TODO(nigeltao): decide what to do with non-ASCII entries.
+
+import (
+	"bufio"
+	"bytes"
+	"flag"
+	"fmt"
+	"go/format"
+	"io"
+	"net/http"
+	"os"
+	"sort"
+	"strings"
+)
+
+// Node types. The same values are written into the generated table.go, and
+// printNode stores them in the top two bits of each node's first uint32.
+const (
+	nodeTypeNormal     = 0
+	nodeTypeException  = 1
+	nodeTypeParentOnly = 2
+)
+
+// nodeTypeString returns the one-character symbol that stands for node type
+// n in the comments of the generated nodes table: "+" for normal, "!" for
+// exception and "o" for parent-only. It panics for any other value.
+func nodeTypeString(n int) string {
+	if n == nodeTypeNormal {
+		return "+"
+	}
+	if n == nodeTypeException {
+		return "!"
+	}
+	if n == nodeTypeParentOnly {
+		return "o"
+	}
+	panic("unreachable")
+}
+
+var (
+	// labelEncoding maps a label to its packed position in the generated
+	// text constant: offset<<8 | length. It is filled in by printReal.
+	labelEncoding = map[string]uint32{}
+	// labelsList holds every distinct label, sorted; main1 builds it from
+	// labelsMap once all rules have been read.
+	labelsList    = []string{}
+	// labelsMap is the set of labels seen in the accepted rules.
+	labelsMap     = map[string]bool{}
+	// rules holds the accepted rules verbatim, in input order.
+	rules         = []string{}
+
+	// Command-line flags.
+	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
+	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
+	url    = flag.String("url",
+		"http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1",
+		"URL of the publicsuffix.org list. If empty, stdin is read instead")
+	v       = flag.Bool("v", false, "verbose output (to stderr)")
+	version = flag.String("version", "", "the effective_tld_names.dat version")
+	test    = flag.Bool("test", false, "generate table_test.go")
+)
+
+// main runs main1 and, if it fails, prints the error to stderr and exits
+// with status 1.
+func main() {
+	err := main1()
+	if err == nil {
+		return
+	}
+	fmt.Fprintln(os.Stderr, err)
+	os.Exit(1)
+}
+
+// main1 reads the public suffix list from the address given by the -url flag
+// (or from stdin when -url is empty), builds the tree of nodes from its
+// rules, and writes gofmt-formatted Go source to stdout: table.go by default,
+// or table_test.go when -test is set. Blank lines, "//" comment lines and
+// lines containing non-ASCII bytes are skipped; with -subset only a fixed
+// handful of suffixes is kept.
+//
+// It returns an error rather than exiting so that the deferred close of the
+// HTTP response body runs before main reports the failure.
+func main1() error {
+	flag.Parse()
+	if *version == "" {
+		return fmt.Errorf("-version was not specified")
+	}
+	var r io.Reader = os.Stdin
+	if *url != "" {
+		res, err := http.Get(*url)
+		if err != nil {
+			return err
+		}
+		// Register the close before checking the status so that the body is
+		// released on the bad-status return path as well.
+		defer res.Body.Close()
+		if res.StatusCode != http.StatusOK {
+			// res.Status is a string such as "404 Not Found", so it needs %s.
+			return fmt.Errorf("bad GET status for %s: %s", *url, res.Status)
+		}
+		r = res.Body
+	}
+
+	var root node
+	buf := new(bytes.Buffer)
+	br := bufio.NewReader(r)
+	for eof := false; !eof; {
+		s, err := br.ReadString('\n')
+		if err != nil {
+			if err != io.EOF {
+				return err
+			}
+			// ReadString returns whatever it read before hitting EOF, so
+			// fall through once more to process a final line that has no
+			// trailing newline instead of dropping it.
+			eof = true
+		}
+		s = strings.TrimSpace(s)
+		if s == "" || strings.HasPrefix(s, "//") || !isASCII(s) {
+			continue
+		}
+
+		if *subset {
+			switch {
+			case s == "ao" || strings.HasSuffix(s, ".ao"):
+			case s == "ar" || strings.HasSuffix(s, ".ar"):
+			case s == "arpa" || strings.HasSuffix(s, ".arpa"):
+			case s == "jp":
+			case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
+			case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
+			case s == "uk" || strings.HasSuffix(s, ".uk"):
+			case s == "zw" || strings.HasSuffix(s, ".zw"):
+			default:
+				continue
+			}
+		}
+
+		rules = append(rules, s)
+
+		// Strip the rule's "*." or "!" marker and remember what it meant.
+		nt, wildcard := nodeTypeNormal, false
+		switch {
+		case strings.HasPrefix(s, "*."):
+			s, nt = s[2:], nodeTypeParentOnly
+			wildcard = true
+		case strings.HasPrefix(s, "!"):
+			s, nt = s[1:], nodeTypeException
+		}
+		// Walk the labels right to left (TLD first), creating nodes as needed.
+		labels := strings.Split(s, ".")
+		for n, i := &root, len(labels)-1; i >= 0; i-- {
+			label := labels[i]
+			n = n.child(label)
+			if i == 0 {
+				// Only the leftmost label's node takes the rule's type, and
+				// only if no earlier rule has already given it one.
+				if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
+					n.nodeType = nt
+				}
+				n.wildcard = n.wildcard || wildcard
+			}
+			labelsMap[label] = true
+		}
+	}
+	labelsList = make([]string, 0, len(labelsMap))
+	for label := range labelsMap {
+		labelsList = append(labelsList, label)
+	}
+	sort.Strings(labelsList)
+
+	p := printReal
+	if *test {
+		p = printTest
+	}
+	if err := p(buf, &root); err != nil {
+		return err
+	}
+
+	b, err := format.Source(buf.Bytes())
+	if err != nil {
+		return err
+	}
+	_, err = os.Stdout.Write(b)
+	return err
+}
+
+// isASCII reports whether every byte of s lies in the range [32, 127]. The
+// empty string qualifies.
+func isASCII(s string) bool {
+	for _, c := range []byte(s) {
+		if c < 32 || c > 127 {
+			return false
+		}
+	}
+	return true
+}
+
+// printTest writes the source of table_test.go to w: the rules slice in input
+// order, followed by the label of every node below n in walk order.
+func printTest(w io.Writer, n *node) error {
+	fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
+	fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
+	for i := range rules {
+		fmt.Fprintf(w, "%q,\n", rules[i])
+	}
+	fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
+	err := n.walk(w, printNodeLabel)
+	if err == nil {
+		fmt.Fprintf(w, "}\n")
+	}
+	return err
+}
+
+// printReal writes the source of table.go to w: the version and node type
+// constants, the number of top level domains (the root's child count), the
+// combined label text, and the encoded nodes table for the tree rooted at n.
+// As a side effect it fills in labelEncoding and assigns node indexes.
+func printReal(w io.Writer, n *node) error {
+	const header = `// generated by go run gen.go; DO NOT EDIT
+
+package publicsuffix
+
+const version = %q
+
+const (
+	nodeTypeNormal     = %d
+	nodeTypeException  = %d
+	nodeTypeParentOnly = %d
+)
+
+// numTLD is the number of top level domains.
+const numTLD = %d
+
+`
+	fmt.Fprintf(w, header, *version, nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
+
+	text := makeText()
+	if text == "" {
+		return fmt.Errorf("internal error: makeText returned no text")
+	}
+	// Locate each label inside text and record its packed position. The
+	// offset must fit in 24 bits and the length in 8 bits.
+	for _, label := range labelsList {
+		offset, length := strings.Index(text, label), len(label)
+		if offset < 0 {
+			return fmt.Errorf("internal error: could not find %q in text %q", label, text)
+		}
+		if offset >= 1<<24 || length >= 1<<8 {
+			return fmt.Errorf("text offset/length is too large: %d/%d", offset, length)
+		}
+		labelEncoding[label] = uint32(offset)<<8 | uint32(length)
+	}
+	// Emit text as a concatenation of quoted chunks of at most 64 bytes each.
+	fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
+	for len(text) > 0 {
+		// Note: this n is a chunk length that shadows the node parameter n
+		// for the duration of the loop body.
+		n, plus := len(text), ""
+		if n > 64 {
+			n, plus = 64, " +"
+		}
+		fmt.Fprintf(w, "%q%s\n", text[:n], plus)
+		text = text[n:]
+	}
+
+	// assignNodeIndexes always returns nil, so the result is not checked.
+	n.walk(w, assignNodeIndexes)
+
+	fmt.Fprintf(w, `
+
+// Nodes is the list of nodes. Each node is encoded as two uint32 values.
+//
+// The first uint32 encodes the node's children, nodeType, and a wildcard bit.
+// In the //-comment after each node's data, the indexes of the children are
+// formatted as (0x1234-0x1256). The nodeType is printed as + for normal, ! for
+// exception, and o for parent-only nodes that have children but don't match a
+// domain in their own right. The * denotes the wildcard bit. The layout within
+// the uint32, from MSB to LSB, is:
+//	[2] nodeType [1] wildcard [13] number of children [16] first child.
+// If a node has no children then the low 29 bits are zero.
+//
+// The second uint32 encodes the node's text. The layout is:
+//	[24] text offset [8] text length.
+//
+// TODO(nigeltao): this table has a lot of zeroes, for childless nodes. It
+// would be tight, but it should be possible to use only 32 bits per node
+// instead of 64, with an offset into a parent-child table. A back-of-the-
+// envelope calculation suggests that at 6000 rows (of which 90%% are leaves),
+// this could save an extra 20KiB of data.
+var nodes = [...][2]uint32{
+`)
+	if err := n.walk(w, printNode); err != nil {
+		return err
+	}
+	fmt.Fprintf(w, "}\n")
+	return nil
+}
+
+// node is one label in the tree built from the rules. The root node has an
+// empty label and its children are the top level domains.
+type node struct {
+	// label is this node's single domain label, such as "uk" or "co".
+	label    string
+	// nodeType is one of the nodeTypeXxx constants.
+	nodeType int
+	// wildcard is set when a "*." rule was given directly under this node.
+	wildcard bool
+	// index is the index of this node in the nodes array.
+	index int
+	// firstChild is the index of this node's first child, or zero if this
+	// node has no children.
+	firstChild int
+	// children are the node's children, in strictly increasing node label order.
+	children []*node
+}
+
+// walk calls f on n and then, recursively and in order, on each of n's
+// children. It stops at, and returns, the first non-nil error from f.
+func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
+	err := f(w, n)
+	if err != nil {
+		return err
+	}
+	kids := n.children
+	for i := 0; i < len(kids) && err == nil; i++ {
+		err = kids[i].walk(w, f)
+	}
+	return err
+}
+
+// child looks up the child of n labelled label. If there is none, a new
+// parent-only child is added, n's children are re-sorted by label, and the
+// new child is returned.
+func (n *node) child(label string) *node {
+	for i := range n.children {
+		if existing := n.children[i]; existing.label == label {
+			return existing
+		}
+	}
+	created := &node{label: label, nodeType: nodeTypeParentOnly}
+	n.children = append(n.children, created)
+	sort.Sort(byLabel(n.children))
+	return created
+}
+
+// byLabel implements sort.Interface, ordering nodes by increasing label.
+type byLabel []*node
+
+func (b byLabel) Len() int           { return len(b) }
+func (b byLabel) Swap(i, j int)      { b[i], b[j] = b[j], b[i] }
+func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
+
+// nextNodeIndex is the next unused index in the generated nodes array.
+var nextNodeIndex int
+
+// assignNodeIndexes is a walk callback that gives n's children consecutive
+// indexes starting at nextNodeIndex and records the first of them in
+// n.firstChild. A node without children is left untouched. The writer is
+// unused and the returned error is always nil.
+func assignNodeIndexes(w io.Writer, n *node) error {
+	if len(n.children) == 0 {
+		return nil
+	}
+	n.firstChild = nextNodeIndex
+	for i := range n.children {
+		n.children[i].index = nextNodeIndex
+		nextNodeIndex++
+	}
+	return nil
+}
+
+// printNode is a walk callback that writes one row of the generated nodes
+// table for each child of n: the two encoded uint32 values followed by a
+// comment giving the child's index, its own children's index range, its node
+// type symbol, its wildcard marker and its label. It returns an error if a
+// child's first-child index or child count does not fit the encoding.
+func printNode(w io.Writer, n *node) error {
+	for _, c := range n.children {
+		// s is the children index range shown in the comment; the upper
+		// bound is exclusive. Childless nodes get a row of dashes.
+		s := "-------------"
+		if len(c.children) != 0 {
+			s = fmt.Sprintf("0x%04x-0x%04x", c.firstChild, c.firstChild+len(c.children))
+		}
+		wildcardBit, wildcardStr := uint32(0), ' '
+		if c.wildcard {
+			wildcardBit, wildcardStr = 1<<29, '*'
+		}
+		// firstChild gets 16 bits and the child count gets 13 bits.
+		if c.firstChild >= 1<<16 || len(c.children) >= 1<<13 {
+			return fmt.Errorf("nodes offset/length is too large: %d/%d", c.firstChild, len(c.children))
+		}
+		// From MSB to LSB: nodeType, wildcard bit, child count, first child.
+		encoding := uint32(c.nodeType<<30) | wildcardBit | uint32(len(c.children)<<16) | uint32(c.firstChild)
+		fmt.Fprintf(w, "{0x%08x, 0x%08x}, // 0x%04x (%s) %s%c %s\n",
+			encoding, labelEncoding[c.label], c.index, s,
+			nodeTypeString(c.nodeType), wildcardStr, c.label,
+		)
+	}
+	return nil
+}
+
+// printNodeLabel is a walk callback that writes the quoted label of each of
+// n's children to w, one per line with a trailing comma. It always returns
+// nil.
+func printNodeLabel(w io.Writer, n *node) error {
+	kids := n.children
+	for i := 0; i < len(kids); i++ {
+		fmt.Fprintf(w, "%q,\n", kids[i].label)
+	}
+	return nil
+}
+
+// makeText combines all the strings in labelsList to form one giant string.
+// If the crush flag is true, then overlapping strings will be merged: "arpa"
+// and "parliament" could yield "arparliament".
+//
+// Crushing is greedy: labels contained in other labels are dropped first,
+// then the pair with the longest suffix/prefix overlap is merged repeatedly
+// until no overlapping pair remains, and whatever is left is concatenated.
+func makeText() string {
+	if !*crush {
+		return strings.Join(labelsList, "")
+	}
+
+	// beforeLength is only used for the verbose size report at the end.
+	beforeLength := 0
+	for _, s := range labelsList {
+		beforeLength += len(s)
+	}
+
+	// Make a copy of labelsList.
+	ss := append(make([]string, 0, len(labelsList)), labelsList...)
+
+	// Remove strings that are substrings of other strings.
+	// Removed entries are blanked to "" rather than deleted, and the pass is
+	// repeated until it changes nothing.
+	for changed := true; changed; {
+		changed = false
+		for i, s := range ss {
+			if s == "" {
+				continue
+			}
+			for j, t := range ss {
+				if i != j && t != "" && strings.Contains(s, t) {
+					changed = true
+					ss[j] = ""
+				}
+			}
+		}
+	}
+
+	// Remove the empty strings.
+	// Sorting moves them to the front, where they are sliced off.
+	sort.Strings(ss)
+	for len(ss) > 0 && ss[0] == "" {
+		ss = ss[1:]
+	}
+
+	// Join strings where one suffix matches another prefix.
+	for {
+		// Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
+		// maximizing overlap length k.
+		besti := -1
+		bestj := -1
+		bestk := 0
+		for i, s := range ss {
+			if s == "" {
+				continue
+			}
+			for j, t := range ss {
+				if i == j {
+					continue
+				}
+				// Only overlaps strictly longer than the best so far are
+				// tried. An emptied t never enters this loop.
+				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
+					if s[len(s)-k:] == t[:k] {
+						besti = i
+						bestj = j
+						bestk = k
+					}
+				}
+			}
+		}
+		if bestk > 0 {
+			if *v {
+				fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
+					bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
+			}
+			// Append the non-overlapping tail of ss[bestj] to ss[besti] and
+			// blank the merged entry.
+			ss[besti] += ss[bestj][bestk:]
+			ss[bestj] = ""
+			continue
+		}
+		break
+	}
+
+	text := strings.Join(ss, "")
+	if *v {
+		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
+	}
+	return text
+}

+ 97 - 0
publicsuffix/list.go

@@ -0,0 +1,97 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package publicsuffix provides a public suffix list based on data from
+// http://publicsuffix.org/. A public suffix is one under which Internet users
+// can directly register names.
+package publicsuffix
+
+// TODO(nigeltao): do we need to distinguish between ICANN domains and private
+// domains?
+
+import (
+	"exp/cookiejar"
+	"strings"
+)
+
+// List implements cookiejar.PublicSuffixList using a copy of the
+// publicsuffix.org database compiled into the library.
+var List cookiejar.PublicSuffixList = list{}
+
+// list is the stateless type behind List; its methods read only the
+// package-level generated tables.
+type list struct{}
+
+// String returns the version string of the compiled-in table.
+func (list) String() string {
+	return version
+}
+
+// PublicSuffix returns the public suffix of domain, found by matching the
+// domain's labels from right to left against the generated nodes table. If
+// no rule matches, the result is the domain's last label.
+func (list) PublicSuffix(domain string) string {
+	// [lo, hi) is the range of nodes to search for the next label; it starts
+	// out as the top level domains. s is the part of domain not yet consumed,
+	// suffix is the index in domain where the public suffix begins, and
+	// wildcard reports whether the previously matched node had its wildcard
+	// bit set.
+	lo, hi := uint32(0), uint32(numTLD)
+	s, suffix, wildcard := domain, len(domain), false
+loop:
+	for {
+		dot := strings.LastIndex(s, ".")
+		if wildcard {
+			// The parent's wildcard covers the current label, whatever it is.
+			suffix = 1 + dot
+		}
+		if lo == hi {
+			// The previously matched node has no children.
+			break
+		}
+		f := find(s[1+dot:], lo, hi)
+		if f == notFound {
+			break
+		}
+
+		// Decode the node's first uint32, from LSB to MSB: 16 bits of first
+		// child index, 13 bits of child count, 1 wildcard bit, and the
+		// nodeType in what remains.
+		u := nodes[f][0]
+		lo = u & 0xffff
+		u >>= 16
+		hi = u&0x1fff + lo
+		u >>= 13
+		wildcard = u&0x01 != 0
+		u >>= 1
+		switch u {
+		case nodeTypeNormal:
+			suffix = 1 + dot
+		case nodeTypeException:
+			// An exception rule puts the suffix just after the matched label
+			// and ends the search.
+			suffix = 1 + len(s)
+			break loop
+		}
+
+		if dot == -1 {
+			// That was the leftmost label.
+			break
+		}
+		s = s[:dot]
+	}
+	if suffix == len(domain) {
+		// If no rules match, the prevailing rule is "*".
+		return domain[1+strings.LastIndex(domain, "."):]
+	}
+	return domain[suffix:]
+}
+
+// notFound is the value find returns when no node matches.
+const notFound uint32 = 1<<32 - 1
+
+// find binary-searches the nodes in [lo, hi) for the one whose label equals
+// label and returns its index, or notFound if none does. The labels in that
+// range are assumed to be in strictly increasing order.
+func find(label string, lo, hi uint32) uint32 {
+	for lo < hi {
+		mid := lo + (hi-lo)/2
+		switch s := nodeLabel(mid); {
+		case s == label:
+			return mid
+		case s < label:
+			lo = mid + 1
+		default:
+			hi = mid
+		}
+	}
+	return notFound
+}
+
+// nodeLabel returns the label of node i, sliced out of text using the offset
+// (high 24 bits) and length (low 8 bits) held in the node's second uint32.
+func nodeLabel(i uint32) string {
+	enc := nodes[i][1]
+	start := enc >> 8
+	end := start + (enc & 0xff)
+	return text[start:end]
+}

+ 271 - 0
publicsuffix/list_test.go

@@ -0,0 +1,271 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package publicsuffix
+
+import (
+	"sort"
+	"strings"
+	"testing"
+)
+
+// TestNodeLabel checks that nodeLabel decodes, for every node index, the
+// label recorded at the same position in the generated nodeLabels list.
+func TestNodeLabel(t *testing.T) {
+	for i := 0; i < len(nodeLabels); i++ {
+		want := nodeLabels[i]
+		if got := nodeLabel(uint32(i)); got != want {
+			t.Errorf("%d: got %q, want %q", i, got, want)
+		}
+	}
+}
+
+// TestFind compares find's binary search over the top level domains with a
+// linear scan, using labels that are present, absent, or lexically adjacent
+// to present ones.
+func TestFind(t *testing.T) {
+	testCases := []string{
+		"",
+		"a",
+		"a0",
+		"aaaa",
+		"ao",
+		"ap",
+		"ar",
+		"aro",
+		"arp",
+		"arpa",
+		"arpaa",
+		"arpb",
+		"az",
+		"b",
+		"b0",
+		"ba",
+		"z",
+		"zu",
+		"zv",
+		"zw",
+		"zx",
+		"zy",
+		"zz",
+		"zzzz",
+	}
+	for _, tc := range testCases {
+		got := find(tc, 0, numTLD)
+		// The expected answer comes from scanning the same range linearly.
+		want := notFound
+		for i := uint32(0); i < numTLD; i++ {
+			if tc == nodeLabel(i) {
+				want = i
+				break
+			}
+		}
+		if got != want {
+			t.Errorf("%q: got %d, want %d", tc, got, want)
+		}
+	}
+}
+
+var publicSuffixTestCases = []struct {
+	domain, want string
+}{
+	// Empty string.
+	{"", ""},
+
+	// The .ao rules are:
+	// ao
+	// ed.ao
+	// gv.ao
+	// og.ao
+	// co.ao
+	// pb.ao
+	// it.ao
+	{"ao", "ao"},
+	{"www.ao", "ao"},
+	{"pb.ao", "pb.ao"},
+	{"www.pb.ao", "pb.ao"},
+	{"www.xxx.yyy.zzz.pb.ao", "pb.ao"},
+
+	// The .ar rules are:
+	// *.ar
+	// !congresodelalengua3.ar
+	// !educ.ar
+	// !gobiernoelectronico.ar
+	// !mecon.ar
+	// !nacion.ar
+	// !nic.ar
+	// !promocion.ar
+	// !retina.ar
+	// !uba.ar
+	// blogspot.com.ar
+	{"ar", "ar"},
+	{"www.ar", "www.ar"},
+	{"nic.ar", "ar"},
+	{"www.nic.ar", "ar"},
+	{"com.ar", "com.ar"},
+	{"www.com.ar", "com.ar"},
+	{"blogspot.com.ar", "blogspot.com.ar"},
+	{"www.blogspot.com.ar", "blogspot.com.ar"},
+	{"www.xxx.yyy.zzz.blogspot.com.ar", "blogspot.com.ar"},
+	{"logspot.com.ar", "com.ar"},
+	{"zlogspot.com.ar", "com.ar"},
+	{"zblogspot.com.ar", "com.ar"},
+
+	// The .arpa rules are:
+	// e164.arpa
+	// in-addr.arpa
+	// ip6.arpa
+	// iris.arpa
+	// uri.arpa
+	// urn.arpa
+	{"arpa", "arpa"},
+	{"www.arpa", "arpa"},
+	{"urn.arpa", "urn.arpa"},
+	{"www.urn.arpa", "urn.arpa"},
+	{"www.xxx.yyy.zzz.urn.arpa", "urn.arpa"},
+
+	// The relevant {kobe,kyoto}.jp rules are:
+	// jp
+	// *.kobe.jp
+	// !city.kobe.jp
+	// kyoto.jp
+	// ide.kyoto.jp
+	{"jp", "jp"},
+	{"kobe.jp", "jp"},
+	{"c.kobe.jp", "c.kobe.jp"},
+	{"b.c.kobe.jp", "c.kobe.jp"},
+	{"a.b.c.kobe.jp", "c.kobe.jp"},
+	{"city.kobe.jp", "kobe.jp"},
+	{"www.city.kobe.jp", "kobe.jp"},
+	{"kyoto.jp", "kyoto.jp"},
+	{"test.kyoto.jp", "kyoto.jp"},
+	{"ide.kyoto.jp", "ide.kyoto.jp"},
+	{"b.ide.kyoto.jp", "ide.kyoto.jp"},
+	{"a.b.ide.kyoto.jp", "ide.kyoto.jp"},
+
+	// The .uk rules are:
+	// *.uk
+	// *.sch.uk
+	// !bl.uk
+	// !british-library.uk
+	// !jet.uk
+	// !mod.uk
+	// !national-library-scotland.uk
+	// !nel.uk
+	// !nic.uk
+	// !nls.uk
+	// !parliament.uk
+	// blogspot.co.uk
+	{"uk", "uk"},
+	{"aaa.uk", "aaa.uk"},
+	{"www.aaa.uk", "aaa.uk"},
+	{"mod.uk", "uk"},
+	{"www.mod.uk", "uk"},
+	{"sch.uk", "sch.uk"},
+	{"mod.sch.uk", "mod.sch.uk"},
+	{"www.sch.uk", "www.sch.uk"},
+	{"blogspot.co.uk", "blogspot.co.uk"},
+	{"blogspot.nic.uk", "uk"},
+	{"blogspot.sch.uk", "blogspot.sch.uk"},
+
+	// The .zw rules are:
+	// *.zw
+	{"zw", "zw"},
+	{"www.zw", "www.zw"},
+	{"zzz.zw", "zzz.zw"},
+	{"www.zzz.zw", "zzz.zw"},
+	{"www.xxx.yyy.zzz.zw", "zzz.zw"},
+
+	// There are no .nosuchtld rules.
+	{"nosuchtld", "nosuchtld"},
+	{"foo.nosuchtld", "nosuchtld"},
+	{"bar.foo.nosuchtld", "nosuchtld"},
+}
+
+// TestPublicSuffix runs every case in publicSuffixTestCases through the
+// table-driven List.PublicSuffix.
+func TestPublicSuffix(t *testing.T) {
+	for i := range publicSuffixTestCases {
+		tc := &publicSuffixTestCases[i]
+		if got := List.PublicSuffix(tc.domain); got != tc.want {
+			t.Errorf("%q: got %q, want %q", tc.domain, got, tc.want)
+		}
+	}
+}
+
+// TestSlowPublicSuffix runs the same cases through the reference
+// implementation, slowPublicSuffix.
+func TestSlowPublicSuffix(t *testing.T) {
+	for i := range publicSuffixTestCases {
+		tc := &publicSuffixTestCases[i]
+		if got := slowPublicSuffix(tc.domain); got != tc.want {
+			t.Errorf("%q: got %q, want %q", tc.domain, got, tc.want)
+		}
+	}
+}
+
+// slowPublicSuffix implements the canonical (but O(number of rules)) public
+// suffix algorithm described at http://publicsuffix.org/list/.
+//
+// 1. Match domain against all rules and take note of the matching ones.
+// 2. If no rules match, the prevailing rule is "*".
+// 3. If more than one rule matches, the prevailing rule is the one which is an exception rule.
+// 4. If there is no matching exception rule, the prevailing rule is the one with the most labels.
+// 5. If the prevailing rule is an exception rule, modify it by removing the leftmost label.
+// 6. The public suffix is the set of labels from the domain which directly match the labels of the prevailing rule (joined by dots).
+// 7. The registered or registrable domain is the public suffix plus one additional label.
+//
+// This function returns the public suffix, not the registrable domain, and so
+// it stops after step 6.
+func slowPublicSuffix(domain string) string {
+	// match reports whether one label of a rule matches one label of the
+	// domain: "*" matches anything, and a leading "!" is ignored.
+	match := func(rulePart, domainPart string) bool {
+		switch rulePart[0] {
+		case '*':
+			return true
+		case '!':
+			return rulePart[1:] == domainPart
+		}
+		return rulePart == domainPart
+	}
+
+	domainParts := strings.Split(domain, ".")
+	var matchingRules [][]string
+
+	// Step 1: collect every rule whose labels all match, comparing from the
+	// rightmost label leftwards.
+loop:
+	for _, rule := range rules {
+		ruleParts := strings.Split(rule, ".")
+		if len(domainParts) < len(ruleParts) {
+			continue
+		}
+		for i := range ruleParts {
+			rulePart := ruleParts[len(ruleParts)-1-i]
+			domainPart := domainParts[len(domainParts)-1-i]
+			if !match(rulePart, domainPart) {
+				continue loop
+			}
+		}
+		matchingRules = append(matchingRules, ruleParts)
+	}
+	// Steps 2-4: choose the prevailing rule.
+	if len(matchingRules) == 0 {
+		matchingRules = append(matchingRules, []string{"*"})
+	} else {
+		sort.Sort(byPriority(matchingRules))
+	}
+	prevailing := matchingRules[0]
+	// Step 5: drop the leftmost label of an exception rule.
+	if prevailing[0][0] == '!' {
+		prevailing = prevailing[1:]
+	}
+	// Step 6: substitute the domain's own label for a leading wildcard.
+	if prevailing[0][0] == '*' {
+		replaced := domainParts[len(domainParts)-len(prevailing)]
+		prevailing = append([]string{replaced}, prevailing[1:]...)
+	}
+	return strings.Join(prevailing, ".")
+}
+
+// byPriority implements sort.Interface over rules that have been split into
+// labels. Exception rules (leftmost label starting with '!') sort before all
+// other rules; within each of those two groups, rules with more labels sort
+// first.
+type byPriority [][]string
+
+func (b byPriority) Len() int      { return len(b) }
+func (b byPriority) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
+
+// Less reports whether rule i has strictly higher priority than rule j. Two
+// exception rules are compared by label count like any other pair, so that
+// Less(i, j) and Less(j, i) are never both true.
+func (b byPriority) Less(i, j int) bool {
+	iException := b[i][0][0] == '!'
+	jException := b[j][0][0] == '!'
+	if iException != jException {
+		return iException
+	}
+	return len(b[i]) > len(b[j])
+}
+
+// TODO(nigeltao): add the "Effective Top Level Domain Plus 1" tests from
+// http://mxr.mozilla.org/mozilla-central/source/netwerk/test/unit/data/test_psl.txt

+ 119 - 0
publicsuffix/table.go

@@ -0,0 +1,119 @@
+// generated by go run gen.go; DO NOT EDIT
+
+package publicsuffix
+
+const version = "subset of publicsuffix.org's effective_tld_names.dat, hg revision 05b11a8d1ace (2012-11-09)"
+
+const (
+	nodeTypeNormal     = 0
+	nodeTypeException  = 1
+	nodeTypeParentOnly = 2
+)
+
+// numTLD is the number of top level domains.
+const numTLD = 6
+
+// Text is the combined text of all labels.
+const text = "british-libraryawatarparliamentfukuchiyamashinacionakagyoyamazak" +
+	"itayabe164gvin-addretinagaokakyotambainelip6irisakyotanabejetjoy" +
+	"ojpblogspotkizujitawarakumiyamakyotangobiernoelectronicomecongre" +
+	"sodelalengua3kyotominamiyamashiromiyazurideducitymukobepromocion" +
+	"antanational-library-scotlandseikameokamodurnlschigashiyamaizuru" +
+	"bazwazuka"
+
+// Nodes is the list of nodes. Each node is encoded as two uint32 values.
+//
+// The first uint32 encodes the node's children, nodeType, and a wildcard bit.
+// In the //-comment after each node's data, the indexes of the children are
+// formatted as (0x1234-0x1256). The nodeType is printed as + for normal, ! for
+// exception, and o for parent-only nodes that have children but don't match a
+// domain in their own right. The * denotes the wildcard bit. The layout within
+// the uint32, from MSB to LSB, is:
+//	[2] nodeType [1] wildcard [13] number of children [16] first child.
+// If a node has no children then the low 29 bits are zero.
+//
+// The second uint32 encodes the node's text. The layout is:
+//	[24] text offset [8] text length.
+//
+// TODO(nigeltao): this table has a lot of zeroes, for childless nodes. It
+// would be tight, but it should be possible to use only 32 bits per node
+// instead of 64, with an offset into a parent-child table. A back-of-the-
+// envelope calculation suggests that at 6000 rows (of which 90% are leaves),
+// this could save an extra 20KiB of data.
+var nodes = [...][2]uint32{
+	{0x00060006, 0x00005902}, // 0x0000 (0x0006-0x000c) +  ao
+	{0xa00a000c, 0x00000c02}, // 0x0001 (0x000c-0x0016) o* ar
+	{0x80060017, 0x00001304}, // 0x0002 (0x0017-0x001d) o  arpa
+	{0x0002001d, 0x00008102}, // 0x0003 (0x001d-0x001f) +  jp
+	{0xa00b003f, 0x00002002}, // 0x0004 (0x003f-0x004a) o* uk
+	{0xa0000000, 0x00014202}, // 0x0005 (-------------) o* zw
+	{0x00000000, 0x0000b602}, // 0x0006 (-------------) +  co
+	{0x00000000, 0x0000ea02}, // 0x0007 (-------------) +  ed
+	{0x00000000, 0x00004a02}, // 0x0008 (-------------) +  gv
+	{0x00000000, 0x00000202}, // 0x0009 (-------------) +  it
+	{0x00000000, 0x00008502}, // 0x000a (-------------) +  og
+	{0x00000000, 0x00008202}, // 0x000b (-------------) +  pb
+	{0x80010016, 0x0000b603}, // 0x000c (0x0016-0x0017) o  com
+	{0x40000000, 0x0000ba13}, // 0x000d (-------------) !  congresodelalengua3
+	{0x40000000, 0x0000ea04}, // 0x000e (-------------) !  educ
+	{0x40000000, 0x0000a513}, // 0x000f (-------------) !  gobiernoelectronico
+	{0x40000000, 0x0000b805}, // 0x0010 (-------------) !  mecon
+	{0x40000000, 0x00002d06}, // 0x0011 (-------------) !  nacion
+	{0x40000000, 0x0000b403}, // 0x0012 (-------------) !  nic
+	{0x40000000, 0x0000f709}, // 0x0013 (-------------) !  promocion
+	{0x40000000, 0x00005206}, // 0x0014 (-------------) !  retina
+	{0x40000000, 0x00013f03}, // 0x0015 (-------------) !  uba
+	{0x00000000, 0x00008308}, // 0x0016 (-------------) +  blogspot
+	{0x00000000, 0x00004604}, // 0x0017 (-------------) +  e164
+	{0x00000000, 0x00004c07}, // 0x0018 (-------------) +  in-addr
+	{0x00000000, 0x00006903}, // 0x0019 (-------------) +  ip6
+	{0x00000000, 0x00006c04}, // 0x001a (-------------) +  iris
+	{0x00000000, 0x0000e603}, // 0x001b (-------------) +  uri
+	{0x00000000, 0x00012a03}, // 0x001c (-------------) +  urn
+	{0xa001001f, 0x0000f304}, // 0x001d (0x001f-0x0020) o* kobe
+	{0x001f0020, 0x0000cd05}, // 0x001e (0x0020-0x003f) +  kyoto
+	{0x40000000, 0x0000ed04}, // 0x001f (-------------) !  city
+	{0x00000000, 0x00004205}, // 0x0020 (-------------) +  ayabe
+	{0x00000000, 0x00001f0b}, // 0x0021 (-------------) +  fukuchiyama
+	{0x00000000, 0x0001300b}, // 0x0022 (-------------) +  higashiyama
+	{0x00000000, 0x0000e803}, // 0x0023 (-------------) +  ide
+	{0x00000000, 0x00006503}, // 0x0024 (-------------) +  ine
+	{0x00000000, 0x00007d04}, // 0x0025 (-------------) +  joyo
+	{0x00000000, 0x00012007}, // 0x0026 (-------------) +  kameoka
+	{0x00000000, 0x00012504}, // 0x0027 (-------------) +  kamo
+	{0x00000000, 0x00003f04}, // 0x0028 (-------------) +  kita
+	{0x00000000, 0x00008b04}, // 0x0029 (-------------) +  kizu
+	{0x00000000, 0x00009708}, // 0x002a (-------------) +  kumiyama
+	{0x00000000, 0x00005d08}, // 0x002b (-------------) +  kyotamba
+	{0x00000000, 0x00007109}, // 0x002c (-------------) +  kyotanabe
+	{0x00000000, 0x00009f08}, // 0x002d (-------------) +  kyotango
+	{0x00000000, 0x00013907}, // 0x002e (-------------) +  maizuru
+	{0x00000000, 0x0000d206}, // 0x002f (-------------) +  minami
+	{0x00000000, 0x0000d20f}, // 0x0030 (-------------) +  minamiyamashiro
+	{0x00000000, 0x0000e106}, // 0x0031 (-------------) +  miyazu
+	{0x00000000, 0x0000f104}, // 0x0032 (-------------) +  muko
+	{0x00000000, 0x0000560a}, // 0x0033 (-------------) +  nagaokakyo
+	{0x00000000, 0x00003207}, // 0x0034 (-------------) +  nakagyo
+	{0x00000000, 0x0000ff06}, // 0x0035 (-------------) +  nantan
+	{0x00000000, 0x00003809}, // 0x0036 (-------------) +  oyamazaki
+	{0x00000000, 0x00006f05}, // 0x0037 (-------------) +  sakyo
+	{0x00000000, 0x00011d05}, // 0x0038 (-------------) +  seika
+	{0x00000000, 0x00007406}, // 0x0039 (-------------) +  tanabe
+	{0x00000000, 0x00008e03}, // 0x003a (-------------) +  uji
+	{0x00000000, 0x00008e09}, // 0x003b (-------------) +  ujitawara
+	{0x00000000, 0x00014306}, // 0x003c (-------------) +  wazuka
+	{0x00000000, 0x00002609}, // 0x003d (-------------) +  yamashina
+	{0x00000000, 0x00000e06}, // 0x003e (-------------) +  yawata
+	{0x40000000, 0x00008302}, // 0x003f (-------------) !  bl
+	{0x40000000, 0x0000000f}, // 0x0040 (-------------) !  british-library
+	{0x8001004a, 0x0000b602}, // 0x0041 (0x004a-0x004b) o  co
+	{0x40000000, 0x00007a03}, // 0x0042 (-------------) !  jet
+	{0x40000000, 0x00012703}, // 0x0043 (-------------) !  mod
+	{0x40000000, 0x00010419}, // 0x0044 (-------------) !  national-library-scotland
+	{0x40000000, 0x00006603}, // 0x0045 (-------------) !  nel
+	{0x40000000, 0x0000b403}, // 0x0046 (-------------) !  nic
+	{0x40000000, 0x00012c03}, // 0x0047 (-------------) !  nls
+	{0x40000000, 0x0000150a}, // 0x0048 (-------------) !  parliament
+	{0xa0000000, 0x00012e03}, // 0x0049 (-------------) o* sch
+	{0x00000000, 0x00008308}, // 0x004a (-------------) +  blogspot
+}

+ 156 - 0
publicsuffix/table_test.go

@@ -0,0 +1,156 @@
+// generated by go run gen.go; DO NOT EDIT
+
+package publicsuffix
+
+var rules = [...]string{
+	"ao",
+	"ed.ao",
+	"gv.ao",
+	"og.ao",
+	"co.ao",
+	"pb.ao",
+	"it.ao",
+	"*.ar",
+	"!congresodelalengua3.ar",
+	"!educ.ar",
+	"!gobiernoelectronico.ar",
+	"!mecon.ar",
+	"!nacion.ar",
+	"!nic.ar",
+	"!promocion.ar",
+	"!retina.ar",
+	"!uba.ar",
+	"e164.arpa",
+	"in-addr.arpa",
+	"ip6.arpa",
+	"iris.arpa",
+	"uri.arpa",
+	"urn.arpa",
+	"jp",
+	"kyoto.jp",
+	"*.kobe.jp",
+	"!city.kobe.jp",
+	"ayabe.kyoto.jp",
+	"fukuchiyama.kyoto.jp",
+	"higashiyama.kyoto.jp",
+	"ide.kyoto.jp",
+	"ine.kyoto.jp",
+	"joyo.kyoto.jp",
+	"kameoka.kyoto.jp",
+	"kamo.kyoto.jp",
+	"kita.kyoto.jp",
+	"kizu.kyoto.jp",
+	"kumiyama.kyoto.jp",
+	"kyotamba.kyoto.jp",
+	"kyotanabe.kyoto.jp",
+	"kyotango.kyoto.jp",
+	"maizuru.kyoto.jp",
+	"minami.kyoto.jp",
+	"minamiyamashiro.kyoto.jp",
+	"miyazu.kyoto.jp",
+	"muko.kyoto.jp",
+	"nagaokakyo.kyoto.jp",
+	"nakagyo.kyoto.jp",
+	"nantan.kyoto.jp",
+	"oyamazaki.kyoto.jp",
+	"sakyo.kyoto.jp",
+	"seika.kyoto.jp",
+	"tanabe.kyoto.jp",
+	"uji.kyoto.jp",
+	"ujitawara.kyoto.jp",
+	"wazuka.kyoto.jp",
+	"yamashina.kyoto.jp",
+	"yawata.kyoto.jp",
+	"*.uk",
+	"*.sch.uk",
+	"!bl.uk",
+	"!british-library.uk",
+	"!jet.uk",
+	"!mod.uk",
+	"!national-library-scotland.uk",
+	"!nel.uk",
+	"!nic.uk",
+	"!nls.uk",
+	"!parliament.uk",
+	"*.zw",
+	"blogspot.co.uk",
+	"blogspot.com.ar",
+}
+
+var nodeLabels = [...]string{
+	"ao",
+	"ar",
+	"arpa",
+	"jp",
+	"uk",
+	"zw",
+	"co",
+	"ed",
+	"gv",
+	"it",
+	"og",
+	"pb",
+	"com",
+	"congresodelalengua3",
+	"educ",
+	"gobiernoelectronico",
+	"mecon",
+	"nacion",
+	"nic",
+	"promocion",
+	"retina",
+	"uba",
+	"blogspot",
+	"e164",
+	"in-addr",
+	"ip6",
+	"iris",
+	"uri",
+	"urn",
+	"kobe",
+	"kyoto",
+	"city",
+	"ayabe",
+	"fukuchiyama",
+	"higashiyama",
+	"ide",
+	"ine",
+	"joyo",
+	"kameoka",
+	"kamo",
+	"kita",
+	"kizu",
+	"kumiyama",
+	"kyotamba",
+	"kyotanabe",
+	"kyotango",
+	"maizuru",
+	"minami",
+	"minamiyamashiro",
+	"miyazu",
+	"muko",
+	"nagaokakyo",
+	"nakagyo",
+	"nantan",
+	"oyamazaki",
+	"sakyo",
+	"seika",
+	"tanabe",
+	"uji",
+	"ujitawara",
+	"wazuka",
+	"yamashina",
+	"yawata",
+	"bl",
+	"british-library",
+	"co",
+	"jet",
+	"mod",
+	"national-library-scotland",
+	"nel",
+	"nic",
+	"nls",
+	"parliament",
+	"sch",
+	"blogspot",
+}