|
|
@@ -12,14 +12,6 @@ package main
|
|
|
// go run gen.go -version "xxx" >table.go
|
|
|
// go run gen.go -version "xxx" -test >table_test.go
|
|
|
//
|
|
|
-// The first of those two will take around 20 minutes to complete, as the final
|
|
|
-// table is optimized for size. When testing the code generation workflow, pass
|
|
|
-// -crush=false to skip this optimization step, although the results of such a
|
|
|
-// run should not be committed, as the generated table can be around 50% larger
|
|
|
-// and, more importantly, require a larger number of scarce node table bits.
|
|
|
-// You may need to increase nodesBitsTextOffset or other constants to generate
|
|
|
-// a table with -crush=false.
|
|
|
-//
|
|
|
// Pass -v to print verbose progress information.
|
|
|
//
|
|
|
// The version is derived from information found at
|
|
|
@@ -110,7 +102,6 @@ var (
|
|
|
// letters are not allowed.
|
|
|
validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
|
|
|
|
|
|
- crush = flag.Bool("crush", true, "make the generated node text as small as possible")
|
|
|
subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
|
|
|
url = flag.String("url",
|
|
|
"https://publicsuffix.org/list/effective_tld_names.dat",
|
|
|
@@ -301,7 +292,7 @@ const numTLD = %d
|
|
|
childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
|
|
|
nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
|
|
|
|
|
|
- text := makeText()
|
|
|
+ text := combineText(labelsList)
|
|
|
if text == "" {
|
|
|
return fmt.Errorf("internal error: makeText returned no text")
|
|
|
}
|
|
|
@@ -329,7 +320,9 @@ const numTLD = %d
|
|
|
text = text[n:]
|
|
|
}
|
|
|
|
|
|
- n.walk(w, assignIndexes)
|
|
|
+ if err := n.walk(w, assignIndexes); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
|
|
|
fmt.Fprintf(w, `
|
|
|
|
|
|
@@ -544,34 +537,42 @@ func wildcardStr(wildcard bool) string {
|
|
|
return " "
|
|
|
}
|
|
|
|
|
|
-// makeText combines all the strings in labelsList to form one giant string.
|
|
|
-// If the crush flag is true, then overlapping strings will be merged: "arpa"
|
|
|
-// and "parliament" could yield "arparliament".
|
|
|
-func makeText() string {
|
|
|
- if !*crush {
|
|
|
- return strings.Join(labelsList, "")
|
|
|
- }
|
|
|
-
|
|
|
+// combineText combines all the strings in labelsList to form one giant string.
|
|
|
+// Overlapping strings will be merged: "arpa" and "parliament" could yield
|
|
|
+// "arparliament".
|
|
|
+func combineText(labelsList []string) string {
|
|
|
beforeLength := 0
|
|
|
for _, s := range labelsList {
|
|
|
beforeLength += len(s)
|
|
|
}
|
|
|
|
|
|
- // Make a copy of labelsList.
|
|
|
- ss := append(make([]string, 0, len(labelsList)), labelsList...)
|
|
|
+ text := crush(removeSubstrings(labelsList))
|
|
|
+ if *v {
|
|
|
+ fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
|
|
|
+ }
|
|
|
+ return text
|
|
|
+}
|
|
|
|
|
|
- // Remove strings that are substrings of other strings.
|
|
|
- for changed := true; changed; {
|
|
|
- changed = false
|
|
|
- for i, s := range ss {
|
|
|
- if s == "" {
|
|
|
- continue
|
|
|
- }
|
|
|
- for j, t := range ss {
|
|
|
- if i != j && t != "" && strings.Contains(s, t) {
|
|
|
- changed = true
|
|
|
- ss[j] = ""
|
|
|
- }
|
|
|
+type byLength []string
|
|
|
+
|
|
|
+func (s byLength) Len() int { return len(s) }
|
|
|
+func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
|
|
|
+func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
|
|
|
+
|
|
|
+// removeSubstrings returns a copy of its input with any strings removed
|
|
|
+// that are substrings of other provided strings.
|
|
|
+func removeSubstrings(input []string) []string {
|
|
|
+ // Make a copy of input.
|
|
|
+ ss := append(make([]string, 0, len(input)), input...)
|
|
|
+ sort.Sort(byLength(ss))
|
|
|
+
|
|
|
+ for i, shortString := range ss {
|
|
|
+ // For each string, only consider strings higher than it in sort order, i.e.
|
|
|
+ // of equal length or greater.
|
|
|
+ for _, longString := range ss[i+1:] {
|
|
|
+ if strings.Contains(longString, shortString) {
|
|
|
+ ss[i] = ""
|
|
|
+ break
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -581,46 +582,82 @@ func makeText() string {
|
|
|
for len(ss) > 0 && ss[0] == "" {
|
|
|
ss = ss[1:]
|
|
|
}
|
|
|
+ return ss
|
|
|
+}
|
|
|
|
|
|
- // Join strings where one suffix matches another prefix.
|
|
|
- for {
|
|
|
- // Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
|
|
|
- // maximizing overlap length k.
|
|
|
- besti := -1
|
|
|
- bestj := -1
|
|
|
- bestk := 0
|
|
|
+// crush combines a list of strings, taking advantage of overlaps. It returns a
|
|
|
+// single string that contains each input string as a substring.
|
|
|
+func crush(ss []string) string {
|
|
|
+ maxLabelLen := 0
|
|
|
+ for _, s := range ss {
|
|
|
+ if maxLabelLen < len(s) {
|
|
|
+ maxLabelLen = len(s)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
|
|
|
+ prefixes := makePrefixMap(ss, prefixLen)
|
|
|
for i, s := range ss {
|
|
|
- if s == "" {
|
|
|
+ if len(s) <= prefixLen {
|
|
|
continue
|
|
|
}
|
|
|
- for j, t := range ss {
|
|
|
- if i == j {
|
|
|
- continue
|
|
|
- }
|
|
|
- for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
|
|
|
- if s[len(s)-k:] == t[:k] {
|
|
|
- besti = i
|
|
|
- bestj = j
|
|
|
- bestk = k
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
+ mergeLabel(ss, i, prefixLen, prefixes)
|
|
|
}
|
|
|
- if bestk > 0 {
|
|
|
- if *v {
|
|
|
- fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
|
|
|
- bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
|
|
|
- }
|
|
|
- ss[besti] += ss[bestj][bestk:]
|
|
|
- ss[bestj] = ""
|
|
|
+ }
|
|
|
+
|
|
|
+ return strings.Join(ss, "")
|
|
|
+}
|
|
|
+
|
|
|
+// mergeLabel merges the label at ss[i] with the first available matching label
|
|
|
+// in prefixMap, where the last "prefixLen" characters in ss[i] match the first
|
|
|
+// "prefixLen" characters in the matching label.
|
|
|
+// It will merge ss[i] repeatedly until no more matches are available.
|
|
|
+// All matching labels merged into ss[i] are replaced by "".
|
|
|
+func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
|
|
|
+ s := ss[i]
|
|
|
+ suffix := s[len(s)-prefixLen:]
|
|
|
+ for _, j := range prefixes[suffix] {
|
|
|
+ // Empty strings mean "already used." Also avoid merging with self.
|
|
|
+ if ss[j] == "" || i == j {
|
|
|
continue
|
|
|
}
|
|
|
- break
|
|
|
+ if *v {
|
|
|
+ fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
|
|
|
+ prefixLen, i, j, ss[i], ss[j], suffix)
|
|
|
+ }
|
|
|
+ ss[i] += ss[j][prefixLen:]
|
|
|
+ ss[j] = ""
|
|
|
+ // ss[i] has a new suffix, so merge again if possible.
|
|
|
+ // Note: we only have to merge again at the same prefix length. Shorter
|
|
|
+ // prefix lengths will be handled in the next iteration of crush's for loop.
|
|
|
+ // Can there be matches for longer prefix lengths, introduced by the merge?
|
|
|
+ // I believe that any such matches would by necessity have been eliminated
|
|
|
+ // during substring removal or merged at a higher prefix length. For
|
|
|
+ // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
|
|
|
+ // would yield "abcde", which could be merged with "bcdef." However, in
|
|
|
+ // practice "cde" would already have been elimintated by removeSubstrings.
|
|
|
+ mergeLabel(ss, i, prefixLen, prefixes)
|
|
|
+ return
|
|
|
}
|
|
|
+}
|
|
|
|
|
|
- text := strings.Join(ss, "")
|
|
|
- if *v {
|
|
|
- fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
|
|
|
+// prefixMap maps from a prefix to a list of strings containing that prefix. The
|
|
|
+// list of strings is represented as indexes into a slice of strings stored
|
|
|
+// elsewhere.
|
|
|
+type prefixMap map[string][]int
|
|
|
+
|
|
|
+// makePrefixMap constructs a prefixMap from a slice of strings.
|
|
|
+func makePrefixMap(ss []string, prefixLen int) prefixMap {
|
|
|
+ prefixes := make(prefixMap)
|
|
|
+ for i, s := range ss {
|
|
|
+ // We use < rather than <= because if a label matches on a prefix equal to
|
|
|
+ // its full length, that's actually a substring match handled by
|
|
|
+ // removeSubstrings.
|
|
|
+ if prefixLen < len(s) {
|
|
|
+ prefix := s[:prefixLen]
|
|
|
+ prefixes[prefix] = append(prefixes[prefix], i)
|
|
|
+ }
|
|
|
}
|
|
|
- return text
|
|
|
+
|
|
|
+ return prefixes
|
|
|
}
|