10 years ago · 08f168e593
--- a/publicsuffix/gen.go
+++ b/publicsuffix/gen.go
@@ -12,14 +12,6 @@ package main
 
				 //	go run gen.go -version "xxx"       >table.go
			
 
				 //	go run gen.go -version "xxx" -test >table_test.go
			
 
				 //
			
 
				-// The first of those two will take around 20 minutes to complete, as the final
			
 
				-// table is optimized for size. When testing the code generation workflow, pass
			
 
				-// -crush=false to skip this optimization step, although the results of such a
			
 
				-// run should not be committed, as the generated table can be around 50% larger
			
 
				-// and, more importantly, require a larger number of scarce node table bits.
			
 
				-// You may need to increase nodesBitsTextOffset or other constants to generate
			
 
				-// a table with -crush=false.
			
 
				-//
			
 
				 // Pass -v to print verbose progress information.
			
 
				 //
			
 
				 // The version is derived from information found at
			
@@ -110,7 +102,6 @@ var (
 
				 	// letters are not allowed.
			
 
				 	validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
			
 
				 
			
 
				-	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
			
 
				 	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
			
 
				 	url    = flag.String("url",
			
 
				 		"https://publicsuffix.org/list/effective_tld_names.dat",
			
@@ -301,7 +292,7 @@ const numTLD = %d
 
				 		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
			
 
				 		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
			
 
				 
			
 
				-	text := makeText()
			
 
				+	text := combineText(labelsList)
			
 
				 	if text == "" {
			
 
				 		return fmt.Errorf("internal error: makeText returned no text")
			
 
				 	}
			
@@ -329,7 +320,9 @@ const numTLD = %d
 
				 		text = text[n:]
			
 
				 	}
			
 
				 
			
 
				-	n.walk(w, assignIndexes)
			
 
				+	if err := n.walk(w, assignIndexes); err != nil {
			
 
				+		return err
			
 
				+	}
			
 
				 
			
 
				 	fmt.Fprintf(w, `
			
 
				 
			
@@ -544,34 +537,42 @@ func wildcardStr(wildcard bool) string {
 
				 	return " "
			
 
				 }
			
 
				 
			
 
				-// makeText combines all the strings in labelsList to form one giant string.
			
 
				-// If the crush flag is true, then overlapping strings will be merged: "arpa"
			
 
				-// and "parliament" could yield "arparliament".
			
 
				-func makeText() string {
			
 
				-	if !*crush {
			
 
				-		return strings.Join(labelsList, "")
			
 
				-	}
			
 
				-
			
 
				+// combineText combines all the strings in labelsList to form one giant string.
			
 
				+// Overlapping strings will be merged: "arpa" and "parliament" could yield
			
 
				+// "arparliament".
			
 
				+func combineText(labelsList []string) string {
			
 
				 	beforeLength := 0
			
 
				 	for _, s := range labelsList {
			
 
				 		beforeLength += len(s)
			
 
				 	}
			
 
				 
			
 
				-	// Make a copy of labelsList.
			
 
				-	ss := append(make([]string, 0, len(labelsList)), labelsList...)
			
 
				+	text := crush(removeSubstrings(labelsList))
			
 
				+	if *v {
			
 
				+		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
			
 
				+	}
			
 
				+	return text
			
 
				+}
			
 
				 
			
 
				-	// Remove strings that are substrings of other strings.
			
 
				-	for changed := true; changed; {
			
 
				-		changed = false
			
 
				-		for i, s := range ss {
			
 
				-			if s == "" {
			
 
				-				continue
			
 
				-			}
			
 
				-			for j, t := range ss {
			
 
				-				if i != j && t != "" && strings.Contains(s, t) {
			
 
				-					changed = true
			
 
				-					ss[j] = ""
			
 
				-				}
			
 
				+type byLength []string
			
 
				+
			
 
				+func (s byLength) Len() int           { return len(s) }
			
 
				+func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
			
 
				+func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
			
 
				+
			
 
				+// removeSubstrings returns a copy of its input with any strings removed
			
 
				+// that are substrings of other provided strings.
			
 
				+func removeSubstrings(input []string) []string {
			
 
				+	// Make a copy of input.
			
 
				+	ss := append(make([]string, 0, len(input)), input...)
			
 
				+	sort.Sort(byLength(ss))
			
 
				+
			
 
				+	for i, shortString := range ss {
			
 
				+		// For each string, only consider strings higher than it in sort order, i.e.
			
 
				+		// of equal length or greater.
			
 
				+		for _, longString := range ss[i+1:] {
			
 
				+			if strings.Contains(longString, shortString) {
			
 
				+				ss[i] = ""
			
 
				+				break
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -581,46 +582,82 @@ func makeText() string {
 
				 	for len(ss) > 0 && ss[0] == "" {
			
 
				 		ss = ss[1:]
			
 
				 	}
			
 
				+	return ss
			
 
				+}
			
 
				 
			
 
				-	// Join strings where one suffix matches another prefix.
			
 
				-	for {
			
 
				-		// Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
			
 
				-		// maximizing overlap length k.
			
 
				-		besti := -1
			
 
				-		bestj := -1
			
 
				-		bestk := 0
			
 
				+// crush combines a list of strings, taking advantage of overlaps. It returns a
			
 
				+// single string that contains each input string as a substring.
			
 
				+func crush(ss []string) string {
			
 
				+	maxLabelLen := 0
			
 
				+	for _, s := range ss {
			
 
				+		if maxLabelLen < len(s) {
			
 
				+			maxLabelLen = len(s)
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
			
 
				+		prefixes := makePrefixMap(ss, prefixLen)
			
 
				 		for i, s := range ss {
			
 
				-			if s == "" {
			
 
				+			if len(s) <= prefixLen {
			
 
				 				continue
			
 
				 			}
			
 
				-			for j, t := range ss {
			
 
				-				if i == j {
			
 
				-					continue
			
 
				-				}
			
 
				-				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
			
 
				-					if s[len(s)-k:] == t[:k] {
			
 
				-						besti = i
			
 
				-						bestj = j
			
 
				-						bestk = k
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				+			mergeLabel(ss, i, prefixLen, prefixes)
			
 
				 		}
			
 
				-		if bestk > 0 {
			
 
				-			if *v {
			
 
				-				fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
			
 
				-					bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
			
 
				-			}
			
 
				-			ss[besti] += ss[bestj][bestk:]
			
 
				-			ss[bestj] = ""
			
 
				+	}
			
 
				+
			
 
				+	return strings.Join(ss, "")
			
 
				+}
			
 
				+
			
 
				+// mergeLabel merges the label at ss[i] with the first available matching label
			
 
				+// in prefixMap, where the last "prefixLen" characters in ss[i] match the first
			
 
				+// "prefixLen" characters in the matching label.
			
 
				+// It will merge ss[i] repeatedly until no more matches are available.
			
 
				+// All matching labels merged into ss[i] are replaced by "".
			
 
				+func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
			
 
				+	s := ss[i]
			
 
				+	suffix := s[len(s)-prefixLen:]
			
 
				+	for _, j := range prefixes[suffix] {
			
 
				+		// Empty strings mean "already used." Also avoid merging with self.
			
 
				+		if ss[j] == "" || i == j {
			
 
				 			continue
			
 
				 		}
			
 
				-		break
			
 
				+		if *v {
			
 
				+			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
			
 
				+				prefixLen, i, j, ss[i], ss[j], suffix)
			
 
				+		}
			
 
				+		ss[i] += ss[j][prefixLen:]
			
 
				+		ss[j] = ""
			
 
				+		// ss[i] has a new suffix, so merge again if possible.
			
 
				+		// Note: we only have to merge again at the same prefix length. Shorter
			
 
				+		// prefix lengths will be handled in the next iteration of crush's for loop.
			
 
				+		// Can there be matches for longer prefix lengths, introduced by the merge?
			
 
				+		// I believe that any such matches would by necessity have been eliminated
			
 
				+		// during substring removal or merged at a higher prefix length. For
			
 
				+		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
			
 
				+		// would yield "abcde", which could be merged with "bcdef." However, in
			
 
				+		// practice "cde" would already have been elimintated by removeSubstrings.
			
 
				+		mergeLabel(ss, i, prefixLen, prefixes)
			
 
				+		return
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	text := strings.Join(ss, "")
			
 
				-	if *v {
			
 
				-		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
			
 
				+// prefixMap maps from a prefix to a list of strings containing that prefix. The
			
 
				+// list of strings is represented as indexes into a slice of strings stored
			
 
				+// elsewhere.
			
 
				+type prefixMap map[string][]int
			
 
				+
			
 
				+// makePrefixMap constructs a prefixMap from a slice of strings.
			
 
				+func makePrefixMap(ss []string, prefixLen int) prefixMap {
			
 
				+	prefixes := make(prefixMap)
			
 
				+	for i, s := range ss {
			
 
				+		// We use < rather than <= because if a label matches on a prefix equal to
			
 
				+		// its full length, that's actually a substring match handled by
			
 
				+		// removeSubstrings.
			
 
				+		if prefixLen < len(s) {
			
 
				+			prefix := s[:prefixLen]
			
 
				+			prefixes[prefix] = append(prefixes[prefix], i)
			
 
				+		}
			
 
				 	}
			
 
				-	return text
			
 
				+
			
 
				+	return prefixes
			
 
				 }