Browse Source

publicsuffix: Make gen.go faster.

Previously this took 20 minutes, now it takes 2-4 seconds.

Changes:

 - Iterate over prefix length, from longest to shortest.
 - Build a map of prefixes and reuse it to avoid one loop.
 - When removing simple substrings, sort by length and only consider strings
   long enough to be worth checking. Saves 600ms out of 800ms.
 - Removed the option to generate uncrushed table.
 - Fix an unhandled error in n.walk(w, assignIndexes)

Change-Id: I321d2c2bd18f4918479500f3c61d5e59ee173f3d
Reviewed-on: https://go-review.googlesource.com/20029
Reviewed-by: Nigel Tao <nigeltao@golang.org>
Jacob Hoffman-Andrews 9 năm trước cách đây
mục cha
commit
08f168e593
1 tập tin đã thay đổi với 102 bổ sung65 xóa
  1. 102 65
      publicsuffix/gen.go

+ 102 - 65
publicsuffix/gen.go

@@ -12,14 +12,6 @@ package main
 //	go run gen.go -version "xxx"       >table.go
 //	go run gen.go -version "xxx" -test >table_test.go
 //
-// The first of those two will take around 20 minutes to complete, as the final
-// table is optimized for size. When testing the code generation workflow, pass
-// -crush=false to skip this optimization step, although the results of such a
-// run should not be committed, as the generated table can be around 50% larger
-// and, more importantly, require a larger number of scarce node table bits.
-// You may need to increase nodesBitsTextOffset or other constants to generate
-// a table with -crush=false.
-//
 // Pass -v to print verbose progress information.
 //
 // The version is derived from information found at
@@ -110,7 +102,6 @@ var (
 	// letters are not allowed.
 	validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
 
-	crush  = flag.Bool("crush", true, "make the generated node text as small as possible")
 	subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
 	url    = flag.String("url",
 		"https://publicsuffix.org/list/effective_tld_names.dat",
@@ -301,7 +292,7 @@ const numTLD = %d
 		childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
 		nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
 
-	text := makeText()
+	text := combineText(labelsList)
 	if text == "" {
 		return fmt.Errorf("internal error: makeText returned no text")
 	}
@@ -329,7 +320,9 @@ const numTLD = %d
 		text = text[n:]
 	}
 
-	n.walk(w, assignIndexes)
+	if err := n.walk(w, assignIndexes); err != nil {
+		return err
+	}
 
 	fmt.Fprintf(w, `
 
@@ -544,34 +537,42 @@ func wildcardStr(wildcard bool) string {
 	return " "
 }
 
-// makeText combines all the strings in labelsList to form one giant string.
-// If the crush flag is true, then overlapping strings will be merged: "arpa"
-// and "parliament" could yield "arparliament".
-func makeText() string {
-	if !*crush {
-		return strings.Join(labelsList, "")
-	}
-
+// combineText combines all the strings in labelsList to form one giant string.
+// Overlapping strings will be merged: "arpa" and "parliament" could yield
+// "arparliament".
+func combineText(labelsList []string) string {
 	beforeLength := 0
 	for _, s := range labelsList {
 		beforeLength += len(s)
 	}
 
-	// Make a copy of labelsList.
-	ss := append(make([]string, 0, len(labelsList)), labelsList...)
+	text := crush(removeSubstrings(labelsList))
+	if *v {
+		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
+	}
+	return text
+}
 
-	// Remove strings that are substrings of other strings.
-	for changed := true; changed; {
-		changed = false
-		for i, s := range ss {
-			if s == "" {
-				continue
-			}
-			for j, t := range ss {
-				if i != j && t != "" && strings.Contains(s, t) {
-					changed = true
-					ss[j] = ""
-				}
+type byLength []string
+
+func (s byLength) Len() int           { return len(s) }
+func (s byLength) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
+func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
+
+// removeSubstrings returns a copy of its input with any strings removed
+// that are substrings of other provided strings.
+func removeSubstrings(input []string) []string {
+	// Make a copy of input.
+	ss := append(make([]string, 0, len(input)), input...)
+	sort.Sort(byLength(ss))
+
+	for i, shortString := range ss {
+		// For each string, only consider strings higher than it in sort order, i.e.
+		// of equal length or greater.
+		for _, longString := range ss[i+1:] {
+			if strings.Contains(longString, shortString) {
+				ss[i] = ""
+				break
 			}
 		}
 	}
@@ -581,46 +582,82 @@ func makeText() string {
 	for len(ss) > 0 && ss[0] == "" {
 		ss = ss[1:]
 	}
+	return ss
+}
 
-	// Join strings where one suffix matches another prefix.
-	for {
-		// Find best i, j, k such that ss[i][len-k:] == ss[j][:k],
-		// maximizing overlap length k.
-		besti := -1
-		bestj := -1
-		bestk := 0
+// crush combines a list of strings, taking advantage of overlaps. It returns a
+// single string that contains each input string as a substring.
+func crush(ss []string) string {
+	maxLabelLen := 0
+	for _, s := range ss {
+		if maxLabelLen < len(s) {
+			maxLabelLen = len(s)
+		}
+	}
+
+	for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
+		prefixes := makePrefixMap(ss, prefixLen)
 		for i, s := range ss {
-			if s == "" {
+			if len(s) <= prefixLen {
 				continue
 			}
-			for j, t := range ss {
-				if i == j {
-					continue
-				}
-				for k := bestk + 1; k <= len(s) && k <= len(t); k++ {
-					if s[len(s)-k:] == t[:k] {
-						besti = i
-						bestj = j
-						bestk = k
-					}
-				}
-			}
+			mergeLabel(ss, i, prefixLen, prefixes)
 		}
-		if bestk > 0 {
-			if *v {
-				fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d) out of (%4d,%4d): %q and %q\n",
-					bestk, besti, bestj, len(ss), len(ss), ss[besti], ss[bestj])
-			}
-			ss[besti] += ss[bestj][bestk:]
-			ss[bestj] = ""
+	}
+
+	return strings.Join(ss, "")
+}
+
+// mergeLabel merges the label at ss[i] with the first available matching label
+// in prefixMap, where the last "prefixLen" characters in ss[i] match the first
+// "prefixLen" characters in the matching label.
+// It will merge ss[i] repeatedly until no more matches are available.
+// All matching labels merged into ss[i] are replaced by "".
+func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
+	s := ss[i]
+	suffix := s[len(s)-prefixLen:]
+	for _, j := range prefixes[suffix] {
+		// Empty strings mean "already used." Also avoid merging with self.
+		if ss[j] == "" || i == j {
 			continue
 		}
-		break
+		if *v {
+			fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
+				prefixLen, i, j, ss[i], ss[j], suffix)
+		}
+		ss[i] += ss[j][prefixLen:]
+		ss[j] = ""
+		// ss[i] has a new suffix, so merge again if possible.
+		// Note: we only have to merge again at the same prefix length. Shorter
+		// prefix lengths will be handled in the next iteration of crush's for loop.
+		// Can there be matches for longer prefix lengths, introduced by the merge?
+		// I believe that any such matches would by necessity have been eliminated
+		// during substring removal or merged at a higher prefix length. For
+		// instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
+		// would yield "abcde", which could be merged with "bcdef." However, in
+		// practice "cde" would already have been elimintated by removeSubstrings.
+		mergeLabel(ss, i, prefixLen, prefixes)
+		return
 	}
+}
 
-	text := strings.Join(ss, "")
-	if *v {
-		fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
+// prefixMap maps from a prefix to a list of strings containing that prefix. The
+// list of strings is represented as indexes into a slice of strings stored
+// elsewhere.
+type prefixMap map[string][]int
+
+// makePrefixMap constructs a prefixMap from a slice of strings.
+func makePrefixMap(ss []string, prefixLen int) prefixMap {
+	prefixes := make(prefixMap)
+	for i, s := range ss {
+		// We use < rather than <= because if a label matches on a prefix equal to
+		// its full length, that's actually a substring match handled by
+		// removeSubstrings.
+		if prefixLen < len(s) {
+			prefix := s[:prefixLen]
+			prefixes[prefix] = append(prefixes[prefix], i)
+		}
 	}
-	return text
+
+	return prefixes
 }