gen.go 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // +build ignore
  5. package main
  6. // This program generates table.go and table_test.go.
  7. // Invoke as:
  8. //
  9. // go run gen.go -version "xxx" >table.go
  10. // go run gen.go -version "xxx" -test >table_test.go
  11. //
  12. // Pass -v to print verbose progress information.
  13. //
  14. // The version is derived from information found at
  15. // https://github.com/publicsuffix/list/commits/master/public_suffix_list.dat
  16. //
  17. // To fetch a particular git revision, such as 5c70ccd250, pass
  18. // -url "https://raw.githubusercontent.com/publicsuffix/list/5c70ccd250/public_suffix_list.dat"
  19. import (
  20. "bufio"
  21. "bytes"
  22. "flag"
  23. "fmt"
  24. "go/format"
  25. "io"
  26. "net/http"
  27. "os"
  28. "regexp"
  29. "sort"
  30. "strings"
  31. "golang.org/x/net/idna"
  32. )
  33. const (
  34. // These sum of these four values must be no greater than 32.
  35. nodesBitsChildren = 9
  36. nodesBitsICANN = 1
  37. nodesBitsTextOffset = 15
  38. nodesBitsTextLength = 6
  39. // These sum of these four values must be no greater than 32.
  40. childrenBitsWildcard = 1
  41. childrenBitsNodeType = 2
  42. childrenBitsHi = 14
  43. childrenBitsLo = 14
  44. )
  45. var (
  46. maxChildren int
  47. maxTextOffset int
  48. maxTextLength int
  49. maxHi uint32
  50. maxLo uint32
  51. )
  52. func max(a, b int) int {
  53. if a < b {
  54. return b
  55. }
  56. return a
  57. }
  58. func u32max(a, b uint32) uint32 {
  59. if a < b {
  60. return b
  61. }
  62. return a
  63. }
  64. const (
  65. nodeTypeNormal = 0
  66. nodeTypeException = 1
  67. nodeTypeParentOnly = 2
  68. numNodeType = 3
  69. )
  70. func nodeTypeStr(n int) string {
  71. switch n {
  72. case nodeTypeNormal:
  73. return "+"
  74. case nodeTypeException:
  75. return "!"
  76. case nodeTypeParentOnly:
  77. return "o"
  78. }
  79. panic("unreachable")
  80. }
  81. var (
  82. labelEncoding = map[string]uint32{}
  83. labelsList = []string{}
  84. labelsMap = map[string]bool{}
  85. rules = []string{}
  86. // validSuffix is used to check that the entries in the public suffix list
  87. // are in canonical form (after Punycode encoding). Specifically, capital
  88. // letters are not allowed.
  89. validSuffix = regexp.MustCompile(`^[a-z0-9_\!\*\-\.]+$`)
  90. subset = flag.Bool("subset", false, "generate only a subset of the full table, for debugging")
  91. url = flag.String("url",
  92. "https://publicsuffix.org/list/effective_tld_names.dat",
  93. "URL of the publicsuffix.org list. If empty, stdin is read instead")
  94. v = flag.Bool("v", false, "verbose output (to stderr)")
  95. version = flag.String("version", "", "the effective_tld_names.dat version")
  96. test = flag.Bool("test", false, "generate table_test.go")
  97. )
  98. func main() {
  99. if err := main1(); err != nil {
  100. fmt.Fprintln(os.Stderr, err)
  101. os.Exit(1)
  102. }
  103. }
  104. func main1() error {
  105. flag.Parse()
  106. if nodesBitsTextLength+nodesBitsTextOffset+nodesBitsICANN+nodesBitsChildren > 32 {
  107. return fmt.Errorf("not enough bits to encode the nodes table")
  108. }
  109. if childrenBitsLo+childrenBitsHi+childrenBitsNodeType+childrenBitsWildcard > 32 {
  110. return fmt.Errorf("not enough bits to encode the children table")
  111. }
  112. if *version == "" {
  113. return fmt.Errorf("-version was not specified")
  114. }
  115. var r io.Reader = os.Stdin
  116. if *url != "" {
  117. res, err := http.Get(*url)
  118. if err != nil {
  119. return err
  120. }
  121. if res.StatusCode != http.StatusOK {
  122. return fmt.Errorf("bad GET status for %s: %d", *url, res.Status)
  123. }
  124. r = res.Body
  125. defer res.Body.Close()
  126. }
  127. var root node
  128. icann := false
  129. buf := new(bytes.Buffer)
  130. br := bufio.NewReader(r)
  131. for {
  132. s, err := br.ReadString('\n')
  133. if err != nil {
  134. if err == io.EOF {
  135. break
  136. }
  137. return err
  138. }
  139. s = strings.TrimSpace(s)
  140. if strings.Contains(s, "BEGIN ICANN DOMAINS") {
  141. icann = true
  142. continue
  143. }
  144. if strings.Contains(s, "END ICANN DOMAINS") {
  145. icann = false
  146. continue
  147. }
  148. if s == "" || strings.HasPrefix(s, "//") {
  149. continue
  150. }
  151. s, err = idna.ToASCII(s)
  152. if err != nil {
  153. return err
  154. }
  155. if !validSuffix.MatchString(s) {
  156. return fmt.Errorf("bad publicsuffix.org list data: %q", s)
  157. }
  158. if *subset {
  159. switch {
  160. case s == "ac.jp" || strings.HasSuffix(s, ".ac.jp"):
  161. case s == "ak.us" || strings.HasSuffix(s, ".ak.us"):
  162. case s == "ao" || strings.HasSuffix(s, ".ao"):
  163. case s == "ar" || strings.HasSuffix(s, ".ar"):
  164. case s == "arpa" || strings.HasSuffix(s, ".arpa"):
  165. case s == "cy" || strings.HasSuffix(s, ".cy"):
  166. case s == "dyndns.org" || strings.HasSuffix(s, ".dyndns.org"):
  167. case s == "jp":
  168. case s == "kobe.jp" || strings.HasSuffix(s, ".kobe.jp"):
  169. case s == "kyoto.jp" || strings.HasSuffix(s, ".kyoto.jp"):
  170. case s == "om" || strings.HasSuffix(s, ".om"):
  171. case s == "uk" || strings.HasSuffix(s, ".uk"):
  172. case s == "uk.com" || strings.HasSuffix(s, ".uk.com"):
  173. case s == "tw" || strings.HasSuffix(s, ".tw"):
  174. case s == "zw" || strings.HasSuffix(s, ".zw"):
  175. case s == "xn--p1ai" || strings.HasSuffix(s, ".xn--p1ai"):
  176. // xn--p1ai is Russian-Cyrillic "рф".
  177. default:
  178. continue
  179. }
  180. }
  181. rules = append(rules, s)
  182. nt, wildcard := nodeTypeNormal, false
  183. switch {
  184. case strings.HasPrefix(s, "*."):
  185. s, nt = s[2:], nodeTypeParentOnly
  186. wildcard = true
  187. case strings.HasPrefix(s, "!"):
  188. s, nt = s[1:], nodeTypeException
  189. }
  190. labels := strings.Split(s, ".")
  191. for n, i := &root, len(labels)-1; i >= 0; i-- {
  192. label := labels[i]
  193. n = n.child(label)
  194. if i == 0 {
  195. if nt != nodeTypeParentOnly && n.nodeType == nodeTypeParentOnly {
  196. n.nodeType = nt
  197. }
  198. n.icann = n.icann && icann
  199. n.wildcard = n.wildcard || wildcard
  200. }
  201. labelsMap[label] = true
  202. }
  203. }
  204. labelsList = make([]string, 0, len(labelsMap))
  205. for label := range labelsMap {
  206. labelsList = append(labelsList, label)
  207. }
  208. sort.Strings(labelsList)
  209. p := printReal
  210. if *test {
  211. p = printTest
  212. }
  213. if err := p(buf, &root); err != nil {
  214. return err
  215. }
  216. b, err := format.Source(buf.Bytes())
  217. if err != nil {
  218. return err
  219. }
  220. _, err = os.Stdout.Write(b)
  221. return err
  222. }
  223. func printTest(w io.Writer, n *node) error {
  224. fmt.Fprintf(w, "// generated by go run gen.go; DO NOT EDIT\n\n")
  225. fmt.Fprintf(w, "package publicsuffix\n\nvar rules = [...]string{\n")
  226. for _, rule := range rules {
  227. fmt.Fprintf(w, "%q,\n", rule)
  228. }
  229. fmt.Fprintf(w, "}\n\nvar nodeLabels = [...]string{\n")
  230. if err := n.walk(w, printNodeLabel); err != nil {
  231. return err
  232. }
  233. fmt.Fprintf(w, "}\n")
  234. return nil
  235. }
  236. func printReal(w io.Writer, n *node) error {
  237. const header = `// generated by go run gen.go; DO NOT EDIT
  238. package publicsuffix
  239. const version = %q
  240. const (
  241. nodesBitsChildren = %d
  242. nodesBitsICANN = %d
  243. nodesBitsTextOffset = %d
  244. nodesBitsTextLength = %d
  245. childrenBitsWildcard = %d
  246. childrenBitsNodeType = %d
  247. childrenBitsHi = %d
  248. childrenBitsLo = %d
  249. )
  250. const (
  251. nodeTypeNormal = %d
  252. nodeTypeException = %d
  253. nodeTypeParentOnly = %d
  254. )
  255. // numTLD is the number of top level domains.
  256. const numTLD = %d
  257. `
  258. fmt.Fprintf(w, header, *version,
  259. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength,
  260. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo,
  261. nodeTypeNormal, nodeTypeException, nodeTypeParentOnly, len(n.children))
  262. text := combineText(labelsList)
  263. if text == "" {
  264. return fmt.Errorf("internal error: makeText returned no text")
  265. }
  266. for _, label := range labelsList {
  267. offset, length := strings.Index(text, label), len(label)
  268. if offset < 0 {
  269. return fmt.Errorf("internal error: could not find %q in text %q", label, text)
  270. }
  271. maxTextOffset, maxTextLength = max(maxTextOffset, offset), max(maxTextLength, length)
  272. if offset >= 1<<nodesBitsTextOffset {
  273. return fmt.Errorf("text offset %d is too large, or nodeBitsTextOffset is too small", offset)
  274. }
  275. if length >= 1<<nodesBitsTextLength {
  276. return fmt.Errorf("text length %d is too large, or nodeBitsTextLength is too small", length)
  277. }
  278. labelEncoding[label] = uint32(offset)<<nodesBitsTextLength | uint32(length)
  279. }
  280. fmt.Fprintf(w, "// Text is the combined text of all labels.\nconst text = ")
  281. for len(text) > 0 {
  282. n, plus := len(text), ""
  283. if n > 64 {
  284. n, plus = 64, " +"
  285. }
  286. fmt.Fprintf(w, "%q%s\n", text[:n], plus)
  287. text = text[n:]
  288. }
  289. if err := n.walk(w, assignIndexes); err != nil {
  290. return err
  291. }
  292. fmt.Fprintf(w, `
  293. // nodes is the list of nodes. Each node is represented as a uint32, which
  294. // encodes the node's children, wildcard bit and node type (as an index into
  295. // the children array), ICANN bit and text.
  296. //
  297. // In the //-comment after each node's data, the nodes indexes of the children
  298. // are formatted as (n0x1234-n0x1256), with * denoting the wildcard bit. The
  299. // nodeType is printed as + for normal, ! for exception, and o for parent-only
  300. // nodes that have children but don't match a domain label in their own right.
  301. // An I denotes an ICANN domain.
  302. //
  303. // The layout within the uint32, from MSB to LSB, is:
  304. // [%2d bits] unused
  305. // [%2d bits] children index
  306. // [%2d bits] ICANN bit
  307. // [%2d bits] text index
  308. // [%2d bits] text length
  309. var nodes = [...]uint32{
  310. `,
  311. 32-nodesBitsChildren-nodesBitsICANN-nodesBitsTextOffset-nodesBitsTextLength,
  312. nodesBitsChildren, nodesBitsICANN, nodesBitsTextOffset, nodesBitsTextLength)
  313. if err := n.walk(w, printNode); err != nil {
  314. return err
  315. }
  316. fmt.Fprintf(w, `}
  317. // children is the list of nodes' children, the parent's wildcard bit and the
  318. // parent's node type. If a node has no children then their children index
  319. // will be in the range [0, 6), depending on the wildcard bit and node type.
  320. //
  321. // The layout within the uint32, from MSB to LSB, is:
  322. // [%2d bits] unused
  323. // [%2d bits] wildcard bit
  324. // [%2d bits] node type
  325. // [%2d bits] high nodes index (exclusive) of children
  326. // [%2d bits] low nodes index (inclusive) of children
  327. var children=[...]uint32{
  328. `,
  329. 32-childrenBitsWildcard-childrenBitsNodeType-childrenBitsHi-childrenBitsLo,
  330. childrenBitsWildcard, childrenBitsNodeType, childrenBitsHi, childrenBitsLo)
  331. for i, c := range childrenEncoding {
  332. s := "---------------"
  333. lo := c & (1<<childrenBitsLo - 1)
  334. hi := (c >> childrenBitsLo) & (1<<childrenBitsHi - 1)
  335. if lo != hi {
  336. s = fmt.Sprintf("n0x%04x-n0x%04x", lo, hi)
  337. }
  338. nodeType := int(c>>(childrenBitsLo+childrenBitsHi)) & (1<<childrenBitsNodeType - 1)
  339. wildcard := c>>(childrenBitsLo+childrenBitsHi+childrenBitsNodeType) != 0
  340. fmt.Fprintf(w, "0x%08x, // c0x%04x (%s)%s %s\n",
  341. c, i, s, wildcardStr(wildcard), nodeTypeStr(nodeType))
  342. }
  343. fmt.Fprintf(w, "}\n\n")
  344. fmt.Fprintf(w, "// max children %d (capacity %d)\n", maxChildren, 1<<nodesBitsChildren-1)
  345. fmt.Fprintf(w, "// max text offset %d (capacity %d)\n", maxTextOffset, 1<<nodesBitsTextOffset-1)
  346. fmt.Fprintf(w, "// max text length %d (capacity %d)\n", maxTextLength, 1<<nodesBitsTextLength-1)
  347. fmt.Fprintf(w, "// max hi %d (capacity %d)\n", maxHi, 1<<childrenBitsHi-1)
  348. fmt.Fprintf(w, "// max lo %d (capacity %d)\n", maxLo, 1<<childrenBitsLo-1)
  349. return nil
  350. }
  351. type node struct {
  352. label string
  353. nodeType int
  354. icann bool
  355. wildcard bool
  356. // nodesIndex and childrenIndex are the index of this node in the nodes
  357. // and the index of its children offset/length in the children arrays.
  358. nodesIndex, childrenIndex int
  359. // firstChild is the index of this node's first child, or zero if this
  360. // node has no children.
  361. firstChild int
  362. // children are the node's children, in strictly increasing node label order.
  363. children []*node
  364. }
  365. func (n *node) walk(w io.Writer, f func(w1 io.Writer, n1 *node) error) error {
  366. if err := f(w, n); err != nil {
  367. return err
  368. }
  369. for _, c := range n.children {
  370. if err := c.walk(w, f); err != nil {
  371. return err
  372. }
  373. }
  374. return nil
  375. }
  376. // child returns the child of n with the given label. The child is created if
  377. // it did not exist beforehand.
  378. func (n *node) child(label string) *node {
  379. for _, c := range n.children {
  380. if c.label == label {
  381. return c
  382. }
  383. }
  384. c := &node{
  385. label: label,
  386. nodeType: nodeTypeParentOnly,
  387. icann: true,
  388. }
  389. n.children = append(n.children, c)
  390. sort.Sort(byLabel(n.children))
  391. return c
  392. }
  393. type byLabel []*node
  394. func (b byLabel) Len() int { return len(b) }
  395. func (b byLabel) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
  396. func (b byLabel) Less(i, j int) bool { return b[i].label < b[j].label }
  397. var nextNodesIndex int
  398. // childrenEncoding are the encoded entries in the generated children array.
  399. // All these pre-defined entries have no children.
  400. var childrenEncoding = []uint32{
  401. 0 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeNormal.
  402. 1 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeException.
  403. 2 << (childrenBitsLo + childrenBitsHi), // Without wildcard bit, nodeTypeParentOnly.
  404. 4 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeNormal.
  405. 5 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeException.
  406. 6 << (childrenBitsLo + childrenBitsHi), // With wildcard bit, nodeTypeParentOnly.
  407. }
  408. var firstCallToAssignIndexes = true
  409. func assignIndexes(w io.Writer, n *node) error {
  410. if len(n.children) != 0 {
  411. // Assign nodesIndex.
  412. n.firstChild = nextNodesIndex
  413. for _, c := range n.children {
  414. c.nodesIndex = nextNodesIndex
  415. nextNodesIndex++
  416. }
  417. // The root node's children is implicit.
  418. if firstCallToAssignIndexes {
  419. firstCallToAssignIndexes = false
  420. return nil
  421. }
  422. // Assign childrenIndex.
  423. maxChildren = max(maxChildren, len(childrenEncoding))
  424. if len(childrenEncoding) >= 1<<nodesBitsChildren {
  425. return fmt.Errorf("children table size %d is too large, or nodeBitsChildren is too small", len(childrenEncoding))
  426. }
  427. n.childrenIndex = len(childrenEncoding)
  428. lo := uint32(n.firstChild)
  429. hi := lo + uint32(len(n.children))
  430. maxLo, maxHi = u32max(maxLo, lo), u32max(maxHi, hi)
  431. if lo >= 1<<childrenBitsLo {
  432. return fmt.Errorf("children lo %d is too large, or childrenBitsLo is too small", lo)
  433. }
  434. if hi >= 1<<childrenBitsHi {
  435. return fmt.Errorf("children hi %d is too large, or childrenBitsHi is too small", hi)
  436. }
  437. enc := hi<<childrenBitsLo | lo
  438. enc |= uint32(n.nodeType) << (childrenBitsLo + childrenBitsHi)
  439. if n.wildcard {
  440. enc |= 1 << (childrenBitsLo + childrenBitsHi + childrenBitsNodeType)
  441. }
  442. childrenEncoding = append(childrenEncoding, enc)
  443. } else {
  444. n.childrenIndex = n.nodeType
  445. if n.wildcard {
  446. n.childrenIndex += numNodeType
  447. }
  448. }
  449. return nil
  450. }
  451. func printNode(w io.Writer, n *node) error {
  452. for _, c := range n.children {
  453. s := "---------------"
  454. if len(c.children) != 0 {
  455. s = fmt.Sprintf("n0x%04x-n0x%04x", c.firstChild, c.firstChild+len(c.children))
  456. }
  457. encoding := labelEncoding[c.label]
  458. if c.icann {
  459. encoding |= 1 << (nodesBitsTextLength + nodesBitsTextOffset)
  460. }
  461. encoding |= uint32(c.childrenIndex) << (nodesBitsTextLength + nodesBitsTextOffset + nodesBitsICANN)
  462. fmt.Fprintf(w, "0x%08x, // n0x%04x c0x%04x (%s)%s %s %s %s\n",
  463. encoding, c.nodesIndex, c.childrenIndex, s, wildcardStr(c.wildcard),
  464. nodeTypeStr(c.nodeType), icannStr(c.icann), c.label,
  465. )
  466. }
  467. return nil
  468. }
  469. func printNodeLabel(w io.Writer, n *node) error {
  470. for _, c := range n.children {
  471. fmt.Fprintf(w, "%q,\n", c.label)
  472. }
  473. return nil
  474. }
  475. func icannStr(icann bool) string {
  476. if icann {
  477. return "I"
  478. }
  479. return " "
  480. }
  481. func wildcardStr(wildcard bool) string {
  482. if wildcard {
  483. return "*"
  484. }
  485. return " "
  486. }
  487. // combineText combines all the strings in labelsList to form one giant string.
  488. // Overlapping strings will be merged: "arpa" and "parliament" could yield
  489. // "arparliament".
  490. func combineText(labelsList []string) string {
  491. beforeLength := 0
  492. for _, s := range labelsList {
  493. beforeLength += len(s)
  494. }
  495. text := crush(removeSubstrings(labelsList))
  496. if *v {
  497. fmt.Fprintf(os.Stderr, "crushed %d bytes to become %d bytes\n", beforeLength, len(text))
  498. }
  499. return text
  500. }
  501. type byLength []string
  502. func (s byLength) Len() int { return len(s) }
  503. func (s byLength) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  504. func (s byLength) Less(i, j int) bool { return len(s[i]) < len(s[j]) }
  505. // removeSubstrings returns a copy of its input with any strings removed
  506. // that are substrings of other provided strings.
  507. func removeSubstrings(input []string) []string {
  508. // Make a copy of input.
  509. ss := append(make([]string, 0, len(input)), input...)
  510. sort.Sort(byLength(ss))
  511. for i, shortString := range ss {
  512. // For each string, only consider strings higher than it in sort order, i.e.
  513. // of equal length or greater.
  514. for _, longString := range ss[i+1:] {
  515. if strings.Contains(longString, shortString) {
  516. ss[i] = ""
  517. break
  518. }
  519. }
  520. }
  521. // Remove the empty strings.
  522. sort.Strings(ss)
  523. for len(ss) > 0 && ss[0] == "" {
  524. ss = ss[1:]
  525. }
  526. return ss
  527. }
  528. // crush combines a list of strings, taking advantage of overlaps. It returns a
  529. // single string that contains each input string as a substring.
  530. func crush(ss []string) string {
  531. maxLabelLen := 0
  532. for _, s := range ss {
  533. if maxLabelLen < len(s) {
  534. maxLabelLen = len(s)
  535. }
  536. }
  537. for prefixLen := maxLabelLen; prefixLen > 0; prefixLen-- {
  538. prefixes := makePrefixMap(ss, prefixLen)
  539. for i, s := range ss {
  540. if len(s) <= prefixLen {
  541. continue
  542. }
  543. mergeLabel(ss, i, prefixLen, prefixes)
  544. }
  545. }
  546. return strings.Join(ss, "")
  547. }
  548. // mergeLabel merges the label at ss[i] with the first available matching label
  549. // in prefixMap, where the last "prefixLen" characters in ss[i] match the first
  550. // "prefixLen" characters in the matching label.
  551. // It will merge ss[i] repeatedly until no more matches are available.
  552. // All matching labels merged into ss[i] are replaced by "".
  553. func mergeLabel(ss []string, i, prefixLen int, prefixes prefixMap) {
  554. s := ss[i]
  555. suffix := s[len(s)-prefixLen:]
  556. for _, j := range prefixes[suffix] {
  557. // Empty strings mean "already used." Also avoid merging with self.
  558. if ss[j] == "" || i == j {
  559. continue
  560. }
  561. if *v {
  562. fmt.Fprintf(os.Stderr, "%d-length overlap at (%4d,%4d): %q and %q share %q\n",
  563. prefixLen, i, j, ss[i], ss[j], suffix)
  564. }
  565. ss[i] += ss[j][prefixLen:]
  566. ss[j] = ""
  567. // ss[i] has a new suffix, so merge again if possible.
  568. // Note: we only have to merge again at the same prefix length. Shorter
  569. // prefix lengths will be handled in the next iteration of crush's for loop.
  570. // Can there be matches for longer prefix lengths, introduced by the merge?
  571. // I believe that any such matches would by necessity have been eliminated
  572. // during substring removal or merged at a higher prefix length. For
  573. // instance, in crush("abc", "cde", "bcdef"), combining "abc" and "cde"
  574. // would yield "abcde", which could be merged with "bcdef." However, in
  575. // practice "cde" would already have been elimintated by removeSubstrings.
  576. mergeLabel(ss, i, prefixLen, prefixes)
  577. return
  578. }
  579. }
  580. // prefixMap maps from a prefix to a list of strings containing that prefix. The
  581. // list of strings is represented as indexes into a slice of strings stored
  582. // elsewhere.
  583. type prefixMap map[string][]int
  584. // makePrefixMap constructs a prefixMap from a slice of strings.
  585. func makePrefixMap(ss []string, prefixLen int) prefixMap {
  586. prefixes := make(prefixMap)
  587. for i, s := range ss {
  588. // We use < rather than <= because if a label matches on a prefix equal to
  589. // its full length, that's actually a substring match handled by
  590. // removeSubstrings.
  591. if prefixLen < len(s) {
  592. prefix := s[:prefixLen]
  593. prefixes[prefix] = append(prefixes[prefix], i)
  594. }
  595. }
  596. return prefixes
  597. }