trie.go 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. package stringx
  2. import "github.com/tal-tech/go-zero/core/lang"
  3. const defaultMask = '*'
  4. type (
  5. // TrieOption defines the method to customize a Trie.
  6. TrieOption func(trie *trieNode)
  7. // A Trie is a tree implementation that used to find elements rapidly.
  8. Trie interface {
  9. Filter(text string) (string, []string, bool)
  10. FindKeywords(text string) []string
  11. }
  12. trieNode struct {
  13. node
  14. mask rune
  15. }
  16. scope struct {
  17. start int
  18. stop int
  19. }
  20. )
  21. // NewTrie returns a Trie.
  22. func NewTrie(words []string, opts ...TrieOption) Trie {
  23. n := new(trieNode)
  24. for _, opt := range opts {
  25. opt(n)
  26. }
  27. if n.mask == 0 {
  28. n.mask = defaultMask
  29. }
  30. for _, word := range words {
  31. n.add(word)
  32. }
  33. return n
  34. }
  35. func (n *trieNode) Filter(text string) (sentence string, keywords []string, found bool) {
  36. chars := []rune(text)
  37. if len(chars) == 0 {
  38. return text, nil, false
  39. }
  40. scopes := n.findKeywordScopes(chars)
  41. keywords = n.collectKeywords(chars, scopes)
  42. for _, match := range scopes {
  43. // we don't care about overlaps, not bringing a performance improvement
  44. n.replaceWithAsterisk(chars, match.start, match.stop)
  45. }
  46. return string(chars), keywords, len(keywords) > 0
  47. }
  48. func (n *trieNode) FindKeywords(text string) []string {
  49. chars := []rune(text)
  50. if len(chars) == 0 {
  51. return nil
  52. }
  53. scopes := n.findKeywordScopes(chars)
  54. return n.collectKeywords(chars, scopes)
  55. }
  56. func (n *trieNode) collectKeywords(chars []rune, scopes []scope) []string {
  57. set := make(map[string]lang.PlaceholderType)
  58. for _, v := range scopes {
  59. set[string(chars[v.start:v.stop])] = lang.Placeholder
  60. }
  61. var i int
  62. keywords := make([]string, len(set))
  63. for k := range set {
  64. keywords[i] = k
  65. i++
  66. }
  67. return keywords
  68. }
  69. func (n *trieNode) findKeywordScopes(chars []rune) []scope {
  70. var scopes []scope
  71. size := len(chars)
  72. start := -1
  73. for i := 0; i < size; i++ {
  74. child, ok := n.children[chars[i]]
  75. if !ok {
  76. continue
  77. }
  78. if start < 0 {
  79. start = i
  80. }
  81. if child.end {
  82. scopes = append(scopes, scope{
  83. start: start,
  84. stop: i + 1,
  85. })
  86. }
  87. for j := i + 1; j < size; j++ {
  88. grandchild, ok := child.children[chars[j]]
  89. if !ok {
  90. break
  91. }
  92. child = grandchild
  93. if child.end {
  94. scopes = append(scopes, scope{
  95. start: start,
  96. stop: j + 1,
  97. })
  98. }
  99. }
  100. start = -1
  101. }
  102. return scopes
  103. }
  104. func (n *trieNode) replaceWithAsterisk(chars []rune, start, stop int) {
  105. for i := start; i < stop; i++ {
  106. chars[i] = n.mask
  107. }
  108. }
  109. // WithMask customizes a Trie with keywords masked as given mask char.
  110. func WithMask(mask rune) TrieOption {
  111. return func(n *trieNode) {
  112. n.mask = mask
  113. }
  114. }