trie.go 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. package stringx
  2. import "github.com/tal-tech/go-zero/core/lang"
  3. const defaultMask = '*'
  4. type (
  5. TrieOption func(trie *trieNode)
  6. Trie interface {
  7. Filter(text string) (string, []string, bool)
  8. FindKeywords(text string) []string
  9. }
  10. trieNode struct {
  11. node
  12. mask rune
  13. }
  14. scope struct {
  15. start int
  16. stop int
  17. }
  18. )
  19. func NewTrie(words []string, opts ...TrieOption) Trie {
  20. n := new(trieNode)
  21. for _, opt := range opts {
  22. opt(n)
  23. }
  24. if n.mask == 0 {
  25. n.mask = defaultMask
  26. }
  27. for _, word := range words {
  28. n.add(word)
  29. }
  30. return n
  31. }
  32. func (n *trieNode) Filter(text string) (sentence string, keywords []string, found bool) {
  33. chars := []rune(text)
  34. if len(chars) == 0 {
  35. return text, nil, false
  36. }
  37. scopes := n.findKeywordScopes(chars)
  38. keywords = n.collectKeywords(chars, scopes)
  39. for _, match := range scopes {
  40. // we don't care about overlaps, not bringing a performance improvement
  41. n.replaceWithAsterisk(chars, match.start, match.stop)
  42. }
  43. return string(chars), keywords, len(keywords) > 0
  44. }
  45. func (n *trieNode) FindKeywords(text string) []string {
  46. chars := []rune(text)
  47. if len(chars) == 0 {
  48. return nil
  49. }
  50. scopes := n.findKeywordScopes(chars)
  51. return n.collectKeywords(chars, scopes)
  52. }
  53. func (n *trieNode) collectKeywords(chars []rune, scopes []scope) []string {
  54. set := make(map[string]lang.PlaceholderType)
  55. for _, v := range scopes {
  56. set[string(chars[v.start:v.stop])] = lang.Placeholder
  57. }
  58. var i int
  59. keywords := make([]string, len(set))
  60. for k := range set {
  61. keywords[i] = k
  62. i++
  63. }
  64. return keywords
  65. }
  66. func (n *trieNode) findKeywordScopes(chars []rune) []scope {
  67. var scopes []scope
  68. size := len(chars)
  69. start := -1
  70. for i := 0; i < size; i++ {
  71. child, ok := n.children[chars[i]]
  72. if !ok {
  73. continue
  74. }
  75. if start < 0 {
  76. start = i
  77. }
  78. if child.end {
  79. scopes = append(scopes, scope{
  80. start: start,
  81. stop: i + 1,
  82. })
  83. }
  84. for j := i + 1; j < size; j++ {
  85. grandchild, ok := child.children[chars[j]]
  86. if !ok {
  87. break
  88. }
  89. child = grandchild
  90. if child.end {
  91. scopes = append(scopes, scope{
  92. start: start,
  93. stop: j + 1,
  94. })
  95. }
  96. }
  97. start = -1
  98. }
  99. return scopes
  100. }
  101. func (n *trieNode) replaceWithAsterisk(chars []rune, start, stop int) {
  102. for i := start; i < stop; i++ {
  103. chars[i] = n.mask
  104. }
  105. }
  106. func WithMask(mask rune) TrieOption {
  107. return func(n *trieNode) {
  108. n.mask = mask
  109. }
  110. }