123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- package utilities
- import (
- "sort"
- )
- // DoubleArray is a Double Array implementation of trie on sequences of strings.
- type DoubleArray struct {
- // Encoding keeps an encoding from string to int
- Encoding map[string]int
- // Base is the base array of Double Array
- Base []int
- // Check is the check array of Double Array
- Check []int
- }
- // NewDoubleArray builds a DoubleArray from a set of sequences of strings.
- func NewDoubleArray(seqs [][]string) *DoubleArray {
- da := &DoubleArray{Encoding: make(map[string]int)}
- if len(seqs) == 0 {
- return da
- }
- encoded := registerTokens(da, seqs)
- sort.Sort(byLex(encoded))
- root := node{row: -1, col: -1, left: 0, right: len(encoded)}
- addSeqs(da, encoded, 0, root)
- for i := len(da.Base); i > 0; i-- {
- if da.Check[i-1] != 0 {
- da.Base = da.Base[:i]
- da.Check = da.Check[:i]
- break
- }
- }
- return da
- }
- func registerTokens(da *DoubleArray, seqs [][]string) [][]int {
- var result [][]int
- for _, seq := range seqs {
- var encoded []int
- for _, token := range seq {
- if _, ok := da.Encoding[token]; !ok {
- da.Encoding[token] = len(da.Encoding)
- }
- encoded = append(encoded, da.Encoding[token])
- }
- result = append(result, encoded)
- }
- for i := range result {
- result[i] = append(result[i], len(da.Encoding))
- }
- return result
- }
- type node struct {
- row, col int
- left, right int
- }
- func (n node) value(seqs [][]int) int {
- return seqs[n.row][n.col]
- }
- func (n node) children(seqs [][]int) []*node {
- var result []*node
- lastVal := int(-1)
- last := new(node)
- for i := n.left; i < n.right; i++ {
- if lastVal == seqs[i][n.col+1] {
- continue
- }
- last.right = i
- last = &node{
- row: i,
- col: n.col + 1,
- left: i,
- }
- result = append(result, last)
- }
- last.right = n.right
- return result
- }
- func addSeqs(da *DoubleArray, seqs [][]int, pos int, n node) {
- ensureSize(da, pos)
- children := n.children(seqs)
- var i int
- for i = 1; ; i++ {
- ok := func() bool {
- for _, child := range children {
- code := child.value(seqs)
- j := i + code
- ensureSize(da, j)
- if da.Check[j] != 0 {
- return false
- }
- }
- return true
- }()
- if ok {
- break
- }
- }
- da.Base[pos] = i
- for _, child := range children {
- code := child.value(seqs)
- j := i + code
- da.Check[j] = pos + 1
- }
- terminator := len(da.Encoding)
- for _, child := range children {
- code := child.value(seqs)
- if code == terminator {
- continue
- }
- j := i + code
- addSeqs(da, seqs, j, *child)
- }
- }
- func ensureSize(da *DoubleArray, i int) {
- for i >= len(da.Base) {
- da.Base = append(da.Base, make([]int, len(da.Base)+1)...)
- da.Check = append(da.Check, make([]int, len(da.Check)+1)...)
- }
- }
- type byLex [][]int
- func (l byLex) Len() int { return len(l) }
- func (l byLex) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
- func (l byLex) Less(i, j int) bool {
- si := l[i]
- sj := l[j]
- var k int
- for k = 0; k < len(si) && k < len(sj); k++ {
- if si[k] < sj[k] {
- return true
- }
- if si[k] > sj[k] {
- return false
- }
- }
- if k < len(sj) {
- return true
- }
- return false
- }
- // HasCommonPrefix determines if any sequence in the DoubleArray is a prefix of the given sequence.
- func (da *DoubleArray) HasCommonPrefix(seq []string) bool {
- if len(da.Base) == 0 {
- return false
- }
- var i int
- for _, t := range seq {
- code, ok := da.Encoding[t]
- if !ok {
- break
- }
- j := da.Base[i] + code
- if len(da.Check) <= j || da.Check[j] != i+1 {
- break
- }
- i = j
- }
- j := da.Base[i] + len(da.Encoding)
- if len(da.Check) <= j || da.Check[j] != i+1 {
- return false
- }
- return true
- }
|