123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357 |
- // Copyright 2015 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- package search
- import (
- "reflect"
- "strings"
- "testing"
- "golang.org/x/text/language"
- )
- func TestCompile(t *testing.T) {
- for i, tc := range []struct {
- desc string
- pattern string
- options []Option
- n int
- }{{
- desc: "empty",
- pattern: "",
- n: 0,
- }, {
- desc: "single",
- pattern: "a",
- n: 1,
- }, {
- desc: "keep modifier",
- pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
- n: 2,
- }, {
- desc: "remove modifier",
- pattern: "a\u0300", // U+0300: COMBINING GRAVE ACCENT
- options: []Option{IgnoreDiacritics},
- n: 1,
- }, {
- desc: "single with double collation element",
- pattern: "ä",
- n: 2,
- }, {
- desc: "leading variable",
- pattern: " a",
- n: 2,
- }, {
- desc: "trailing variable",
- pattern: "aa ",
- n: 3,
- }, {
- desc: "leading and trailing variable",
- pattern: " äb ",
- n: 5,
- }, {
- desc: "keep interior variable",
- pattern: " ä b ",
- n: 6,
- }, {
- desc: "keep interior variables",
- pattern: " b ä ",
- n: 7,
- }, {
- desc: "remove ignoreables (zero-weights across the board)",
- pattern: "\u009Db\u009Dä\u009D", // U+009D: OPERATING SYSTEM COMMAND
- n: 3,
- }} {
- m := New(language.Und, tc.options...)
- p := m.CompileString(tc.pattern)
- if len(p.ce) != tc.n {
- t.Errorf("%d:%s: Compile(%+q): got %d; want %d", i, tc.desc, tc.pattern, len(p.ce), tc.n)
- }
- }
- }
- func TestNorm(t *testing.T) {
- // U+0300: COMBINING GRAVE ACCENT (CCC=230)
- // U+031B: COMBINING HORN (CCC=216)
- for _, tc := range []struct {
- desc string
- a string
- b string
- want bool // a and b compile into the same pattern?
- }{{
- "simple",
- "eee\u0300\u031b",
- "eee\u031b\u0300",
- true,
- }, {
- "large number of modifiers in pattern",
- strings.Repeat("\u0300", 29) + "\u0318",
- "\u0318" + strings.Repeat("\u0300", 29),
- true,
- }, {
- "modifier overflow in pattern",
- strings.Repeat("\u0300", 30) + "\u0318",
- "\u0318" + strings.Repeat("\u0300", 30),
- false,
- }} {
- m := New(language.Und)
- a := m.CompileString(tc.a)
- b := m.CompileString(tc.b)
- if got := reflect.DeepEqual(a, b); got != tc.want {
- t.Errorf("Compile(a) == Compile(b) == %v; want %v", got, tc.want)
- }
- }
- }
- func TestForwardSearch(t *testing.T) {
- for i, tc := range []struct {
- desc string
- tag string
- options []Option
- pattern string
- text string
- want []int
- }{{
- // The semantics of an empty search is to match nothing.
- // TODO: change this to be in line with strings.Index? It is quite a
- // different beast, so not sure yet.
- desc: "empty pattern and text",
- tag: "und",
- pattern: "",
- text: "",
- want: nil, // TODO: consider: []int{0, 0},
- }, {
- desc: "non-empty pattern and empty text",
- tag: "und",
- pattern: " ",
- text: "",
- want: nil,
- }, {
- desc: "empty pattern and non-empty text",
- tag: "und",
- pattern: "",
- text: "abc",
- want: nil, // TODO: consider: []int{0, 0, 1, 1, 2, 2, 3, 3},
- }, {
- // Variable-only patterns. We don't support variables at the moment,
- // but verify that, given this, the behavior is indeed as expected.
- desc: "exact match of variable",
- tag: "und",
- pattern: " ",
- text: " ",
- want: []int{0, 1},
- }, {
- desc: "variables not handled by default",
- tag: "und",
- pattern: "- ",
- text: " -",
- want: nil, // Would be (1, 2) for a median match with variable}.
- }, {
- desc: "multiple subsequent identical variables",
- tag: "und",
- pattern: " ",
- text: " ",
- want: []int{0, 1, 1, 2, 2, 3, 3, 4},
- }, {
- desc: "text with variables",
- tag: "und",
- options: []Option{IgnoreDiacritics},
- pattern: "abc",
- text: "3 abc 3",
- want: []int{2, 5},
- }, {
- desc: "pattern with interior variables",
- tag: "und",
- options: []Option{IgnoreDiacritics},
- pattern: "a b c",
- text: "3 a b c abc a b c 3",
- want: []int{2, 7}, // Would have 3 matches using variable.
- // TODO: Different variable handling settings.
- }, {
- // Options.
- desc: "match all levels",
- tag: "und",
- pattern: "Abc",
- text: "abcAbcABCÁbcábc",
- want: []int{3, 6},
- }, {
- desc: "ignore diacritics in text",
- tag: "und",
- options: []Option{IgnoreDiacritics},
- pattern: "Abc",
- text: "Ábc",
- want: []int{0, 4},
- }, {
- desc: "ignore diacritics in pattern",
- tag: "und",
- options: []Option{IgnoreDiacritics},
- pattern: "Ábc",
- text: "Abc",
- want: []int{0, 3},
- }, {
- desc: "ignore diacritics",
- tag: "und",
- options: []Option{IgnoreDiacritics},
- pattern: "Abc",
- text: "abcAbcABCÁbcábc",
- want: []int{3, 6, 9, 13},
- }, {
- desc: "ignore case",
- tag: "und",
- options: []Option{IgnoreCase},
- pattern: "Abc",
- text: "abcAbcABCÁbcábc",
- want: []int{0, 3, 3, 6, 6, 9},
- }, {
- desc: "ignore case and diacritics",
- tag: "und",
- options: []Option{IgnoreCase, IgnoreDiacritics},
- pattern: "Abc",
- text: "abcAbcABCÁbcábc",
- want: []int{0, 3, 3, 6, 6, 9, 9, 13, 13, 17},
- }, {
- desc: "ignore width to fullwidth",
- tag: "und",
- options: []Option{IgnoreWidth},
- pattern: "abc",
- text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
- want: []int{4, 13},
- }, {
- // TODO: distinguish between case and width.
- desc: "don't ignore width to fullwidth, ignoring only case",
- tag: "und",
- options: []Option{IgnoreCase},
- pattern: "abc",
- text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
- want: []int{4, 13},
- }, {
- desc: "ignore width to fullwidth and diacritics",
- tag: "und",
- options: []Option{IgnoreWidth, IgnoreDiacritics},
- pattern: "abc",
- text: "123 \uFF41\uFF42\uFF43 123", // U+FF41-3: FULLWIDTH LATIN SMALL LETTER A-C
- want: []int{4, 13},
- }, {
- desc: "whole grapheme, single rune",
- tag: "und",
- pattern: "eee",
- text: "123 eeé 123",
- want: nil,
- }, {
- // Note: rules on when to apply contractions may, for certain languages,
- // differ between search and collation. For example, "ch" is not
- // considered a contraction for the purpose of searching in Spanish.
- // Therefore, be careful picking this test.
- desc: "whole grapheme, contractions",
- tag: "da",
- pattern: "aba",
- // Fails at the primary level, because "aa" is a contraction.
- text: "123 abaa 123",
- want: []int{},
- }, {
- desc: "whole grapheme, trailing modifier",
- tag: "und",
- pattern: "eee",
- text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
- want: nil,
- }, {
- // Language-specific matching.
- desc: "",
- tag: "da",
- options: []Option{IgnoreCase},
- pattern: "Århus",
- text: "AarhusÅrhus Århus ",
- want: []int{0, 6, 6, 12, 14, 20},
- }, {
- desc: "",
- tag: "da",
- options: []Option{IgnoreCase},
- pattern: "Aarhus",
- text: "Århus Aarhus",
- want: []int{0, 6, 7, 13},
- }, {
- desc: "",
- tag: "en", // Å does not match A for English.
- options: []Option{IgnoreCase},
- pattern: "Aarhus",
- text: "Århus",
- want: nil,
- }, {
- desc: "ignore modifier in text",
- options: []Option{IgnoreDiacritics},
- tag: "und",
- pattern: "eee",
- text: "123 eee\u0300 123", // U+0300: COMBINING GRAVE ACCENT
- want: []int{4, 9}, // Matches on grapheme boundary.
- }, {
- desc: "ignore multiple modifiers in text",
- options: []Option{IgnoreDiacritics},
- tag: "und",
- pattern: "eee",
- text: "123 eee\u0300\u0300 123", // U+0300: COMBINING GRAVE ACCENT
- want: []int{4, 11}, // Matches on grapheme boundary.
- }, {
- desc: "ignore modifier in pattern",
- options: []Option{IgnoreDiacritics},
- tag: "und",
- pattern: "eee\u0300", // U+0300: COMBINING GRAVE ACCENT
- text: "123 eee 123",
- want: []int{4, 7},
- }, {
- desc: "ignore multiple modifiers in pattern",
- options: []Option{IgnoreDiacritics},
- tag: "und",
- pattern: "eee\u0300\u0300", // U+0300: COMBINING GRAVE ACCENT
- text: "123 eee 123",
- want: []int{4, 7},
- }, {
- desc: "match non-normalized pattern",
- tag: "und",
- // U+0300: COMBINING GRAVE ACCENT (CCC=230)
- // U+031B: COMBINING HORN (CCC=216)
- pattern: "eee\u0300\u031b",
- text: "123 eee\u031b\u0300 123",
- want: []int{4, 11},
- }, {
- desc: "match non-normalized text",
- tag: "und",
- // U+0300: COMBINING GRAVE ACCENT (CCC=230)
- // U+031B: COMBINING HORN (CCC=216)
- pattern: "eee\u031b\u0300",
- text: "123 eee\u0300\u031b 123",
- want: []int{4, 11},
- }} {
- m := New(language.MustParse(tc.tag), tc.options...)
- p := m.CompileString(tc.pattern)
- for j := 0; j < len(tc.text); {
- start, end := p.IndexString(tc.text[j:])
- if start == -1 && end == -1 {
- j++
- continue
- }
- start += j
- end += j
- j = end
- if len(tc.want) == 0 {
- t.Errorf("%d:%s: found unexpected result [%d %d]", i, tc.desc, start, end)
- break
- }
- if tc.want[0] != start || tc.want[1] != end {
- t.Errorf("%d:%s: got [%d %d]; want %v", i, tc.desc, start, end, tc.want[:2])
- tc.want = tc.want[2:]
- break
- }
- tc.want = tc.want[2:]
- }
- if len(tc.want) != 0 {
- t.Errorf("%d:%s: %d extra results", i, tc.desc, len(tc.want)/2)
- }
- }
- }
|