icu_test.go 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:build icu
  5. // +build icu
  6. package cases
  7. import (
  8. "path"
  9. "strings"
  10. "testing"
  11. "golang.org/x/text/internal/testtext"
  12. "golang.org/x/text/language"
  13. "golang.org/x/text/unicode/norm"
  14. )
  15. func TestICUConformance(t *testing.T) {
  16. // Build test set.
  17. input := []string{
  18. "a.a a_a",
  19. "a\u05d0a",
  20. "\u05d0'a",
  21. "a\u03084a",
  22. "a\u0308a",
  23. "a3\u30a3a",
  24. "a\u303aa",
  25. "a_\u303a_a",
  26. "1_a..a",
  27. "1_a.a",
  28. "a..a.",
  29. "a--a-",
  30. "a-a-",
  31. "a\u200ba",
  32. "a\u200b\u200ba",
  33. "a\u00ad\u00ada", // Format
  34. "a\u00ada",
  35. "a''a", // SingleQuote
  36. "a'a",
  37. "a::a", // MidLetter
  38. "a:a",
  39. "a..a", // MidNumLet
  40. "a.a",
  41. "a;;a", // MidNum
  42. "a;a",
  43. "a__a", // ExtendNumlet
  44. "a_a",
  45. "ΟΣ''a",
  46. }
  47. add := func(x interface{}) {
  48. switch v := x.(type) {
  49. case string:
  50. input = append(input, v)
  51. case []string:
  52. for _, s := range v {
  53. input = append(input, s)
  54. }
  55. }
  56. }
  57. for _, tc := range testCases {
  58. add(tc.src)
  59. add(tc.lower)
  60. add(tc.upper)
  61. add(tc.title)
  62. }
  63. for _, tc := range bufferTests {
  64. add(tc.src)
  65. }
  66. for _, tc := range breakTest {
  67. add(strings.Replace(tc, "|", "", -1))
  68. }
  69. for _, tc := range foldTestCases {
  70. add(tc)
  71. }
  72. // Compare ICU to Go.
  73. for _, c := range []string{"lower", "upper", "title", "fold"} {
  74. for _, tag := range []string{
  75. "und", "af", "az", "el", "lt", "nl", "tr",
  76. } {
  77. for _, s := range input {
  78. if exclude(c, tag, s) {
  79. continue
  80. }
  81. testtext.Run(t, path.Join(c, tag, s), func(t *testing.T) {
  82. want := doICU(tag, c, s)
  83. got := doGo(tag, c, s)
  84. if norm.NFC.String(got) != norm.NFC.String(want) {
  85. t.Errorf("\n in %[3]q (%+[3]q)\n got %[1]q (%+[1]q)\n want %[2]q (%+[2]q)", got, want, s)
  86. }
  87. })
  88. }
  89. }
  90. }
  91. }
  92. // exclude indicates if a string should be excluded from testing.
  93. func exclude(cm, tag, s string) bool {
  94. list := []struct{ cm, tags, pattern string }{
  95. // TODO: Go does not handle certain esoteric breaks correctly. This will be
  96. // fixed once we have a real word break iterator. Alternatively, it
  97. // seems like we're not too far off from making it work, so we could
  98. // fix these last steps. But first verify that using a separate word
  99. // breaker does not hurt performance.
  100. {"title", "af nl", "a''a"},
  101. {"", "", "א'a"},
  102. // All the exclusions below seem to be issues with the ICU
  103. // implementation (at version 57) and thus are not marked as TODO.
  104. // ICU does not handle leading apostrophe for Dutch and
  105. // Afrikaans correctly. See https://unicode.org/cldr/trac/ticket/7078.
  106. {"title", "af nl", "'n"},
  107. {"title", "af nl", "'N"},
  108. // Go terminates the final sigma check after a fixed number of
  109. // ignorables have been found. This ensures that the algorithm can make
  110. // progress in a streaming scenario.
  111. {"lower title", "", "\u039f\u03a3...............................a"},
  112. // This also applies to upper in Greek.
  113. // NOTE: we could fix the following two cases by adding state to elUpper
  114. // and aztrLower. However, considering a modifier to not belong to the
  115. // preceding letter after the maximum modifiers count is reached is
  116. // consistent with the behavior of unicode/norm.
  117. {"upper", "el", "\u03bf" + strings.Repeat("\u0321", 29) + "\u0313"},
  118. {"lower", "az tr lt", "I" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
  119. {"upper", "lt", "i" + strings.Repeat("\u0321", 30) + "\u0307\u0300"},
  120. {"lower", "lt", "I" + strings.Repeat("\u0321", 30) + "\u0300"},
  121. // ICU title case seems to erroneously removes \u0307 from an upper case
  122. // I unconditionally, instead of only when lowercasing. The ICU
  123. // transform algorithm transforms these cases consistently with our
  124. // implementation.
  125. {"title", "az tr", "\u0307"},
  126. // The spec says to remove \u0307 after Soft-Dotted characters. ICU
  127. // transforms conform but ucasemap_utf8ToUpper does not.
  128. {"upper title", "lt", "i\u0307"},
  129. {"upper title", "lt", "i" + strings.Repeat("\u0321", 29) + "\u0307\u0300"},
  130. // Both Unicode and CLDR prescribe an extra explicit dot above after a
  131. // Soft_Dotted character if there are other modifiers.
  132. // ucasemap_utf8ToUpper does not do this; ICU transforms do.
  133. // The issue with ucasemap_utf8ToUpper seems to be that it does not
  134. // consider the modifiers that are part of composition in the evaluation
  135. // of More_Above. For instance, according to the More_Above rule for lt,
  136. // a dotted capital I (U+0130) becomes i\u0307\u0307 (an small i with
  137. // two additional dots). This seems odd, but is correct. ICU is
  138. // definitely not correct as it produces different results for different
  139. // normal forms. For instance, for an İ:
  140. // \u0130 (NFC) -> i\u0307 (incorrect)
  141. // I\u0307 (NFD) -> i\u0307\u0307 (correct)
  142. // We could argue that we should not add a \u0307 if there already is
  143. // one, but this may be hard to get correct and is not conform the
  144. // standard.
  145. {"lower title", "lt", "\u0130"},
  146. {"lower title", "lt", "\u00cf"},
  147. // We are conform ICU ucasemap_utf8ToUpper if we remove support for
  148. // elUpper. However, this is clearly not conform the spec. Moreover, the
  149. // ICU transforms _do_ implement this transform and produces results
  150. // consistent with our implementation. Note that we still prefer to use
  151. // ucasemap_utf8ToUpper instead of transforms as the latter have
  152. // inconsistencies in the word breaking algorithm.
  153. {"upper", "el", "\u0386"}, // GREEK CAPITAL LETTER ALPHA WITH TONOS
  154. {"upper", "el", "\u0389"}, // GREEK CAPITAL LETTER ETA WITH TONOS
  155. {"upper", "el", "\u038A"}, // GREEK CAPITAL LETTER IOTA WITH TONOS
  156. {"upper", "el", "\u0391"}, // GREEK CAPITAL LETTER ALPHA
  157. {"upper", "el", "\u0397"}, // GREEK CAPITAL LETTER ETA
  158. {"upper", "el", "\u0399"}, // GREEK CAPITAL LETTER IOTA
  159. {"upper", "el", "\u03AC"}, // GREEK SMALL LETTER ALPHA WITH TONOS
  160. {"upper", "el", "\u03AE"}, // GREEK SMALL LETTER ALPHA WITH ETA
  161. {"upper", "el", "\u03AF"}, // GREEK SMALL LETTER ALPHA WITH IOTA
  162. {"upper", "el", "\u03B1"}, // GREEK SMALL LETTER ALPHA
  163. {"upper", "el", "\u03B7"}, // GREEK SMALL LETTER ETA
  164. {"upper", "el", "\u03B9"}, // GREEK SMALL LETTER IOTA
  165. }
  166. for _, x := range list {
  167. if x.cm != "" && strings.Index(x.cm, cm) == -1 {
  168. continue
  169. }
  170. if x.tags != "" && strings.Index(x.tags, tag) == -1 {
  171. continue
  172. }
  173. if strings.Index(s, x.pattern) != -1 {
  174. return true
  175. }
  176. }
  177. return false
  178. }
  179. func doGo(tag, caser, input string) string {
  180. var c Caser
  181. t := language.MustParse(tag)
  182. switch caser {
  183. case "lower":
  184. c = Lower(t)
  185. case "upper":
  186. c = Upper(t)
  187. case "title":
  188. c = Title(t)
  189. case "fold":
  190. c = Fold()
  191. }
  192. return c.String(input)
  193. }