ucd.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package ucd provides a parser for Unicode Character Database files, the
  5. // format of which is defined in https://www.unicode.org/reports/tr44/. See
  6. // https://www.unicode.org/Public/UCD/latest/ucd/ for example files.
  7. //
  8. // It currently does not support substitutions of missing fields.
  9. package ucd // import "golang.org/x/text/internal/ucd"
  10. import (
  11. "bufio"
  12. "errors"
  13. "fmt"
  14. "io"
  15. "log"
  16. "regexp"
  17. "strconv"
  18. "strings"
  19. )
// UnicodeData.txt fields.
//
// Each constant is the zero-based index of one semicolon-separated field, so
// it can be passed as the field index i to the Parser accessors.
const (
	CodePoint = iota // 0
	Name // 1
	GeneralCategory // 2
	CanonicalCombiningClass // 3
	BidiClass // 4
	DecompMapping // 5
	DecimalValue // 6
	DigitValue // 7
	NumericValue // 8
	BidiMirrored // 9
	Unicode1Name // 10
	ISOComment // 11
	SimpleUppercaseMapping // 12
	SimpleLowercaseMapping // 13
	SimpleTitlecaseMapping // 14
)
  38. // Parse calls f for each entry in the given reader of a UCD file. It will close
  39. // the reader upon return. It will call log.Fatal if any error occurred.
  40. //
  41. // This implements the most common usage pattern of using Parser.
  42. func Parse(r io.ReadCloser, f func(p *Parser)) {
  43. defer r.Close()
  44. p := New(r)
  45. for p.Next() {
  46. f(p)
  47. }
  48. if err := p.Err(); err != nil {
  49. r.Close() // os.Exit will cause defers not to be called.
  50. log.Fatal(err)
  51. }
  52. }
// An Option is used to configure a Parser. Options are passed to New, which
// calls each of them with the Parser it is constructing.
type Option func(p *Parser)
  55. func keepRanges(p *Parser) {
  56. p.keepRanges = true
  57. }
  58. var (
  59. // KeepRanges prevents the expansion of ranges. The raw ranges can be
  60. // obtained by calling Range(0) on the parser.
  61. KeepRanges Option = keepRanges
  62. )
  63. // The Part option register a handler for lines starting with a '@'. The text
  64. // after a '@' is available as the first field. Comments are handled as usual.
  65. func Part(f func(p *Parser)) Option {
  66. return func(p *Parser) {
  67. p.partHandler = f
  68. }
  69. }
  70. // The CommentHandler option passes comments that are on a line by itself to
  71. // a given handler.
  72. func CommentHandler(f func(s string)) Option {
  73. return func(p *Parser) {
  74. p.commentHandler = f
  75. }
  76. }
// A Parser parses Unicode Character Database (UCD) files.
type Parser struct {
	// scanner reads the input one line at a time.
	scanner *bufio.Scanner

	keepRanges bool // Don't expand rune ranges in field 0.

	// err is the first error recorded by setError; later errors are dropped.
	err error

	// comment is the trimmed text following a '#' on the current line.
	comment string

	// field holds the trimmed, semicolon-separated fields of the current line.
	field []string

	// line counts the input lines consumed so far and is included in error
	// messages.
	line int

	// parsedRange is needed in case Range(0) is called more than once for one
	// field. In some cases this requires scanning ahead.
	parsedRange bool

	// rangeStart and rangeEnd describe the range taken from field 0. While a
	// range is being expanded, Next advances rangeStart by one per call until
	// it reaches rangeEnd (inclusive).
	rangeStart, rangeEnd rune

	// partHandler, if non-nil, is called for lines starting with '@'.
	partHandler func(p *Parser)

	// commentHandler, if non-nil, is called for lines starting with '#'.
	commentHandler func(s string)
}
  92. func (p *Parser) setError(err error, msg string) {
  93. if p.err == nil && err != nil {
  94. if msg == "" {
  95. p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err)
  96. } else {
  97. p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err)
  98. }
  99. }
  100. }
  101. func (p *Parser) getField(i int) string {
  102. if i >= len(p.field) {
  103. return ""
  104. }
  105. return p.field[i]
  106. }
// Err returns a non-nil error if any error occurred during parsing.
//
// Only the first error is kept; its message includes the line number that was
// current when the error was recorded.
func (p *Parser) Err() error {
	return p.err
}
  111. // New returns a Parser for the given Reader.
  112. func New(r io.Reader, o ...Option) *Parser {
  113. p := &Parser{
  114. scanner: bufio.NewScanner(r),
  115. }
  116. for _, f := range o {
  117. f(p)
  118. }
  119. return p
  120. }
  121. // Next parses the next line in the file. It returns true if a line was parsed
  122. // and false if it reached the end of the file.
  123. func (p *Parser) Next() bool {
  124. if !p.keepRanges && p.rangeStart < p.rangeEnd {
  125. p.rangeStart++
  126. return true
  127. }
  128. p.comment = ""
  129. p.field = p.field[:0]
  130. p.parsedRange = false
  131. for p.scanner.Scan() && p.err == nil {
  132. p.line++
  133. s := p.scanner.Text()
  134. if s == "" {
  135. continue
  136. }
  137. if s[0] == '#' {
  138. if p.commentHandler != nil {
  139. p.commentHandler(strings.TrimSpace(s[1:]))
  140. }
  141. continue
  142. }
  143. // Parse line
  144. if i := strings.IndexByte(s, '#'); i != -1 {
  145. p.comment = strings.TrimSpace(s[i+1:])
  146. s = s[:i]
  147. }
  148. if s[0] == '@' {
  149. if p.partHandler != nil {
  150. p.field = append(p.field, strings.TrimSpace(s[1:]))
  151. p.partHandler(p)
  152. p.field = p.field[:0]
  153. }
  154. p.comment = ""
  155. continue
  156. }
  157. for {
  158. i := strings.IndexByte(s, ';')
  159. if i == -1 {
  160. p.field = append(p.field, strings.TrimSpace(s))
  161. break
  162. }
  163. p.field = append(p.field, strings.TrimSpace(s[:i]))
  164. s = s[i+1:]
  165. }
  166. if !p.keepRanges {
  167. p.rangeStart, p.rangeEnd = p.getRange(0)
  168. }
  169. return true
  170. }
  171. p.setError(p.scanner.Err(), "scanner failed")
  172. return false
  173. }
  174. func parseRune(b string) (rune, error) {
  175. if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
  176. b = b[2:]
  177. }
  178. x, err := strconv.ParseUint(b, 16, 32)
  179. return rune(x), err
  180. }
// parseRune parses s with the package-level parseRune function. A parse
// failure is not returned to the caller; it is recorded on the parser through
// setError. The value produced by the package-level function is returned in
// either case.
func (p *Parser) parseRune(s string) rune {
	x, err := parseRune(s)
	p.setError(err, "failed to parse rune")
	return x
}
  186. // Rune parses and returns field i as a rune.
  187. func (p *Parser) Rune(i int) rune {
  188. if i > 0 || p.keepRanges {
  189. return p.parseRune(p.getField(i))
  190. }
  191. return p.rangeStart
  192. }
  193. // Runes interprets and returns field i as a sequence of runes.
  194. func (p *Parser) Runes(i int) (runes []rune) {
  195. add := func(s string) {
  196. if s = strings.TrimSpace(s); len(s) > 0 {
  197. runes = append(runes, p.parseRune(s))
  198. }
  199. }
  200. for b := p.getField(i); ; {
  201. i := strings.IndexByte(b, ' ')
  202. if i == -1 {
  203. add(b)
  204. break
  205. }
  206. add(b[:i])
  207. b = b[i+1:]
  208. }
  209. return
  210. }
  211. var (
  212. errIncorrectLegacyRange = errors.New("ucd: unmatched <* First>")
  213. // reRange matches one line of a legacy rune range.
  214. reRange = regexp.MustCompile("^([0-9A-F]*);<([^,]*), ([^>]*)>(.*)$")
  215. )
  216. // Range parses and returns field i as a rune range. A range is inclusive at
  217. // both ends. If the field only has one rune, first and last will be identical.
  218. // It supports the legacy format for ranges used in UnicodeData.txt.
  219. func (p *Parser) Range(i int) (first, last rune) {
  220. if !p.keepRanges {
  221. return p.rangeStart, p.rangeStart
  222. }
  223. return p.getRange(i)
  224. }
// getRange parses field i as an inclusive rune range and returns its bounds.
//
// Three forms are handled:
//   - "X..Y": both sides are parsed as runes; parse failures are recorded on
//     the parser.
//   - a single rune: returned as both first and last. A parse failure here is
//     not recorded; range expansion is switched off instead.
//   - the legacy form (field 0 only, second field ending in "First>"): the
//     next input line is consumed and must be the matching "Last" line.
func (p *Parser) getRange(i int) (first, last rune) {
	b := p.getField(i)
	if k := strings.Index(b, ".."); k != -1 {
		return p.parseRune(b[:k]), p.parseRune(b[k+2:])
	}
	// The first field may not be a rune, in which case we may ignore any error
	// and set the range as 0..0.
	x, err := parseRune(b)
	if err != nil {
		// Disable range parsing henceforth. This ensures that an error will be
		// returned if the user subsequently will try to parse this field as
		// a Rune.
		p.keepRanges = true
	}
	// Special case for UnicodeData that was retained for backwards compatibility.
	if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") {
		// The Last line has already been consumed for this entry; reuse the
		// bounds computed then instead of scanning ahead a second time.
		if p.parsedRange {
			return p.rangeStart, p.rangeEnd
		}
		// Match the current (First) line, then advance to what must be the
		// Last line. The line counter is bumped before the scan is attempted.
		mf := reRange.FindStringSubmatch(p.scanner.Text())
		p.line++
		if mf == nil || !p.scanner.Scan() {
			p.setError(errIncorrectLegacyRange, "")
			return x, x
		}
		// Using Bytes would be more efficient here, but Text is a lot easier
		// and this is not a frequent case.
		ml := reRange.FindStringSubmatch(p.scanner.Text())
		// Both lines must carry the same text before the comma and the same
		// trailing fields, and the second line must be tagged "Last".
		if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
			p.setError(errIncorrectLegacyRange, "")
			return x, x
		}
		// ml[1] is the leading code point of the Last line, so slicing the
		// line to its length yields that code point's text.
		p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])])
		p.parsedRange = true
		return p.rangeStart, p.rangeEnd
	}
	return x, x
}
// bools recognizes all valid UCD boolean values.
//
// The empty string is included and maps to false, so a field that is empty or
// absent (getField returns "" for a missing field) is read as false by Bool.
var bools = map[string]bool{
	"": false,
	"N": false,
	"No": false,
	"F": false,
	"False": false,
	"Y": true,
	"Yes": true,
	"T": true,
	"True": true,
}
  275. // Bool parses and returns field i as a boolean value.
  276. func (p *Parser) Bool(i int) bool {
  277. f := p.getField(i)
  278. for s, v := range bools {
  279. if f == s {
  280. return v
  281. }
  282. }
  283. p.setError(strconv.ErrSyntax, "error parsing bool")
  284. return false
  285. }
  286. // Int parses and returns field i as an integer value.
  287. func (p *Parser) Int(i int) int {
  288. x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
  289. p.setError(err, "error parsing int")
  290. return int(x)
  291. }
  292. // Uint parses and returns field i as an unsigned integer value.
  293. func (p *Parser) Uint(i int) uint {
  294. x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
  295. p.setError(err, "error parsing uint")
  296. return uint(x)
  297. }
  298. // Float parses and returns field i as a decimal value.
  299. func (p *Parser) Float(i int) float64 {
  300. x, err := strconv.ParseFloat(string(p.getField(i)), 64)
  301. p.setError(err, "error parsing float")
  302. return x
  303. }
  304. // String parses and returns field i as a string value.
  305. func (p *Parser) String(i int) string {
  306. return string(p.getField(i))
  307. }
  308. // Strings parses and returns field i as a space-separated list of strings.
  309. func (p *Parser) Strings(i int) []string {
  310. ss := strings.Split(string(p.getField(i)), " ")
  311. for i, s := range ss {
  312. ss[i] = strings.TrimSpace(s)
  313. }
  314. return ss
  315. }
  316. // Comment returns the comments for the current line.
  317. func (p *Parser) Comment() string {
  318. return string(p.comment)
  319. }
  320. var errUndefinedEnum = errors.New("ucd: undefined enum value")
  321. // Enum interprets and returns field i as a value that must be one of the values
  322. // in enum.
  323. func (p *Parser) Enum(i int, enum ...string) string {
  324. f := p.getField(i)
  325. for _, s := range enum {
  326. if f == s {
  327. return s
  328. }
  329. }
  330. p.setError(errUndefinedEnum, "error parsing enum")
  331. return ""
  332. }