text_parser.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670
  1. // Go support for Protocol Buffers - Google's data interchange format
  2. //
  3. // Copyright 2010 The Go Authors. All rights reserved.
  4. // http://code.google.com/p/goprotobuf/
  5. //
  6. // Redistribution and use in source and binary forms, with or without
  7. // modification, are permitted provided that the following conditions are
  8. // met:
  9. //
  10. // * Redistributions of source code must retain the above copyright
  11. // notice, this list of conditions and the following disclaimer.
  12. // * Redistributions in binary form must reproduce the above
  13. // copyright notice, this list of conditions and the following disclaimer
  14. // in the documentation and/or other materials provided with the
  15. // distribution.
  16. // * Neither the name of Google Inc. nor the names of its
  17. // contributors may be used to endorse or promote products derived from
  18. // this software without specific prior written permission.
  19. //
  20. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  21. // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  22. // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  23. // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  24. // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  25. // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  26. // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  27. // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  28. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  29. // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  30. // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  31. package proto
  32. // Functions for parsing the Text protocol buffer format.
  33. // TODO: message sets.
  34. import (
  35. "errors"
  36. "fmt"
  37. "reflect"
  38. "strconv"
  39. "strings"
  40. "unicode/utf8"
  41. )
  42. type ParseError struct {
  43. Message string
  44. Line int // 1-based line number
  45. Offset int // 0-based byte offset from start of input
  46. }
  47. func (p *ParseError) Error() string {
  48. if p.Line == 1 {
  49. // show offset only for first line
  50. return fmt.Sprintf("line 1.%d: %v", p.Offset, p.Message)
  51. }
  52. return fmt.Sprintf("line %d: %v", p.Line, p.Message)
  53. }
  54. type token struct {
  55. value string
  56. err *ParseError
  57. line int // line number
  58. offset int // byte number from start of input, not start of line
  59. unquoted string // the unquoted version of value, if it was a quoted string
  60. }
  61. func (t *token) String() string {
  62. if t.err == nil {
  63. return fmt.Sprintf("%q (line=%d, offset=%d)", t.value, t.line, t.offset)
  64. }
  65. return fmt.Sprintf("parse error: %v", t.err)
  66. }
  67. type textParser struct {
  68. s string // remaining input
  69. done bool // whether the parsing is finished (success or error)
  70. backed bool // whether back() was called
  71. offset, line int
  72. cur token
  73. }
  74. func newTextParser(s string) *textParser {
  75. p := new(textParser)
  76. p.s = s
  77. p.line = 1
  78. p.cur.line = 1
  79. return p
  80. }
  81. func (p *textParser) errorf(format string, a ...interface{}) *ParseError {
  82. pe := &ParseError{fmt.Sprintf(format, a...), p.cur.line, p.cur.offset}
  83. p.cur.err = pe
  84. p.done = true
  85. return pe
  86. }
  87. // Numbers and identifiers are matched by [-+._A-Za-z0-9]
  88. func isIdentOrNumberChar(c byte) bool {
  89. switch {
  90. case 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z':
  91. return true
  92. case '0' <= c && c <= '9':
  93. return true
  94. }
  95. switch c {
  96. case '-', '+', '.', '_':
  97. return true
  98. }
  99. return false
  100. }
  101. func isWhitespace(c byte) bool {
  102. switch c {
  103. case ' ', '\t', '\n', '\r':
  104. return true
  105. }
  106. return false
  107. }
  108. func (p *textParser) skipWhitespace() {
  109. i := 0
  110. for i < len(p.s) && (isWhitespace(p.s[i]) || p.s[i] == '#') {
  111. if p.s[i] == '#' {
  112. // comment; skip to end of line or input
  113. for i < len(p.s) && p.s[i] != '\n' {
  114. i++
  115. }
  116. if i == len(p.s) {
  117. break
  118. }
  119. }
  120. if p.s[i] == '\n' {
  121. p.line++
  122. }
  123. i++
  124. }
  125. p.offset += i
  126. p.s = p.s[i:len(p.s)]
  127. if len(p.s) == 0 {
  128. p.done = true
  129. }
  130. }
  131. func (p *textParser) advance() {
  132. // Skip whitespace
  133. p.skipWhitespace()
  134. if p.done {
  135. return
  136. }
  137. // Start of non-whitespace
  138. p.cur.err = nil
  139. p.cur.offset, p.cur.line = p.offset, p.line
  140. p.cur.unquoted = ""
  141. switch p.s[0] {
  142. case '<', '>', '{', '}', ':', '[', ']', ';', ',':
  143. // Single symbol
  144. p.cur.value, p.s = p.s[0:1], p.s[1:len(p.s)]
  145. case '"', '\'':
  146. // Quoted string
  147. i := 1
  148. for i < len(p.s) && p.s[i] != p.s[0] && p.s[i] != '\n' {
  149. if p.s[i] == '\\' && i+1 < len(p.s) {
  150. // skip escaped char
  151. i++
  152. }
  153. i++
  154. }
  155. if i >= len(p.s) || p.s[i] != p.s[0] {
  156. p.errorf("unmatched quote")
  157. return
  158. }
  159. unq, err := unquoteC(p.s[1:i], rune(p.s[0]))
  160. if err != nil {
  161. p.errorf("invalid quoted string %v", p.s[0:i+1])
  162. return
  163. }
  164. p.cur.value, p.s = p.s[0:i+1], p.s[i+1:len(p.s)]
  165. p.cur.unquoted = unq
  166. default:
  167. i := 0
  168. for i < len(p.s) && isIdentOrNumberChar(p.s[i]) {
  169. i++
  170. }
  171. if i == 0 {
  172. p.errorf("unexpected byte %#x", p.s[0])
  173. return
  174. }
  175. p.cur.value, p.s = p.s[0:i], p.s[i:len(p.s)]
  176. }
  177. p.offset += len(p.cur.value)
  178. }
  179. var (
  180. errBadUTF8 = errors.New("proto: bad UTF-8")
  181. errBadHex = errors.New("proto: bad hexadecimal")
  182. )
  183. func unquoteC(s string, quote rune) (string, error) {
  184. // This is based on C++'s tokenizer.cc.
  185. // Despite its name, this is *not* parsing C syntax.
  186. // For instance, "\0" is an invalid quoted string.
  187. // Avoid allocation in trivial cases.
  188. simple := true
  189. for _, r := range s {
  190. if r == '\\' || r == quote {
  191. simple = false
  192. break
  193. }
  194. }
  195. if simple {
  196. return s, nil
  197. }
  198. buf := make([]byte, 0, 3*len(s)/2)
  199. for len(s) > 0 {
  200. r, n := utf8.DecodeRuneInString(s)
  201. if r == utf8.RuneError && n == 1 {
  202. return "", errBadUTF8
  203. }
  204. s = s[n:]
  205. if r != '\\' {
  206. if r < utf8.RuneSelf {
  207. buf = append(buf, byte(r))
  208. } else {
  209. buf = append(buf, string(r)...)
  210. }
  211. continue
  212. }
  213. ch, tail, err := unescape(s)
  214. if err != nil {
  215. return "", err
  216. }
  217. buf = append(buf, ch...)
  218. s = tail
  219. }
  220. return string(buf), nil
  221. }
  222. func unescape(s string) (ch string, tail string, err error) {
  223. r, n := utf8.DecodeRuneInString(s)
  224. if r == utf8.RuneError && n == 1 {
  225. return "", "", errBadUTF8
  226. }
  227. s = s[n:]
  228. switch r {
  229. case 'a':
  230. return "\a", s, nil
  231. case 'b':
  232. return "\b", s, nil
  233. case 'f':
  234. return "\f", s, nil
  235. case 'n':
  236. return "\n", s, nil
  237. case 'r':
  238. return "\r", s, nil
  239. case 't':
  240. return "\t", s, nil
  241. case 'v':
  242. return "\v", s, nil
  243. case '?':
  244. return "?", s, nil // trigraph workaround
  245. case '\'', '"', '\\':
  246. return string(r), s, nil
  247. case '0', '1', '2', '3', '4', '5', '6', '7', 'x', 'X':
  248. if len(s) < 2 {
  249. return "", "", fmt.Errorf(`\%c requires 2 following digits`, r)
  250. }
  251. base := 8
  252. ss := s[:2]
  253. s = s[2:]
  254. if r == 'x' || r == 'X' {
  255. base = 16
  256. } else {
  257. ss = string(r) + ss
  258. }
  259. i, err := strconv.ParseUint(ss, base, 8)
  260. if err != nil {
  261. return "", "", err
  262. }
  263. return string([]byte{byte(i)}), s, nil
  264. case 'u', 'U':
  265. n := 4
  266. if r == 'U' {
  267. n = 8
  268. }
  269. if len(s) < n {
  270. return "", "", fmt.Errorf(`\%c requires %d digits`, r, n)
  271. }
  272. bs := make([]byte, n/2)
  273. for i := 0; i < n; i += 2 {
  274. a, ok1 := unhex(s[i])
  275. b, ok2 := unhex(s[i+1])
  276. if !ok1 || !ok2 {
  277. return "", "", errBadHex
  278. }
  279. bs[i/2] = a<<4 | b
  280. }
  281. s = s[n:]
  282. return string(bs), s, nil
  283. }
  284. return "", "", fmt.Errorf(`unknown escape \%c`, r)
  285. }
  286. // Adapted from src/pkg/strconv/quote.go.
  287. func unhex(b byte) (v byte, ok bool) {
  288. switch {
  289. case '0' <= b && b <= '9':
  290. return b - '0', true
  291. case 'a' <= b && b <= 'f':
  292. return b - 'a' + 10, true
  293. case 'A' <= b && b <= 'F':
  294. return b - 'A' + 10, true
  295. }
  296. return 0, false
  297. }
  298. // Back off the parser by one token. Can only be done between calls to next().
  299. // It makes the next advance() a no-op.
  300. func (p *textParser) back() { p.backed = true }
  301. // Advances the parser and returns the new current token.
  302. func (p *textParser) next() *token {
  303. if p.backed || p.done {
  304. p.backed = false
  305. return &p.cur
  306. }
  307. p.advance()
  308. if p.done {
  309. p.cur.value = ""
  310. } else if len(p.cur.value) > 0 && p.cur.value[0] == '"' {
  311. // Look for multiple quoted strings separated by whitespace,
  312. // and concatenate them.
  313. cat := p.cur
  314. for {
  315. p.skipWhitespace()
  316. if p.done || p.s[0] != '"' {
  317. break
  318. }
  319. p.advance()
  320. if p.cur.err != nil {
  321. return &p.cur
  322. }
  323. cat.value += " " + p.cur.value
  324. cat.unquoted += p.cur.unquoted
  325. }
  326. p.done = false // parser may have seen EOF, but we want to return cat
  327. p.cur = cat
  328. }
  329. return &p.cur
  330. }
  331. // Return an error indicating which required field was not set.
  332. func (p *textParser) missingRequiredFieldError(sv reflect.Value) *ParseError {
  333. st := sv.Type()
  334. sprops := GetProperties(st)
  335. for i := 0; i < st.NumField(); i++ {
  336. if !isNil(sv.Field(i)) {
  337. continue
  338. }
  339. props := sprops.Prop[i]
  340. if props.Required {
  341. return p.errorf("message %v missing required field %q", st, props.OrigName)
  342. }
  343. }
  344. return p.errorf("message %v missing required field", st) // should not happen
  345. }
  346. // Returns the index in the struct for the named field, as well as the parsed tag properties.
  347. func structFieldByName(st reflect.Type, name string) (int, *Properties, bool) {
  348. sprops := GetProperties(st)
  349. i, ok := sprops.decoderOrigNames[name]
  350. if ok {
  351. return i, sprops.Prop[i], true
  352. }
  353. return -1, nil, false
  354. }
  355. // Consume a ':' from the input stream (if the next token is a colon),
  356. // returning an error if a colon is needed but not present.
  357. func (p *textParser) checkForColon(props *Properties, typ reflect.Type) *ParseError {
  358. tok := p.next()
  359. if tok.err != nil {
  360. return tok.err
  361. }
  362. if tok.value != ":" {
  363. // Colon is optional when the field is a group or message.
  364. needColon := true
  365. switch props.Wire {
  366. case "group":
  367. needColon = false
  368. case "bytes":
  369. // A "bytes" field is either a message, a string, or a repeated field;
  370. // those three become *T, *string and []T respectively, so we can check for
  371. // this field being a pointer to a non-string.
  372. if typ.Kind() == reflect.Ptr {
  373. // *T or *string
  374. if typ.Elem().Kind() == reflect.String {
  375. break
  376. }
  377. } else if typ.Kind() == reflect.Slice {
  378. // []T or []*T
  379. if typ.Elem().Kind() != reflect.Ptr {
  380. break
  381. }
  382. }
  383. needColon = false
  384. }
  385. if needColon {
  386. return p.errorf("expected ':', found %q", tok.value)
  387. }
  388. p.back()
  389. }
  390. return nil
  391. }
  392. func (p *textParser) readStruct(sv reflect.Value, terminator string) *ParseError {
  393. st := sv.Type()
  394. reqCount := GetProperties(st).reqCount
  395. // A struct is a sequence of "name: value", terminated by one of
  396. // '>' or '}', or the end of the input. A name may also be
  397. // "[extension]".
  398. for {
  399. tok := p.next()
  400. if tok.err != nil {
  401. return tok.err
  402. }
  403. if tok.value == terminator {
  404. break
  405. }
  406. if tok.value == "[" {
  407. // Looks like an extension.
  408. //
  409. // TODO: Check whether we need to handle
  410. // namespace rooted names (e.g. ".something.Foo").
  411. tok = p.next()
  412. if tok.err != nil {
  413. return tok.err
  414. }
  415. var desc *ExtensionDesc
  416. // This could be faster, but it's functional.
  417. // TODO: Do something smarter than a linear scan.
  418. for _, d := range RegisteredExtensions(reflect.New(st).Interface().(Message)) {
  419. if d.Name == tok.value {
  420. desc = d
  421. break
  422. }
  423. }
  424. if desc == nil {
  425. return p.errorf("unrecognized extension %q", tok.value)
  426. }
  427. // Check the extension terminator.
  428. tok = p.next()
  429. if tok.err != nil {
  430. return tok.err
  431. }
  432. if tok.value != "]" {
  433. return p.errorf("unrecognized extension terminator %q", tok.value)
  434. }
  435. props := &Properties{}
  436. props.Parse(desc.Tag)
  437. typ := reflect.TypeOf(desc.ExtensionType)
  438. if err := p.checkForColon(props, typ); err != nil {
  439. return err
  440. }
  441. rep := desc.repeated()
  442. // Read the extension structure, and set it in
  443. // the value we're constructing.
  444. var ext reflect.Value
  445. if !rep {
  446. ext = reflect.New(typ).Elem()
  447. } else {
  448. ext = reflect.New(typ.Elem()).Elem()
  449. }
  450. if err := p.readAny(ext, props); err != nil {
  451. return err
  452. }
  453. ep := sv.Addr().Interface().(extendableProto)
  454. if !rep {
  455. SetExtension(ep, desc, ext.Interface())
  456. } else {
  457. old, err := GetExtension(ep, desc)
  458. var sl reflect.Value
  459. if err == nil {
  460. sl = reflect.ValueOf(old) // existing slice
  461. } else {
  462. sl = reflect.MakeSlice(typ, 0, 1)
  463. }
  464. sl = reflect.Append(sl, ext)
  465. SetExtension(ep, desc, sl.Interface())
  466. }
  467. } else {
  468. // This is a normal, non-extension field.
  469. fi, props, ok := structFieldByName(st, tok.value)
  470. if !ok {
  471. return p.errorf("unknown field name %q in %v", tok.value, st)
  472. }
  473. dst := sv.Field(fi)
  474. isDstNil := isNil(dst)
  475. // Check that it's not already set if it's not a repeated field.
  476. if !props.Repeated && !isDstNil {
  477. return p.errorf("non-repeated field %q was repeated", tok.value)
  478. }
  479. if err := p.checkForColon(props, st.Field(fi).Type); err != nil {
  480. return err
  481. }
  482. // Parse into the field.
  483. if err := p.readAny(dst, props); err != nil {
  484. return err
  485. }
  486. if props.Required {
  487. reqCount--
  488. }
  489. }
  490. // For backward compatibility, permit a semicolon or comma after a field.
  491. tok = p.next()
  492. if tok.err != nil {
  493. return tok.err
  494. }
  495. if tok.value != ";" && tok.value != "," {
  496. p.back()
  497. }
  498. }
  499. if reqCount > 0 {
  500. return p.missingRequiredFieldError(sv)
  501. }
  502. return nil
  503. }
  504. func (p *textParser) readAny(v reflect.Value, props *Properties) *ParseError {
  505. tok := p.next()
  506. if tok.err != nil {
  507. return tok.err
  508. }
  509. if tok.value == "" {
  510. return p.errorf("unexpected EOF")
  511. }
  512. switch fv := v; fv.Kind() {
  513. case reflect.Slice:
  514. at := v.Type()
  515. if at.Elem().Kind() == reflect.Uint8 {
  516. // Special case for []byte
  517. if tok.value[0] != '"' && tok.value[0] != '\'' {
  518. // Deliberately written out here, as the error after
  519. // this switch statement would write "invalid []byte: ...",
  520. // which is not as user-friendly.
  521. return p.errorf("invalid string: %v", tok.value)
  522. }
  523. bytes := []byte(tok.unquoted)
  524. fv.Set(reflect.ValueOf(bytes))
  525. return nil
  526. }
  527. // Repeated field. May already exist.
  528. flen := fv.Len()
  529. if flen == fv.Cap() {
  530. nav := reflect.MakeSlice(at, flen, 2*flen+1)
  531. reflect.Copy(nav, fv)
  532. fv.Set(nav)
  533. }
  534. fv.SetLen(flen + 1)
  535. // Read one.
  536. p.back()
  537. return p.readAny(fv.Index(flen), props)
  538. case reflect.Bool:
  539. // Either "true", "false", 1 or 0.
  540. switch tok.value {
  541. case "true", "1":
  542. fv.SetBool(true)
  543. return nil
  544. case "false", "0":
  545. fv.SetBool(false)
  546. return nil
  547. }
  548. case reflect.Float32, reflect.Float64:
  549. v := tok.value
  550. // Ignore 'f' for compatibility with output generated by C++, but don't
  551. // remove 'f' when the value is "-inf" or "inf".
  552. if strings.HasSuffix(v, "f") && tok.value != "-inf" && tok.value != "inf" {
  553. v = v[:len(v)-1]
  554. }
  555. if f, err := strconv.ParseFloat(v, fv.Type().Bits()); err == nil {
  556. fv.SetFloat(f)
  557. return nil
  558. }
  559. case reflect.Int32:
  560. if x, err := strconv.ParseInt(tok.value, 0, 32); err == nil {
  561. fv.SetInt(x)
  562. return nil
  563. }
  564. if len(props.Enum) == 0 {
  565. break
  566. }
  567. m, ok := enumValueMaps[props.Enum]
  568. if !ok {
  569. break
  570. }
  571. x, ok := m[tok.value]
  572. if !ok {
  573. break
  574. }
  575. fv.SetInt(int64(x))
  576. return nil
  577. case reflect.Int64:
  578. if x, err := strconv.ParseInt(tok.value, 0, 64); err == nil {
  579. fv.SetInt(x)
  580. return nil
  581. }
  582. case reflect.Ptr:
  583. // A basic field (indirected through pointer), or a repeated message/group
  584. p.back()
  585. fv.Set(reflect.New(fv.Type().Elem()))
  586. return p.readAny(fv.Elem(), props)
  587. case reflect.String:
  588. if tok.value[0] == '"' || tok.value[0] == '\'' {
  589. fv.SetString(tok.unquoted)
  590. return nil
  591. }
  592. case reflect.Struct:
  593. var terminator string
  594. switch tok.value {
  595. case "{":
  596. terminator = "}"
  597. case "<":
  598. terminator = ">"
  599. default:
  600. return p.errorf("expected '{' or '<', found %q", tok.value)
  601. }
  602. return p.readStruct(fv, terminator)
  603. case reflect.Uint32:
  604. if x, err := strconv.ParseUint(tok.value, 0, 32); err == nil {
  605. fv.SetUint(uint64(x))
  606. return nil
  607. }
  608. case reflect.Uint64:
  609. if x, err := strconv.ParseUint(tok.value, 0, 64); err == nil {
  610. fv.SetUint(x)
  611. return nil
  612. }
  613. }
  614. return p.errorf("invalid %v: %v", v.Type(), tok.value)
  615. }
  616. // UnmarshalText reads a protocol buffer in Text format. UnmarshalText resets pb
  617. // before starting to unmarshal, so any existing data in pb is always removed.
  618. func UnmarshalText(s string, pb Message) error {
  619. pb.Reset()
  620. v := reflect.ValueOf(pb)
  621. if pe := newTextParser(s).readStruct(v.Elem(), ""); pe != nil {
  622. return pe
  623. }
  624. return nil
  625. }