// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

  4. package html
  5. import (
  6. "strings"
  7. )
  8. // parseDoctype parses the data from a DoctypeToken into a name,
  9. // public identifier, and system identifier. It returns a Node whose Type
  10. // is DoctypeNode, whose Data is the name, and which has attributes
  11. // named "system" and "public" for the two identifiers if they were present.
  12. // quirks is whether the document should be parsed in "quirks mode".
  13. func parseDoctype(s string) (n *Node, quirks bool) {
  14. n = &Node{Type: DoctypeNode}
  15. // Find the name.
  16. space := strings.IndexAny(s, whitespace)
  17. if space == -1 {
  18. space = len(s)
  19. }
  20. n.Data = s[:space]
  21. // The comparison to "html" is case-sensitive.
  22. if n.Data != "html" {
  23. quirks = true
  24. }
  25. n.Data = strings.ToLower(n.Data)
  26. s = strings.TrimLeft(s[space:], whitespace)
  27. if len(s) < 6 {
  28. // It can't start with "PUBLIC" or "SYSTEM".
  29. // Ignore the rest of the string.
  30. return n, quirks || s != ""
  31. }
  32. key := strings.ToLower(s[:6])
  33. s = s[6:]
  34. for key == "public" || key == "system" {
  35. s = strings.TrimLeft(s, whitespace)
  36. if s == "" {
  37. break
  38. }
  39. quote := s[0]
  40. if quote != '"' && quote != '\'' {
  41. break
  42. }
  43. s = s[1:]
  44. q := strings.IndexRune(s, rune(quote))
  45. var id string
  46. if q == -1 {
  47. id = s
  48. s = ""
  49. } else {
  50. id = s[:q]
  51. s = s[q+1:]
  52. }
  53. n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
  54. if key == "public" {
  55. key = "system"
  56. } else {
  57. key = ""
  58. }
  59. }
  60. if key != "" || s != "" {
  61. quirks = true
  62. } else if len(n.Attr) > 0 {
  63. if n.Attr[0].Key == "public" {
  64. public := strings.ToLower(n.Attr[0].Val)
  65. switch public {
  66. case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
  67. quirks = true
  68. default:
  69. for _, q := range quirkyIDs {
  70. if strings.HasPrefix(public, q) {
  71. quirks = true
  72. break
  73. }
  74. }
  75. }
  76. // The following two public IDs only cause quirks mode if there is no system ID.
  77. if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
  78. strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
  79. quirks = true
  80. }
  81. }
  82. if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
  83. strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
  84. quirks = true
  85. }
  86. }
  87. return n, quirks
  88. }
  89. // quirkyIDs is a list of public doctype identifiers that cause a document
  90. // to be interpreted in quirks mode. The identifiers should be in lower case.
  91. var quirkyIDs = []string{
  92. "+//silmaril//dtd html pro v0r11 19970101//",
  93. "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
  94. "-//as//dtd html 3.0 aswedit + extensions//",
  95. "-//ietf//dtd html 2.0 level 1//",
  96. "-//ietf//dtd html 2.0 level 2//",
  97. "-//ietf//dtd html 2.0 strict level 1//",
  98. "-//ietf//dtd html 2.0 strict level 2//",
  99. "-//ietf//dtd html 2.0 strict//",
  100. "-//ietf//dtd html 2.0//",
  101. "-//ietf//dtd html 2.1e//",
  102. "-//ietf//dtd html 3.0//",
  103. "-//ietf//dtd html 3.2 final//",
  104. "-//ietf//dtd html 3.2//",
  105. "-//ietf//dtd html 3//",
  106. "-//ietf//dtd html level 0//",
  107. "-//ietf//dtd html level 1//",
  108. "-//ietf//dtd html level 2//",
  109. "-//ietf//dtd html level 3//",
  110. "-//ietf//dtd html strict level 0//",
  111. "-//ietf//dtd html strict level 1//",
  112. "-//ietf//dtd html strict level 2//",
  113. "-//ietf//dtd html strict level 3//",
  114. "-//ietf//dtd html strict//",
  115. "-//ietf//dtd html//",
  116. "-//metrius//dtd metrius presentational//",
  117. "-//microsoft//dtd internet explorer 2.0 html strict//",
  118. "-//microsoft//dtd internet explorer 2.0 html//",
  119. "-//microsoft//dtd internet explorer 2.0 tables//",
  120. "-//microsoft//dtd internet explorer 3.0 html strict//",
  121. "-//microsoft//dtd internet explorer 3.0 html//",
  122. "-//microsoft//dtd internet explorer 3.0 tables//",
  123. "-//netscape comm. corp.//dtd html//",
  124. "-//netscape comm. corp.//dtd strict html//",
  125. "-//o'reilly and associates//dtd html 2.0//",
  126. "-//o'reilly and associates//dtd html extended 1.0//",
  127. "-//o'reilly and associates//dtd html extended relaxed 1.0//",
  128. "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
  129. "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
  130. "-//spyglass//dtd html 2.0 extended//",
  131. "-//sq//dtd html 2.0 hotmetal + extensions//",
  132. "-//sun microsystems corp.//dtd hotjava html//",
  133. "-//sun microsystems corp.//dtd hotjava strict html//",
  134. "-//w3c//dtd html 3 1995-03-24//",
  135. "-//w3c//dtd html 3.2 draft//",
  136. "-//w3c//dtd html 3.2 final//",
  137. "-//w3c//dtd html 3.2//",
  138. "-//w3c//dtd html 3.2s draft//",
  139. "-//w3c//dtd html 4.0 frameset//",
  140. "-//w3c//dtd html 4.0 transitional//",
  141. "-//w3c//dtd html experimental 19960712//",
  142. "-//w3c//dtd html experimental 970421//",
  143. "-//w3c//dtd w3 html//",
  144. "-//w3o//dtd w3 html 3.0//",
  145. "-//webtechs//dtd mozilla html 2.0//",
  146. "-//webtechs//dtd mozilla html//",
  147. }