Просмотр исходного кода

Improve heuristics preventing CPU/memory abuse (#515)

This addresses the following items:

==== Parse time of excessively deep nested or indented documents

Parsing these documents takes non-linear time; limiting stack depth to 10,000 keeps parse times of pathological documents sub-second (~0.25 seconds in benchmarks).

==== Alias node expansion limits

The current limit allows 10,000% expansion, which is too permissive for large documents.

Limiting to 10% expansion for larger documents allows callers to use input size as an effective way to limit resource usage. Continuing to allow larger expansion rates (up to the current 10,000% limit) for smaller documents does not unduly affect memory use.

This change bounds decode operations from alias expansion to ~400,000 operations for small documents (worst-case ~100-150MB) or 10% of the input document for large documents, whichever is greater.
Jordan Liggitt 6 лет назад
Родитель
Коммит
f221b8435c
3 изменённых файла: 165 добавлений и 1 удаление
  1. 123 0
      benchmark_test.go
  2. 26 1
      decode.go
  3. 16 0
      scannerc.go

+ 123 - 0
benchmark_test.go

@@ -0,0 +1,123 @@
+package yaml_test
+
+import (
+	"strings"
+	"testing"
+
+	. "gopkg.in/check.v1"
+	"gopkg.in/yaml.v2"
+)
+
+// testcase describes one YAML input that TestLimits and the benchmarks in
+// this file pass to yaml.Unmarshal, together with the expected outcome.
+type testcase struct {
+	name  string // identifies the case; benchmark looks cases up by this exact string
+	data  []byte // raw YAML document handed to yaml.Unmarshal
+	error string // expected error text; empty means decoding must succeed
+}
+
+// testcases builds and returns the table of inputs shared by TestLimits and
+// the benchmarks. The data is regenerated on every call. Sizes in the case
+// names are approximate and follow from the repeat counts below.
+func testcases() []testcase {
+	return []testcase{
+		{
+			// One anchored list of single-key flow maps (each ",{a}" is 4 bytes,
+			// so ~1000kb in total), followed by 100 aliases to that list under
+			// key b. Expected to be rejected as excessive aliasing.
+			name:  "1000kb of maps with 100 aliases",
+			data:  []byte(`{a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-100) + `], b: &b [*a` + strings.Repeat(`,*a`, 99) + `]}`),
+			error: "yaml: document contains excessive aliasing",
+		},
+		{
+			// 1000*1024 opening "[" with no closing brackets.
+			name:  "1000kb of deeply nested slices",
+			data:  []byte(strings.Repeat(`[`, 1000*1024)),
+			error: "yaml: exceeded max depth of 10000",
+		},
+		{
+			// 1000*1024 opening "{" with no closing braces, as the value of key x.
+			name:  "1000kb of deeply nested maps",
+			data:  []byte("x: " + strings.Repeat(`{`, 1000*1024)),
+			error: "yaml: exceeded max depth of 10000",
+		},
+		{
+			// 1000*1024 "- " entries on a single line.
+			name:  "1000kb of deeply nested indents",
+			data:  []byte(strings.Repeat(`- `, 1000*1024)),
+			error: "yaml: exceeded max depth of 10000",
+		},
+		{
+			// 512 lines, each holding 1000 "- " entries (~2kb per line).
+			// No error is expected for this input.
+			name: "1000kb of 1000-indent lines",
+			data: []byte(strings.Repeat(strings.Repeat(`- `, 1000)+"\n", 1024/2)),
+		},
+		// Anchored lists of single-key flow maps at increasing sizes; the anchor
+		// is never referenced by an alias and no error is expected.
+		{name: "1kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1*1024/4-1) + `]`)},
+		{name: "10kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 10*1024/4-1) + `]`)},
+		{name: "100kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 100*1024/4-1) + `]`)},
+		{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
+	}
+}
+
+// TestLimits decodes every case returned by testcases() and asserts the
+// outcome: when tc.error is non-empty the returned error must match it,
+// otherwise decoding must succeed. The test returns immediately when
+// testing.Short() reports true.
+func (s *S) TestLimits(c *C) {
+	if testing.Short() {
+		return
+	}
+	for _, tc := range testcases() {
+		var v interface{}
+		err := yaml.Unmarshal(tc.data, &v)
+		if len(tc.error) > 0 {
+			// The case name is attached so a failure identifies which input broke.
+			c.Assert(err, ErrorMatches, tc.error, Commentf("testcase: %s", tc.name))
+		} else {
+			c.Assert(err, IsNil, Commentf("testcase: %s", tc.name))
+		}
+	}
+}
+
+// The Benchmark* functions below each run the shared benchmark helper against
+// one named entry of testcases(). The string passed must equal the testcase
+// name exactly, because the helper looks the case up by name.
+
+// Benchmark1000KB100Aliases benchmarks the "1000kb of maps with 100 aliases" case.
+func Benchmark1000KB100Aliases(b *testing.B) {
+	benchmark(b, "1000kb of maps with 100 aliases")
+}
+
+// Benchmark1000KBDeeplyNestedSlices benchmarks the "1000kb of deeply nested slices" case.
+func Benchmark1000KBDeeplyNestedSlices(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested slices")
+}
+
+// Benchmark1000KBDeeplyNestedMaps benchmarks the "1000kb of deeply nested maps" case.
+func Benchmark1000KBDeeplyNestedMaps(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested maps")
+}
+
+// Benchmark1000KBDeeplyNestedIndents benchmarks the "1000kb of deeply nested indents" case.
+func Benchmark1000KBDeeplyNestedIndents(b *testing.B) {
+	benchmark(b, "1000kb of deeply nested indents")
+}
+
+// Benchmark1000KB1000IndentLines benchmarks the "1000kb of 1000-indent lines" case.
+func Benchmark1000KB1000IndentLines(b *testing.B) {
+	benchmark(b, "1000kb of 1000-indent lines")
+}
+
+// Benchmark1KBMaps benchmarks the "1kb of maps" case.
+func Benchmark1KBMaps(b *testing.B) {
+	benchmark(b, "1kb of maps")
+}
+
+// Benchmark10KBMaps benchmarks the "10kb of maps" case.
+func Benchmark10KBMaps(b *testing.B) {
+	benchmark(b, "10kb of maps")
+}
+
+// Benchmark100KBMaps benchmarks the "100kb of maps" case.
+func Benchmark100KBMaps(b *testing.B) {
+	benchmark(b, "100kb of maps")
+}
+
+// Benchmark1000KBMaps benchmarks the "1000kb of maps" case.
+func Benchmark1000KBMaps(b *testing.B) {
+	benchmark(b, "1000kb of maps")
+}
+
+// benchmark runs yaml.Unmarshal b.N times on the testcase whose name equals
+// name and checks the result on every iteration. If no testcase has that
+// name, it reports an error on b and returns without running anything.
+func benchmark(b *testing.B, name string) {
+	var tc testcase
+	for _, t := range testcases() {
+		if t.name == name {
+			tc = t
+			break
+		}
+	}
+	// tc is still the zero value (empty name) when the lookup found nothing.
+	if tc.name != name {
+		b.Errorf("testcase %q not found", name)
+		return
+	}
+
+	// Exclude the testcase construction and lookup above from the measurement.
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		var v interface{}
+		err := yaml.Unmarshal(tc.data, &v)
+		if len(tc.error) > 0 {
+			// Unlike TestLimits, the error text is compared for exact equality.
+			if err == nil {
+				b.Errorf("expected error, got none")
+			} else if err.Error() != tc.error {
+				b.Errorf("expected error '%s', got '%s'", tc.error, err.Error())
+			}
+		} else {
+			if err != nil {
+				b.Errorf("unexpected error: %v", err)
+			}
+		}
+	}
+}

+ 26 - 1
decode.go

@@ -318,12 +318,37 @@ func (d *decoder) prepare(n *node, out reflect.Value) (newout reflect.Value, unm
 	return out, false, false
 }
 
+// Bounds, measured in decode operations, used by allowedAliasRatio to choose
+// how large a share of decode operations may come from alias expansion.
+const (
+	// alias_ratio_range_low: at or below this many decode operations the most
+	// permissive ratio (0.99) applies.
+	// 400,000 decode operations is ~500kb of dense object declarations, or ~5kb of dense object declarations with 10000% alias expansion
+	alias_ratio_range_low = 400000
+	// alias_ratio_range_high: at or above this many decode operations the
+	// strictest ratio (0.10) applies.
+	// 4,000,000 decode operations is ~5MB of dense object declarations, or ~4.5MB of dense object declarations with 10% alias expansion
+	alias_ratio_range_high = 4000000
+	// alias_ratio_range is the width of the interval between the two bounds,
+	// over which the allowed alias ratio is scaled linearly.
+	alias_ratio_range = float64(alias_ratio_range_high - alias_ratio_range_low)
+)
+
+// allowedAliasRatio returns the largest permitted fraction of decode
+// operations that may come from alias expansion, given that decodeCount
+// decode operations have been performed so far. The result is 0.99 up to
+// alias_ratio_range_low, 0.10 from alias_ratio_range_high upward, and a
+// linear interpolation between those two values in between.
+func allowedAliasRatio(decodeCount int) float64 {
+	switch {
+	case decodeCount <= alias_ratio_range_low:
+		// allow 99% to come from alias expansion for small-to-medium documents
+		return 0.99
+	case decodeCount >= alias_ratio_range_high:
+		// allow 10% to come from alias expansion for very large documents
+		return 0.10
+	default:
+		// scale smoothly from 99% down to 10% over the range (0.89 = 0.99 - 0.10).
+		// this maps to 396,000 - 400,000 allowed alias-driven decodes over the range.
+		// 400,000 decode operations is ~100MB of allocations in worst-case scenarios (single-item maps).
+		return 0.99 - 0.89*(float64(decodeCount-alias_ratio_range_low)/alias_ratio_range)
+	}
+}
+
 func (d *decoder) unmarshal(n *node, out reflect.Value) (good bool) {
 	d.decodeCount++
 	if d.aliasDepth > 0 {
 		d.aliasCount++
 	}
-	if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > 0.99 {
+	if d.aliasCount > 100 && d.decodeCount > 1000 && float64(d.aliasCount)/float64(d.decodeCount) > allowedAliasRatio(d.decodeCount) {
 		failf("document contains excessive aliasing")
 	}
 	switch n.kind {

+ 16 - 0
scannerc.go

@@ -906,6 +906,9 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
 	return true
 }
 
+// max_flow_level is the largest value parser.flow_level may reach.
+// yaml_parser_increase_flow_level reports a scanner error
+// ("exceeded max depth of N") once the level goes above it.
+const max_flow_level = 10000
+
 // Increase the flow level and resize the simple key list if needed.
 func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 	// Reset the simple key on the next level.
@@ -913,6 +916,11 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 
 	// Increase the flow level.
 	parser.flow_level++
+	if parser.flow_level > max_flow_level {
+		return yaml_parser_set_scanner_error(parser,
+			"while increasing flow level", parser.simple_keys[len(parser.simple_keys)-1].mark,
+			fmt.Sprintf("exceeded max depth of %d", max_flow_level))
+	}
 	return true
 }
 
@@ -925,6 +933,9 @@ func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
 	return true
 }
 
+// max_indents is the largest length the parser.indents stack may reach.
+// yaml_parser_roll_indent reports a scanner error
+// ("exceeded max depth of N") once the stack grows beyond it.
+const max_indents = 10000
+
 // Push the current indentation level to the stack and set the new level
 // if the current column is greater than the indentation level.  In this case,
 // append or insert the specified token into the token queue.
@@ -939,6 +950,11 @@ func yaml_parser_roll_indent(parser *yaml_parser_t, column, number int, typ yaml
 		// indentation level.
 		parser.indents = append(parser.indents, parser.indent)
 		parser.indent = column
+		if len(parser.indents) > max_indents {
+			return yaml_parser_set_scanner_error(parser,
+				"while increasing indent level", parser.simple_keys[len(parser.simple_keys)-1].mark,
+				fmt.Sprintf("exceeded max depth of %d", max_indents))
+		}
 
 		// Create a token and insert it into the queue.
 		token := yaml_token_t{