From a6ecf24a6d716d933bcbc255a2f5d492285b54f5 Mon Sep 17 00:00:00 2001
From: Gustavo Niemeyer
Date: Tue, 21 Jan 2020 17:51:48 +0000
Subject: [PATCH] Port CJ's simple_keys optimization to v3 (#555)

Message from original commit (53403b5):

This change introduces an index to look up token numbers referenced by
simple_keys in O(1), thus significantly reducing the performance impact
of certain abusively constructed snippets.

When we build up the simple_keys stack, we count on the (formerly named)
staleness check to catch errors where a simple key is required but would
be > 1024 chars or span lines. The previous simplification, which
searches the stack from the top, can go 1024 keys deep before finding a
"stale" key and stopping. I added a test that shows that this consumes
~3s per 1MB of document size.
---
 limit_test.go | 15 +++++++++++++++
 scannerc.go   | 46 ++++++++++++++++++++--------------------------
 yamlh.go      |  1 +
 3 files changed, 36 insertions(+), 26 deletions(-)
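Reviewer note: below is a minimal, self-contained sketch of the pattern this
commit applies. It is not the yaml.v3 code itself; simpleKey and keyIndex are
illustrative stand-ins for parser.simple_keys and parser.simple_keys_by_tok.
The idea is to pair the stack of possible simple keys with a map from token
number to stack index, so that asking "is a possible simple key waiting at the
head token?" becomes an O(1) map lookup instead of an O(depth) scan from the
top of the stack.

    package main

    import "fmt"

    // simpleKey mirrors just the fields of yaml_simple_key_t that matter here.
    type simpleKey struct {
    	possible    bool
    	tokenNumber int
    }

    // keyIndex keeps a stack of possible simple keys plus a map from token
    // number to stack index, so membership checks are O(1).
    type keyIndex struct {
    	stack []simpleKey
    	byTok map[int]int // token number -> index into stack
    }

    func (k *keyIndex) push(sk simpleKey) {
    	k.stack = append(k.stack, sk)
    	k.byTok[sk.tokenNumber] = len(k.stack) - 1
    }

    func (k *keyIndex) pop() {
    	last := len(k.stack) - 1
    	// Keep the map in sync with the stack, as the patch does in
    	// yaml_parser_decrease_flow_level.
    	delete(k.byTok, k.stack[last].tokenNumber)
    	k.stack = k.stack[:last]
    }

    // atToken reports whether a possible simple key is waiting at token n.
    func (k *keyIndex) atToken(n int) (simpleKey, bool) {
    	if i, ok := k.byTok[n]; ok {
    		return k.stack[i], true
    	}
    	return simpleKey{}, false
    }

    func main() {
    	idx := keyIndex{byTok: make(map[int]int)}
    	for tok := 0; tok < 5; tok++ {
    		idx.push(simpleKey{possible: true, tokenNumber: tok})
    	}
    	if sk, ok := idx.atToken(3); ok {
    		fmt.Println("key at token 3, possible:", sk.possible) // O(1) lookup
    	}
    	idx.pop() // removes token 4 from both stack and map
    	if _, ok := idx.atToken(4); !ok {
    		fmt.Println("token 4 gone after pop")
    	}
    }

The essential discipline, visible in every hunk below, is that each mutation
of the stack mirrors into the map: saving a key adds an entry, and removing,
consuming, or popping a key deletes its entry.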
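The ~3s per 1MB figure should be checkable with the benchmarks this patch adds
to limit_test.go. They are plain testing.B functions, so with the patch applied
something like the following is expected to exercise just the new worst-case
inputs (a hedged example, assuming the standard Go toolchain; -run '^$' skips
the regular test suite):

    go test -run '^$' -bench 'DeepSlice|DeepFlow|MaxDepthNested' .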
diff --git a/limit_test.go b/limit_test.go
index ee0f3160..07a3cbd4 100644
--- a/limit_test.go
+++ b/limit_test.go
@@ -37,6 +37,9 @@ var limitTests = []struct {
 	{name: "10kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 10*1024/4-1) + `]`)},
 	{name: "100kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 100*1024/4-1) + `]`)},
 	{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
+	{name: "1000kb slice nested at max-depth", data: []byte(strings.Repeat(`[`, 10000) + `1` + strings.Repeat(`,1`, 1000*1024/2-20000-1) + strings.Repeat(`]`, 10000))},
+	{name: "1000kb slice nested in maps at max-depth", data: []byte("{a,b:\n" + strings.Repeat(" {a,b:", 10000-2) + ` [1` + strings.Repeat(",1", 1000*1024/2-6*10000-1) + `]` + strings.Repeat(`}`, 10000-1))},
+	{name: "1000kb of 10000-nested lines", data: []byte(strings.Repeat(`- `+strings.Repeat(`[`, 10000)+strings.Repeat(`]`, 10000)+"\n", 1000*1024/20000))},
 }
 
 func (s *S) TestLimits(c *C) {
@@ -82,6 +85,18 @@ func Benchmark1000KBMaps(b *testing.B) {
 	benchmark(b, "1000kb of maps")
 }
 
+func BenchmarkDeepSlice(b *testing.B) {
+	benchmark(b, "1000kb slice nested at max-depth")
+}
+
+func BenchmarkDeepFlow(b *testing.B) {
+	benchmark(b, "1000kb slice nested in maps at max-depth")
+}
+
+func Benchmark1000KBMaxDepthNested(b *testing.B) {
+	benchmark(b, "1000kb of 10000-nested lines")
+}
+
 func benchmark(b *testing.B, name string) {
 	for _, t := range limitTests {
 		if t.name != name {
diff --git a/scannerc.go b/scannerc.go
index 359855fc..57e954ca 100644
--- a/scannerc.go
+++ b/scannerc.go
@@ -657,35 +657,22 @@ func trace(args ...interface{}) func() {
 func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool {
 	// While we need more tokens to fetch, do it.
 	for {
-		// Check if we really need to fetch more tokens.
-		need_more_tokens := false
-
 		// [Go] The comment parsing logic requires a lookahead of two tokens
 		// so that foot comments may be parsed in time of associating them
 		// with the tokens that are parsed before them, and also for line
 		// comments to be transformed into head comments in some edge cases.
-		if parser.tokens_head >= len(parser.tokens)-2 {
-			need_more_tokens = true
-		} else {
-			// Check if any potential simple key may occupy the head position.
-			for i := len(parser.simple_keys) - 1; i >= 0; i-- {
-				simple_key := &parser.simple_keys[i]
-				if simple_key.token_number < parser.tokens_parsed {
-					break
-				}
-				if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok {
-					return false
-				} else if valid && simple_key.token_number == parser.tokens_parsed {
-					need_more_tokens = true
-					break
-				}
+		if parser.tokens_head < len(parser.tokens)-2 {
+			// If a potential simple key is at the head position, we need to fetch
+			// the next token to disambiguate it.
+			head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed]
+			if !ok {
+				break
+			} else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok {
+				return false
+			} else if !valid {
+				break
 			}
 		}
-
-		// We are finished.
-		if !need_more_tokens {
-			break
-		}
+
 		// Fetch the next token.
 		if !yaml_parser_fetch_next_token(parser) {
 			return false
@@ -938,6 +925,7 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool {
 			return false
 		}
 		parser.simple_keys[len(parser.simple_keys)-1] = simple_key
+		parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1
 	}
 	return true
 }
@@ -952,9 +940,10 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
 				"while scanning a simple key", parser.simple_keys[i].mark,
 				"could not find expected ':'")
 		}
+		// Remove the key from the stack.
+		parser.simple_keys[i].possible = false
+		delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number)
 	}
-	// Remove the key from the stack.
-	parser.simple_keys[i].possible = false
 	return true
 }
 
@@ -985,7 +974,9 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
 	if parser.flow_level > 0 {
 		parser.flow_level--
-		parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1]
+		last := len(parser.simple_keys) - 1
+		delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number)
+		parser.simple_keys = parser.simple_keys[:last]
 	}
 	return true
 }
@@ -1092,6 +1083,8 @@ func yaml_parser_fetch_stream_start(parser *yaml_parser_t) bool {
 	// Initialize the simple key stack.
 	parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
 
+	parser.simple_keys_by_tok = make(map[int]int)
+
 	// A simple key is allowed at the beginning of the stream.
 	parser.simple_key_allowed = true
 
@@ -1396,6 +1389,7 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool {
 		// Remove the simple key.
 		simple_key.possible = false
+		delete(parser.simple_keys_by_tok, simple_key.token_number)
 
 		// A simple key cannot follow another simple key.
 		parser.simple_key_allowed = false
 
diff --git a/yamlh.go b/yamlh.go
index d5ea07c7..2719cfbb 100644
--- a/yamlh.go
+++ b/yamlh.go
@@ -622,6 +622,7 @@ type yaml_parser_t struct {
 
 	simple_key_allowed bool                // May a simple key occur at the current position?
 	simple_keys        []yaml_simple_key_t // The stack of simple keys.
+	simple_keys_by_tok map[int]int         // possible simple_key indexes indexed by token_number
 
 	// Parser stuff