From a6ecf24a6d716d933bcbc255a2f5d492285b54f5 Mon Sep 17 00:00:00 2001
From: Gustavo Niemeyer
Date: Tue, 21 Jan 2020 17:51:48 +0000
Subject: [PATCH] Port CJ's simple_keys optimization to v3 (#555)

Message from original commit (53403b5):

This change introduces an index to look up token numbers referenced by
simple_keys in O(1), thus significantly reducing the performance impact
of certain abusively constructed snippets.

When we build up the simple_keys stack, we count on the (formerly named)
staleness check to catch errors where a simple key is required but would
be > 1024 chars or span lines. The previous simplification, which
searches the stack from the top, can go 1024 keys deep before finding a
"stale" key and stopping. I added a test that shows that this consumes
~3s per 1MB of document size.
---
 limit_test.go | 15 +++++++++++++++
 scannerc.go   | 46 ++++++++++++++++++++--------------------------
 yamlh.go      |  1 +
 3 files changed, 36 insertions(+), 26 deletions(-)
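Reviewer note: below is a minimal, self-contained sketch of the pattern this
commit applies. It is not the yaml.v3 code itself; simpleKey and keyIndex are
illustrative stand-ins for parser.simple_keys and parser.simple_keys_by_tok.
The idea is to pair the stack of possible simple keys with a map from token
number to stack index, so that asking "is a possible simple key waiting at the
head token?" becomes an O(1) map lookup instead of an O(depth) scan from the
top of the stack.

    package main

    import "fmt"

    // simpleKey mirrors just the fields of yaml_simple_key_t that matter here.
    type simpleKey struct {
    	possible    bool
    	tokenNumber int
    }

    // keyIndex keeps a stack of possible simple keys plus a map from token
    // number to stack index, so membership checks are O(1).
    type keyIndex struct {
    	stack []simpleKey
    	byTok map[int]int // token number -> index into stack
    }

    func (k *keyIndex) push(sk simpleKey) {
    	k.stack = append(k.stack, sk)
    	k.byTok[sk.tokenNumber] = len(k.stack) - 1
    }

    func (k *keyIndex) pop() {
    	last := len(k.stack) - 1
    	// Keep the map in sync with the stack, as the patch does in
    	// yaml_parser_decrease_flow_level.
    	delete(k.byTok, k.stack[last].tokenNumber)
    	k.stack = k.stack[:last]
    }

    // atToken reports whether a possible simple key is waiting at token n.
    func (k *keyIndex) atToken(n int) (simpleKey, bool) {
    	if i, ok := k.byTok[n]; ok {
    		return k.stack[i], true
    	}
    	return simpleKey{}, false
    }

    func main() {
    	idx := keyIndex{byTok: make(map[int]int)}
    	for tok := 0; tok < 5; tok++ {
    		idx.push(simpleKey{possible: true, tokenNumber: tok})
    	}
    	if sk, ok := idx.atToken(3); ok {
    		fmt.Println("key at token 3, possible:", sk.possible) // O(1) lookup
    	}
    	idx.pop() // removes token 4 from both stack and map
    	if _, ok := idx.atToken(4); !ok {
    		fmt.Println("token 4 gone after pop")
    	}
    }

The essential discipline, visible in every hunk below, is that each mutation
of the stack mirrors into the map: saving a key adds an entry, and removing,
consuming, or popping a key deletes its entry.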
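The ~3s per 1MB figure should be checkable with the benchmarks this patch adds
to limit_test.go. They are plain testing.B functions, so with the patch applied
something like the following is expected to exercise just the new worst-case
inputs (a hedged example, assuming the standard Go toolchain; -run '^$' skips
the regular test suite):

    go test -run '^$' -bench 'DeepSlice|DeepFlow|MaxDepthNested' .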
diff --git a/limit_test.go b/limit_test.go
index ee0f3160..07a3cbd4 100644
--- a/limit_test.go
+++ b/limit_test.go
@@ -37,6 +37,9 @@ var limitTests = []struct {
 	{name: "10kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 10*1024/4-1) + `]`)},
 	{name: "100kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 100*1024/4-1) + `]`)},
 	{name: "1000kb of maps", data: []byte(`a: &a [{a}` + strings.Repeat(`,{a}`, 1000*1024/4-1) + `]`)},
+	{name: "1000kb slice nested at max-depth", data: []byte(strings.Repeat(`[`, 10000) + `1` + strings.Repeat(`,1`, 1000*1024/2-20000-1) + strings.Repeat(`]`, 10000))},
+	{name: "1000kb slice nested in maps at max-depth", data: []byte("{a,b:\n" + strings.Repeat(" {a,b:", 10000-2) + ` [1` + strings.Repeat(",1", 1000*1024/2-6*10000-1) + `]` + strings.Repeat(`}`, 10000-1))},
+	{name: "1000kb of 10000-nested lines", data: []byte(strings.Repeat(`- `+strings.Repeat(`[`, 10000)+strings.Repeat(`]`, 10000)+"\n", 1000*1024/20000))},
 }
 
 func (s *S) TestLimits(c *C) {
@@ -82,6 +85,18 @@ func Benchmark1000KBMaps(b *testing.B) {
 	benchmark(b, "1000kb of maps")
 }
 
+func BenchmarkDeepSlice(b *testing.B) {
+	benchmark(b, "1000kb slice nested at max-depth")
+}
+
+func BenchmarkDeepFlow(b *testing.B) {
+	benchmark(b, "1000kb slice nested in maps at max-depth")
+}
+
+func Benchmark1000KBMaxDepthNested(b *testing.B) {
+	benchmark(b, "1000kb of 10000-nested lines")
+}
+
 func benchmark(b *testing.B, name string) {
 	for _, t := range limitTests {
 		if t.name != name {
diff --git a/scannerc.go b/scannerc.go
index 359855fc..57e954ca 100644
--- a/scannerc.go
+++ b/scannerc.go
@@ -657,35 +657,22 @@ func trace(args ...interface{}) func() {
 func yaml_parser_fetch_more_tokens(parser *yaml_parser_t) bool {
 	// While we need more tokens to fetch, do it.
 	for {
-		// Check if we really need to fetch more tokens.
-		need_more_tokens := false
-
 		// [Go] The comment parsing logic requires a lookahead of two tokens
 		// so that foot comments may be parsed in time of associating them
 		// with the tokens that are parsed before them, and also for line
 		// comments to be transformed into head comments in some edge cases.
-		if parser.tokens_head >= len(parser.tokens)-2 {
-			need_more_tokens = true
-		} else {
-			// Check if any potential simple key may occupy the head position.
-			for i := len(parser.simple_keys) - 1; i >= 0; i-- {
-				simple_key := &parser.simple_keys[i]
-				if simple_key.token_number < parser.tokens_parsed {
-					break
-				}
-				if valid, ok := yaml_simple_key_is_valid(parser, simple_key); !ok {
-					return false
-				} else if valid && simple_key.token_number == parser.tokens_parsed {
-					need_more_tokens = true
-					break
-				}
+		if parser.tokens_head < len(parser.tokens)-2 {
+			// If a potential simple key is at the head position, we need to fetch
+			// the next token to disambiguate it.
+			head_tok_idx, ok := parser.simple_keys_by_tok[parser.tokens_parsed]
+			if !ok {
+				break
+			} else if valid, ok := yaml_simple_key_is_valid(parser, &parser.simple_keys[head_tok_idx]); !ok {
+				return false
+			} else if !valid {
+				break
 			}
 		}
-
-		// We are finished.
-		if !need_more_tokens {
-			break
-		}
+
 		// Fetch the next token.
 		if !yaml_parser_fetch_next_token(parser) {
 			return false
@@ -938,6 +925,7 @@ func yaml_parser_save_simple_key(parser *yaml_parser_t) bool {
 			return false
 		}
 		parser.simple_keys[len(parser.simple_keys)-1] = simple_key
+		parser.simple_keys_by_tok[simple_key.token_number] = len(parser.simple_keys) - 1
 	}
 	return true
 }
@@ -952,9 +940,10 @@ func yaml_parser_remove_simple_key(parser *yaml_parser_t) bool {
 				"while scanning a simple key", parser.simple_keys[i].mark,
 				"could not find expected ':'")
 		}
+		// Remove the key from the stack.
+		parser.simple_keys[i].possible = false
+		delete(parser.simple_keys_by_tok, parser.simple_keys[i].token_number)
 	}
-	// Remove the key from the stack.
-	parser.simple_keys[i].possible = false
 	return true
 }
 
@@ -985,7 +974,9 @@ func yaml_parser_increase_flow_level(parser *yaml_parser_t) bool {
 func yaml_parser_decrease_flow_level(parser *yaml_parser_t) bool {
 	if parser.flow_level > 0 {
 		parser.flow_level--
-		parser.simple_keys = parser.simple_keys[:len(parser.simple_keys)-1]
+		last := len(parser.simple_keys) - 1
+		delete(parser.simple_keys_by_tok, parser.simple_keys[last].token_number)
+		parser.simple_keys = parser.simple_keys[:last]
 	}
 	return true
 }
@@ -1092,6 +1083,8 @@ func yaml_parser_fetch_stream_start(parser *yaml_parser_t) bool {
 	// Initialize the simple key stack.
 	parser.simple_keys = append(parser.simple_keys, yaml_simple_key_t{})
 
+	parser.simple_keys_by_tok = make(map[int]int)
+
 	// A simple key is allowed at the beginning of the stream.
 	parser.simple_key_allowed = true
 
@@ -1396,6 +1389,7 @@ func yaml_parser_fetch_value(parser *yaml_parser_t) bool {
 		// Remove the simple key.
 		simple_key.possible = false
+		delete(parser.simple_keys_by_tok, simple_key.token_number)
 
 		// A simple key cannot follow another simple key.
 		parser.simple_key_allowed = false
 
diff --git a/yamlh.go b/yamlh.go
index d5ea07c7..2719cfbb 100644
--- a/yamlh.go
+++ b/yamlh.go
@@ -622,6 +622,7 @@ type yaml_parser_t struct {
 
 	simple_key_allowed bool                // May a simple key occur at the current position?
 	simple_keys        []yaml_simple_key_t // The stack of simple keys.
+	simple_keys_by_tok map[int]int         // possible simple_key indexes indexed by token_number
 
 	// Parser stuff