From af0c94af7c5eb5e66fa8ce9ccbc3c21094907b24 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Tue, 18 Oct 2022 09:13:28 +0100 Subject: [PATCH 1/3] acl: correctly resolve ACL roles within client cache. The client ACL cache was not accounting for tokens which included ACL role links. This change modifies the behaviour to resolve role links to policies. It will also now store ACL roles within the cache for quick lookup. The cache TTL is configurable in the same manner as policies or tokens. Another small fix is included that takes into account the ACL token expiry time. This was not included, which meant tokens with expiry could be used past the expiry time, until they were GC'd. --- client/acl.go | 145 ++++++++++++++++++++++++++++- client/acl_test.go | 63 +++++++++++++ client/config/config.go | 4 + command/agent/agent.go | 1 + command/agent/config.go | 13 +++ command/agent/config_parse.go | 1 + command/agent/config_parse_test.go | 2 + command/agent/config_test.go | 2 + command/agent/testdata/basic.hcl | 1 + command/agent/testdata/basic.json | 1 + 10 files changed, 231 insertions(+), 2 deletions(-) diff --git a/client/acl.go b/client/acl.go index bf666fbaa05..bdc147aaed7 100644 --- a/client/acl.go +++ b/client/acl.go @@ -21,6 +21,11 @@ const ( // tokenCacheSize is the number of ACL tokens to keep cached. Tokens have a fetching cost, // so we keep the hot tokens cached to reduce the lookups. tokenCacheSize = 64 + + // roleCacheSize is the number of ACL roles to keep cached. Looking up + // roles requires an RPC call, so we keep the hot roles cached to reduce + // the number of lookups. + roleCacheSize = 64 ) // clientACLResolver holds the state required for client resolution @@ -34,6 +39,10 @@ type clientACLResolver struct { // tokenCache is used to maintain the fetched token objects tokenCache *lru.TwoQueueCache + + // roleCache is used to maintain a cache of the fetched ACL roles. Each + // entry is keyed by the role ID. + roleCache *lru.TwoQueueCache } // init is used to setup the client resolver state @@ -52,13 +61,19 @@ func (c *clientACLResolver) init() error { if err != nil { return err } + c.roleCache, err = lru.New2Q(roleCacheSize) + if err != nil { + return err + } return nil } -// cachedACLValue is used to manage ACL Token or Policy TTLs +// cachedACLValue is used to manage ACL Token, Policy, or Role cache entries +// and their TTLs. type cachedACLValue struct { Token *structs.ACLToken Policy *structs.ACLPolicy + Role *structs.ACLRole CacheTime time.Time } @@ -94,14 +109,27 @@ func (c *Client) resolveTokenAndACL(secretID string) (*acl.ACL, *structs.ACLToke if token == nil { return nil, nil, structs.ErrTokenNotFound } + if token.IsExpired(time.Now()) { + return nil, nil, structs.ErrTokenExpired + } // Check if this is a management token if token.Type == structs.ACLManagementToken { return acl.ManagementACL, token, nil } + // Resolve the policy links within the token ACL roles. + policyNames, err := c.resolveTokenACLRoles(secretID, token.Roles) + if err != nil { + return nil, nil, err + } + + // Generate a slice of all policy names included within the token, taken + // from both the ACL roles and the direct assignments. + policyNames = append(policyNames, token.Policies...) + // Resolve the policies - policies, err := c.resolvePolicies(token.SecretID, token.Policies) + policies, err := c.resolvePolicies(token.SecretID, policyNames) if err != nil { return nil, nil, err } @@ -227,3 +255,116 @@ func (c *Client) resolvePolicies(secretID string, policies []string) ([]*structs // Return the valid policies return out, nil } + +// resolveTokenACLRoles is used to unpack an ACL roles and their policy +// assignments into a list of ACL policy names. This can then be used to +// compile an ACL object. +// +// When roles need to be looked up from state via server RPC, we may use the +// expired cache version. This can only occur if we can fully resolve the role +// via the cache. +func (c *Client) resolveTokenACLRoles(secretID string, roleLinks []*structs.ACLTokenRoleLink) ([]string, error) { + + var ( + // policyNames tracks the resolved ACL policies which are linked to the + // role. This is the output object and represents the authorisation + // this role provides token bearers. + policyNames []string + + // missingRoleIDs are the roles linked which are not found within our + // cache. These must be looked up from the server via and RPC, so we + // can correctly identify the policy links. + missingRoleIDs []string + + // expiredRoleIDs are the roles linked which have been found within our + // cache, but are expired. These must be looked up from the server via + // and RPC, so we can correctly identify the policy links. + expiredRoleIDs []string + ) + + for _, roleLink := range roleLinks { + + // Look within the cache to see if the role is already present. If we + // do not find it, add the ID to our tracking, so we look this up via + // RPC. + raw, ok := c.roleCache.Get(roleLink.ID) + if !ok { + missingRoleIDs = append(missingRoleIDs, roleLink.ID) + continue + } + + // If the cached value is expired, add the ID to our tracking, so we + // look this up via RPC. Otherwise, iterate the policy links and add + // each policy name to our return object tracking. + cached := raw.(*cachedACLValue) + if cached.Age() <= c.GetConfig().ACLRoleTTL { + for _, policyLink := range cached.Role.Policies { + policyNames = append(policyNames, policyLink.Name) + } + } else { + expiredRoleIDs = append(expiredRoleIDs, cached.Role.ID) + } + } + + // Hot-path: we were able to resolve all ACL roles via the cache and + // generate a list of linked policy names. Therefore, we can avoid making + // any RPC calls. + if len(missingRoleIDs)+len(expiredRoleIDs) == 0 { + return policyNames, nil + } + + // Created a combined list of role IDs that we need to lookup from server + // state. + roleIDsToFetch := missingRoleIDs + roleIDsToFetch = append(roleIDsToFetch, expiredRoleIDs...) + + // Generate an RPC request to detail all the ACL roles that we did not find + // or were expired within the cache. + roleByIDReq := structs.ACLRolesByIDRequest{ + ACLRoleIDs: roleIDsToFetch, + QueryOptions: structs.QueryOptions{ + Region: c.Region(), + AuthToken: secretID, + AllowStale: true, + }, + } + + var roleByIDResp structs.ACLRolesByIDResponse + + // Perform the RPC call to detail the required ACL roles. If the RPC call + // fails, and we are only updating expired cache entries, use the expired + // entries. This allows use to handle intermittent failures. + err := c.RPC(structs.ACLGetRolesByIDRPCMethod, &roleByIDReq, &roleByIDResp) + if err != nil { + if len(missingRoleIDs) == 0 { + c.logger.Warn("failed to resolve ACL roles, using expired cached value", "error", err) + for _, aclRole := range roleByIDResp.ACLRoles { + for _, rolePolicyLink := range aclRole.Policies { + policyNames = append(policyNames, rolePolicyLink.Name) + } + } + return policyNames, nil + } + return nil, err + } + + // Generate a timestamp for the cache entry. We do not need to use a + // timestamp per ACL role response integration. + now := time.Now() + + for _, aclRole := range roleByIDResp.ACLRoles { + + // Add an entry to the cache using the generated timestamp for future + // expiry calculations. Any existing, expired entry will be + // overwritten. + c.roleCache.Add(aclRole.ID, &cachedACLValue{Role: aclRole, CacheTime: now}) + + // Iterate the role policy links, extracting the name and adding this + // to our return response tracking. + for _, rolePolicyLink := range aclRole.Policies { + policyNames = append(policyNames, rolePolicyLink.Name) + } + } + + return policyNames, nil +} diff --git a/client/acl_test.go b/client/acl_test.go index b1cc0a3151e..4662a326555 100644 --- a/client/acl_test.go +++ b/client/acl_test.go @@ -2,17 +2,29 @@ package client import ( "testing" + "time" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/config" + "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) +func Test_clientACLResolver_init(t *testing.T) { + resolver := &clientACLResolver{} + require.NoError(t, resolver.init()) + require.NotNil(t, resolver.aclCache) + require.NotNil(t, resolver.policyCache) + require.NotNil(t, resolver.tokenCache) + require.NotNil(t, resolver.roleCache) +} + func TestClient_ACL_resolveTokenValue(t *testing.T) { ci.Parallel(t) @@ -106,6 +118,47 @@ func TestClient_ACL_resolvePolicies(t *testing.T) { } } +func TestClient_resolveTokenACLRoles(t *testing.T) { + ci.Parallel(t) + + testServer, _, rootACLToken, testServerCleanupS1 := testACLServer(t, nil) + defer testServerCleanupS1() + testutil.WaitForLeader(t, testServer.RPC) + + testClient, cleanup := TestClient(t, func(c *config.Config) { + c.RPCHandler = testServer + c.ACLEnabled = true + }) + defer cleanup() + + // Create an ACL Role and a client token which is linked to this. + mockACLRole := mock.ACLRole() + + mockACLToken := mock.ACLToken() + mockACLToken.Policies = []string{} + mockACLToken.Roles = []*structs.ACLTokenRoleLink{{ID: mockACLRole.ID}} + + err := testServer.State().UpsertACLRoles(structs.MsgTypeTestSetup, 10, []*structs.ACLRole{mockACLRole}, true) + require.NoError(t, err) + err = testServer.State().UpsertACLTokens(structs.MsgTypeTestSetup, 20, []*structs.ACLToken{mockACLToken}) + require.NoError(t, err) + + // Resolve the ACL policies linked via the role. + resolvedRoles1, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) + require.NoError(t, err) + require.Len(t, resolvedRoles1, 2) + + // Test the cache directly and check that the ACL role previously queried + // is now cached. + require.Equal(t, 1, testClient.roleCache.Len()) + require.True(t, testClient.roleCache.Contains(mockACLRole.ID)) + + // Resolve the roles again to check we get the same results. + resolvedRoles2, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) + require.NoError(t, err) + require.ElementsMatch(t, resolvedRoles1, resolvedRoles2) +} + func TestClient_ACL_ResolveToken_Disabled(t *testing.T) { ci.Parallel(t) @@ -199,4 +252,14 @@ func TestClient_ACL_ResolveSecretToken(t *testing.T) { assert.NotEmpty(t, respToken.AccessorID) } + // Create and upset a token which has just expired. + mockExpiredToken := mock.ACLToken() + mockExpiredToken.ExpirationTime = pointer.Of(time.Now().Add(-5 * time.Minute)) + + err = s1.State().UpsertACLTokens(structs.MsgTypeTestSetup, 120, []*structs.ACLToken{mockExpiredToken}) + require.NoError(t, err) + + expiredTokenResp, err := c1.ResolveSecretToken(mockExpiredToken.SecretID) + require.Nil(t, expiredTokenResp) + require.ErrorContains(t, err, "ACL token expired") } diff --git a/client/config/config.go b/client/config/config.go index fa77df31f0c..646dc1fca19 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -209,6 +209,10 @@ type Config struct { // ACLPolicyTTL is how long we cache policy values for ACLPolicyTTL time.Duration + // ACLRoleTTL is how long we cache ACL role value for within each Nomad + // client. + ACLRoleTTL time.Duration + // DisableRemoteExec disables remote exec targeting tasks on this client DisableRemoteExec bool diff --git a/command/agent/agent.go b/command/agent/agent.go index 6232f9900f2..c6e1cfcd0fd 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -718,6 +718,7 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.ACLEnabled = agentConfig.ACL.Enabled conf.ACLTokenTTL = agentConfig.ACL.TokenTTL conf.ACLPolicyTTL = agentConfig.ACL.PolicyTTL + conf.ACLRoleTTL = agentConfig.ACL.RoleTTL // Setup networking configuration conf.CNIPath = agentConfig.Client.CNIPath diff --git a/command/agent/config.go b/command/agent/config.go index df7dc91425e..1266ad26d0f 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -376,6 +376,12 @@ type ACLConfig struct { PolicyTTL time.Duration PolicyTTLHCL string `hcl:"policy_ttl" json:"-"` + // RoleTTL controls how long we cache ACL roles. This controls how stale + // they can be when we are enforcing policies. Defaults to "30s". + // Reducing this impacts performance by forcing more frequent resolution. + RoleTTL time.Duration + RoleTTLHCL string `hcl:"role_ttl" json:"-"` + // ReplicationToken is used by servers to replicate tokens and policies // from the authoritative region. This must be a valid management token // within the authoritative region. @@ -1250,6 +1256,7 @@ func DefaultConfig() *Config { Enabled: false, TokenTTL: 30 * time.Second, PolicyTTL: 30 * time.Second, + RoleTTL: 30 * time.Second, }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ @@ -1755,6 +1762,12 @@ func (a *ACLConfig) Merge(b *ACLConfig) *ACLConfig { if b.PolicyTTLHCL != "" { result.PolicyTTLHCL = b.PolicyTTLHCL } + if b.RoleTTL != 0 { + result.RoleTTL = b.RoleTTL + } + if b.RoleTTLHCL != "" { + result.RoleTTLHCL = b.RoleTTLHCL + } if b.TokenMinExpirationTTL != 0 { result.TokenMinExpirationTTL = b.TokenMinExpirationTTL } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 5004aee0e07..8bacdc836a5 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -66,6 +66,7 @@ func ParseConfigFile(path string) (*Config, error) { {"gc_interval", &c.Client.GCInterval, &c.Client.GCIntervalHCL, nil}, {"acl.token_ttl", &c.ACL.TokenTTL, &c.ACL.TokenTTLHCL, nil}, {"acl.policy_ttl", &c.ACL.PolicyTTL, &c.ACL.PolicyTTLHCL, nil}, + {"acl.policy_ttl", &c.ACL.RoleTTL, &c.ACL.RoleTTLHCL, nil}, {"acl.token_min_expiration_ttl", &c.ACL.TokenMinExpirationTTL, &c.ACL.TokenMinExpirationTTLHCL, nil}, {"acl.token_max_expiration_ttl", &c.ACL.TokenMaxExpirationTTL, &c.ACL.TokenMaxExpirationTTLHCL, nil}, {"client.server_join.retry_interval", &c.Client.ServerJoin.RetryInterval, &c.Client.ServerJoin.RetryIntervalHCL, nil}, diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 4ba63eb61f7..94d60cdf691 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -155,6 +155,8 @@ var basicConfig = &Config{ TokenTTLHCL: "60s", PolicyTTL: 60 * time.Second, PolicyTTLHCL: "60s", + RoleTTLHCL: "60s", + RoleTTL: 60 * time.Second, TokenMinExpirationTTLHCL: "1h", TokenMinExpirationTTL: 1 * time.Hour, TokenMaxExpirationTTLHCL: "100h", diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 42ec370a087..f63538ab3e3 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -158,6 +158,7 @@ func TestConfig_Merge(t *testing.T) { Enabled: true, TokenTTL: 60 * time.Second, PolicyTTL: 60 * time.Second, + RoleTTL: 60 * time.Second, TokenMinExpirationTTL: 60 * time.Second, TokenMaxExpirationTTL: 60 * time.Second, ReplicationToken: "foo", @@ -360,6 +361,7 @@ func TestConfig_Merge(t *testing.T) { Enabled: true, TokenTTL: 20 * time.Second, PolicyTTL: 20 * time.Second, + RoleTTL: 20 * time.Second, TokenMinExpirationTTL: 20 * time.Second, TokenMaxExpirationTTL: 20 * time.Second, ReplicationToken: "foobar", diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index 1f614b83d1d..55b9977b5b6 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -163,6 +163,7 @@ acl { enabled = true token_ttl = "60s" policy_ttl = "60s" + role_ttl = "60s" token_min_expiration_ttl = "1h" token_max_expiration_ttl = "100h" replication_token = "foobar" diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index b0e2850ee0c..990400acf2d 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -5,6 +5,7 @@ "policy_ttl": "60s", "replication_token": "foobar", "token_ttl": "60s", + "role_ttl": "60s", "token_min_expiration_ttl": "1h", "token_max_expiration_ttl": "100h" } From 9213e17545b761c6a09eca6c4d5078e77277807c Mon Sep 17 00:00:00 2001 From: James Rasell Date: Tue, 18 Oct 2022 09:40:25 +0100 Subject: [PATCH 2/3] docs: add changelog and website changes for #14922 --- .changelog/14922.txt | 7 +++++++ website/content/docs/configuration/acl.mdx | 7 +++++++ 2 files changed, 14 insertions(+) create mode 100644 .changelog/14922.txt diff --git a/.changelog/14922.txt b/.changelog/14922.txt new file mode 100644 index 00000000000..6fac2606cc8 --- /dev/null +++ b/.changelog/14922.txt @@ -0,0 +1,7 @@ +```release-note:bug +client: Resolve ACL roles within client ACL cache +``` + +```release-note:bug +client: Check ACL token expiry when resolving token within ACL cache +``` diff --git a/website/content/docs/configuration/acl.mdx b/website/content/docs/configuration/acl.mdx index 9cfd8abebfa..530623ac1d7 100644 --- a/website/content/docs/configuration/acl.mdx +++ b/website/content/docs/configuration/acl.mdx @@ -19,6 +19,7 @@ acl { enabled = true token_ttl = "30s" policy_ttl = "60s" + role_ttl = "60s" } ``` @@ -42,6 +43,12 @@ acl { the request load against servers. If a client cannot reach a server, for example because of an outage, the TTL will be ignored and the cached value used. +- `role_ttl` `(string: "30s")` - Specifies the maximum time-to-live (TTL) for + cached ACL roles. This does not affect servers, since they do not cache roles. + Setting this value lower reduces how stale a role can be, but increases the + request load against servers. If a client cannot reach a server, for example + because of an outage, the TTL will be ignored and the cached value used. + - `replication_token` `(string: "")` - Specifies the Secret ID of the ACL token to use for replicating policies and tokens. This is used by servers in non-authoritative region to mirror the policies and tokens into the local region from [authoritative_region][authoritative-region]. From 7a96ee0ccd291ee2995e3e7950aced1896e14f49 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Wed, 19 Oct 2022 11:59:58 +0100 Subject: [PATCH 3/3] review: update based on feedback from Tim --- client/acl.go | 5 ++++- client/acl_test.go | 36 ++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/client/acl.go b/client/acl.go index bdc147aaed7..19efb6cd083 100644 --- a/client/acl.go +++ b/client/acl.go @@ -109,7 +109,10 @@ func (c *Client) resolveTokenAndACL(secretID string) (*acl.ACL, *structs.ACLToke if token == nil { return nil, nil, structs.ErrTokenNotFound } - if token.IsExpired(time.Now()) { + + // Give the token expiry some slight leeway in case the client and server + // clocks are skewed. + if token.IsExpired(time.Now().Add(2 * time.Second)) { return nil, nil, structs.ErrTokenExpired } diff --git a/client/acl_test.go b/client/acl_test.go index 4662a326555..7a108b8db89 100644 --- a/client/acl_test.go +++ b/client/acl_test.go @@ -12,17 +12,17 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" ) func Test_clientACLResolver_init(t *testing.T) { resolver := &clientACLResolver{} - require.NoError(t, resolver.init()) - require.NotNil(t, resolver.aclCache) - require.NotNil(t, resolver.policyCache) - require.NotNil(t, resolver.tokenCache) - require.NotNil(t, resolver.roleCache) + must.NoError(t, resolver.init()) + must.NotNil(t, resolver.aclCache) + must.NotNil(t, resolver.policyCache) + must.NotNil(t, resolver.tokenCache) + must.NotNil(t, resolver.roleCache) } func TestClient_ACL_resolveTokenValue(t *testing.T) { @@ -139,24 +139,24 @@ func TestClient_resolveTokenACLRoles(t *testing.T) { mockACLToken.Roles = []*structs.ACLTokenRoleLink{{ID: mockACLRole.ID}} err := testServer.State().UpsertACLRoles(structs.MsgTypeTestSetup, 10, []*structs.ACLRole{mockACLRole}, true) - require.NoError(t, err) + must.NoError(t, err) err = testServer.State().UpsertACLTokens(structs.MsgTypeTestSetup, 20, []*structs.ACLToken{mockACLToken}) - require.NoError(t, err) + must.NoError(t, err) // Resolve the ACL policies linked via the role. resolvedRoles1, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) - require.NoError(t, err) - require.Len(t, resolvedRoles1, 2) + must.NoError(t, err) + must.Len(t, 2, resolvedRoles1) // Test the cache directly and check that the ACL role previously queried // is now cached. - require.Equal(t, 1, testClient.roleCache.Len()) - require.True(t, testClient.roleCache.Contains(mockACLRole.ID)) + must.Eq(t, 1, testClient.roleCache.Len()) + must.True(t, testClient.roleCache.Contains(mockACLRole.ID)) // Resolve the roles again to check we get the same results. resolvedRoles2, err := testClient.resolveTokenACLRoles(rootACLToken.SecretID, mockACLToken.Roles) - require.NoError(t, err) - require.ElementsMatch(t, resolvedRoles1, resolvedRoles2) + must.NoError(t, err) + must.SliceContainsAll(t, resolvedRoles1, resolvedRoles2) } func TestClient_ACL_ResolveToken_Disabled(t *testing.T) { @@ -252,14 +252,14 @@ func TestClient_ACL_ResolveSecretToken(t *testing.T) { assert.NotEmpty(t, respToken.AccessorID) } - // Create and upset a token which has just expired. + // Create and upsert a token which has just expired. mockExpiredToken := mock.ACLToken() mockExpiredToken.ExpirationTime = pointer.Of(time.Now().Add(-5 * time.Minute)) err = s1.State().UpsertACLTokens(structs.MsgTypeTestSetup, 120, []*structs.ACLToken{mockExpiredToken}) - require.NoError(t, err) + must.NoError(t, err) expiredTokenResp, err := c1.ResolveSecretToken(mockExpiredToken.SecretID) - require.Nil(t, expiredTokenResp) - require.ErrorContains(t, err, "ACL token expired") + must.Nil(t, expiredTokenResp) + must.StrContains(t, err.Error(), "ACL token expired") }