From fd20cccb255b2e56bddddc10cef2e8a46ba61168 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 12 Dec 2024 11:52:04 -0500 Subject: [PATCH] cni: use check command when restoring from restart When the Nomad client restarts and restores allocations, the network namespace for an allocation may exist but no longer be correctly configured. For example, if the host is rebooted and the task was a Docker task using a pause container, the network namespace may be recreated by the docker daemon. When we restore an allocation, use the CNI "check" command to verify that any existing network namespace matches the expected configuration. This requires CNI plugins of at least version 1.2.0 to avoid a bug in older plugin versions that would cause the check to fail. If the check fails, destroy the network namespace and try to recreate it from scratch once. If that fails in the second pass, fail the restore so that the allocation can be recreated (rather than silently having networking fail). This should fix the gap left #24650 for Docker task drivers and any other drivers with the `MustInitiateNetwork` capability. Fixes: https://github.com/hashicorp/nomad/issues/24292 Ref: https://github.com/hashicorp/nomad/pull/24650 --- .changelog/24658.txt | 3 + client/allocrunner/network_hook.go | 25 ++- client/allocrunner/network_hook_test.go | 146 +++++++++++++++++- client/allocrunner/networking.go | 8 +- client/allocrunner/networking_bridge_linux.go | 4 +- client/allocrunner/networking_cni.go | 39 ++++- .../allocrunner/networking_cni_mock_test.go | 12 +- client/allocrunner/networking_cni_test.go | 2 +- .../content/docs/upgrade/upgrade-specific.mdx | 13 ++ .../partials/install/install-cni-plugins.mdx | 10 +- 10 files changed, 247 insertions(+), 15 deletions(-) create mode 100644 .changelog/24658.txt diff --git a/.changelog/24658.txt b/.changelog/24658.txt new file mode 100644 index 00000000000..f46da955490 --- /dev/null +++ b/.changelog/24658.txt @@ -0,0 +1,3 @@ +```release-note:bug +networking: check network namespaces on Linux during client restarts and fail the allocation if an existing namespace is invalid +``` diff --git a/client/allocrunner/network_hook.go b/client/allocrunner/network_hook.go index 570943b3795..bddcf1715c7 100644 --- a/client/allocrunner/network_hook.go +++ b/client/allocrunner/network_hook.go @@ -5,6 +5,7 @@ package allocrunner import ( "context" + "errors" "fmt" hclog "github.com/hashicorp/go-hclog" @@ -28,6 +29,8 @@ const ( dockerNetSpecHostnameKey = "docker_sandbox_hostname" ) +var ErrCNICheckFailed = errors.New("network namespace already exists but was misconfigured") + type networkIsolationSetter interface { SetNetworkIsolation(*drivers.NetworkIsolationSpec) } @@ -132,6 +135,9 @@ func (h *networkHook) Prerun() error { Hostname: interpolatedNetworks[0].Hostname, } + var checkedOnce bool + +CREATE: spec, created, err := h.manager.CreateNetwork(h.alloc.ID, &networkCreateReq) if err != nil { return fmt.Errorf("failed to create network for alloc: %v", err) @@ -142,11 +148,26 @@ func (h *networkHook) Prerun() error { h.isolationSetter.SetNetworkIsolation(spec) } - if created { - status, err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec) + if spec != nil { + status, err := h.networkConfigurator.Setup(context.TODO(), h.alloc, spec, created) if err != nil { + // if the netns already existed but is invalid, we get + // ErrCNICheckFailed. We'll try to recover from this one time by + // recreating the netns from scratch before giving up + if errors.Is(err, ErrCNICheckFailed) && !checkedOnce { + checkedOnce = true + destroyErr := h.manager.DestroyNetwork(h.alloc.ID, spec) + if destroyErr != nil { + return fmt.Errorf("%w: destroying network to retry failed: %v", err, destroyErr) + } + goto CREATE + } + return fmt.Errorf("failed to configure networking for alloc: %v", err) } + if status == nil { + return nil // netns already existed and was correctly configured + } // If the driver set the sandbox hostname label, then we will use that // to set the HostsConfig.Hostname. Otherwise, identify the sandbox diff --git a/client/allocrunner/network_hook_test.go b/client/allocrunner/network_hook_test.go index 99845f875bc..9e4be2da89a 100644 --- a/client/allocrunner/network_hook_test.go +++ b/client/allocrunner/network_hook_test.go @@ -4,6 +4,7 @@ package allocrunner import ( + "fmt" "testing" "github.com/hashicorp/nomad/ci" @@ -14,6 +15,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" "github.com/hashicorp/nomad/plugins/drivers/testutils" + "github.com/hashicorp/nomad/testutil" "github.com/shoenig/test" "github.com/shoenig/test/must" ) @@ -65,7 +67,7 @@ func TestNetworkHook_Prerun_Postrun_group(t *testing.T) { MockNetworkManager: testutils.MockNetworkManager{ CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) { test.Eq(t, alloc.ID, allocID) - return spec, false, nil + return spec, true, nil }, DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error { @@ -137,3 +139,145 @@ func TestNetworkHook_Prerun_Postrun_host(t *testing.T) { must.NoError(t, hook.Postrun()) must.True(t, destroyCalled) } + +// TestNetworkHook_Prerun_Postrun_ExistingNetNS tests that the prerun and +// postrun hooks call the Setup and Destroy with the expected behaviors when the +// network namespace already exists (typical of agent restarts and host reboots) +func TestNetworkHook_Prerun_Postrun_ExistingNetNS(t *testing.T) { + ci.Parallel(t) + + alloc := mock.Alloc() + alloc.Job.TaskGroups[0].Networks = []*structs.NetworkResource{ + {Mode: "bridge"}, + } + + spec := &drivers.NetworkIsolationSpec{ + Mode: drivers.NetIsolationModeGroup, + Path: "test", + Labels: map[string]string{"abc": "123"}, + } + isolationSetter := &mockNetworkIsolationSetter{t: t, expectedSpec: spec} + statusSetter := &mockNetworkStatusSetter{t: t, expectedStatus: nil} + + callCounts := testutil.NewCallCounter() + + nm := &testutils.MockDriver{ + MockNetworkManager: testutils.MockNetworkManager{ + CreateNetworkF: func(allocID string, req *drivers.NetworkCreateRequest) (*drivers.NetworkIsolationSpec, bool, error) { + test.Eq(t, alloc.ID, allocID) + callCounts.Inc("CreateNetwork") + return spec, false, nil + }, + + DestroyNetworkF: func(allocID string, netSpec *drivers.NetworkIsolationSpec) error { + test.Eq(t, alloc.ID, allocID) + test.Eq(t, spec, netSpec) + callCounts.Inc("DestroyNetwork") + return nil + }, + }, + } + + fakePlugin := newMockCNIPlugin() + + configurator := &cniNetworkConfigurator{ + nodeAttrs: map[string]string{ + "plugins.cni.version.bridge": "1.6.1", + }, + nodeMeta: map[string]string{}, + logger: testlog.HCLogger(t), + cni: fakePlugin, + nsOpts: &nsOpts{}, + } + + envBuilder := taskenv.NewBuilder(mock.Node(), alloc, nil, alloc.Job.Region) + + testCases := []struct { + name string + cniVersion string + checkErrs []error + setupErrs []string + expectPrerunCreateNetworkCalls int + expectPrerunDestroyNetworkCalls int + expectCheckCalls int + expectSetupCalls int + expectPostrunDestroyNetworkCalls int + expectPrerunError string + }{ + { + name: "good check", + cniVersion: "1.6.1", + expectPrerunCreateNetworkCalls: 1, + expectPrerunDestroyNetworkCalls: 0, + expectCheckCalls: 1, + expectSetupCalls: 0, + expectPostrunDestroyNetworkCalls: 1, + }, + { + name: "initial check fails", + cniVersion: "1.6.1", + checkErrs: []error{fmt.Errorf("whatever")}, + expectPrerunCreateNetworkCalls: 2, + expectPrerunDestroyNetworkCalls: 1, + expectCheckCalls: 2, + expectSetupCalls: 0, + expectPostrunDestroyNetworkCalls: 2, + }, + { + name: "check fails twice", + cniVersion: "1.6.1", + checkErrs: []error{ + fmt.Errorf("whatever"), + fmt.Errorf("whatever"), + }, + expectPrerunCreateNetworkCalls: 2, + expectPrerunDestroyNetworkCalls: 1, + expectCheckCalls: 2, + expectSetupCalls: 0, + expectPostrunDestroyNetworkCalls: 2, + expectPrerunError: "failed to configure networking for alloc: network namespace already exists but was misconfigured: whatever", + }, + { + name: "old CNI version skips check", + cniVersion: "1.2.0", + expectPrerunCreateNetworkCalls: 1, + expectPrerunDestroyNetworkCalls: 0, + expectCheckCalls: 0, + expectSetupCalls: 0, + expectPostrunDestroyNetworkCalls: 1, + }, + } + + for _, tc := range testCases { + + t.Run(tc.name, func(t *testing.T) { + callCounts.Reset() + fakePlugin.counter.Reset() + fakePlugin.checkErrors = tc.checkErrs + configurator.nodeAttrs["plugins.cni.version.bridge"] = tc.cniVersion + hook := newNetworkHook(testlog.HCLogger(t), isolationSetter, + alloc, nm, configurator, statusSetter, envBuilder.Build()) + + err := hook.Prerun() + if tc.expectPrerunError == "" { + must.NoError(t, err) + } else { + must.EqError(t, err, tc.expectPrerunError) + } + + test.Eq(t, tc.expectPrerunDestroyNetworkCalls, + callCounts.Get()["DestroyNetwork"], test.Sprint("DestroyNetwork calls after prerun")) + test.Eq(t, tc.expectPrerunCreateNetworkCalls, + callCounts.Get()["CreateNetwork"], test.Sprint("CreateNetwork calls after prerun")) + + test.Eq(t, tc.expectCheckCalls, fakePlugin.counter.Get()["Check"], test.Sprint("Check calls")) + test.Eq(t, tc.expectSetupCalls, fakePlugin.counter.Get()["Setup"], test.Sprint("Setup calls")) + + must.NoError(t, hook.Postrun()) + test.Eq(t, tc.expectPostrunDestroyNetworkCalls, + callCounts.Get()["DestroyNetwork"], test.Sprint("DestroyNetwork calls after postrun")) + + }) + + } +} diff --git a/client/allocrunner/networking.go b/client/allocrunner/networking.go index e98af0b4f89..07a98660795 100644 --- a/client/allocrunner/networking.go +++ b/client/allocrunner/networking.go @@ -14,7 +14,7 @@ import ( // NetworkConfigurator sets up and tears down the interfaces, routes, firewall // rules, etc for the configured networking mode of the allocation. type NetworkConfigurator interface { - Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) + Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec, bool) (*structs.AllocNetworkStatus, error) Teardown(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) error } @@ -23,7 +23,7 @@ type NetworkConfigurator interface { // require further configuration type hostNetworkConfigurator struct{} -func (h *hostNetworkConfigurator) Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (h *hostNetworkConfigurator) Setup(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec, bool) (*structs.AllocNetworkStatus, error) { return nil, nil } func (h *hostNetworkConfigurator) Teardown(context.Context, *structs.Allocation, *drivers.NetworkIsolationSpec) error { @@ -41,10 +41,10 @@ type synchronizedNetworkConfigurator struct { nc NetworkConfigurator } -func (s *synchronizedNetworkConfigurator) Setup(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (s *synchronizedNetworkConfigurator) Setup(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { networkingGlobalMutex.Lock() defer networkingGlobalMutex.Unlock() - return s.nc.Setup(ctx, allocation, spec) + return s.nc.Setup(ctx, allocation, spec, created) } func (s *synchronizedNetworkConfigurator) Teardown(ctx context.Context, allocation *structs.Allocation, spec *drivers.NetworkIsolationSpec) error { diff --git a/client/allocrunner/networking_bridge_linux.go b/client/allocrunner/networking_bridge_linux.go index 63a11c8f093..1204837b0dd 100644 --- a/client/allocrunner/networking_bridge_linux.go +++ b/client/allocrunner/networking_bridge_linux.go @@ -120,12 +120,12 @@ func (b *bridgeNetworkConfigurator) ensureForwardingRules() error { } // Setup calls the CNI plugins with the add action -func (b *bridgeNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (b *bridgeNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { if err := b.ensureForwardingRules(); err != nil { return nil, fmt.Errorf("failed to initialize table forwarding rules: %v", err) } - return b.cni.Setup(ctx, alloc, spec) + return b.cni.Setup(ctx, alloc, spec, created) } // Teardown calls the CNI plugins with the delete action diff --git a/client/allocrunner/networking_cni.go b/client/allocrunner/networking_cni.go index 1c877fbe2ff..4f39eb2183d 100644 --- a/client/allocrunner/networking_cni.go +++ b/client/allocrunner/networking_cni.go @@ -28,6 +28,7 @@ import ( consulIPTables "github.com/hashicorp/consul/sdk/iptables" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-set/v3" + "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/client/taskenv" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/envoy" @@ -138,8 +139,19 @@ func addNomadWorkloadCNIArgs(logger log.Logger, alloc *structs.Allocation, cniAr } } +var supportsCNICheck = mustCNICheckConstraint() + +func mustCNICheckConstraint() version.Constraints { + v, err := version.NewConstraint(">= 1.3.0") + if err != nil { + panic(err) + } + return v +} + // Setup calls the CNI plugins with the add action -func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) { +func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec, created bool) (*structs.AllocNetworkStatus, error) { + if err := c.ensureCNIInitialized(); err != nil { return nil, fmt.Errorf("cni not initialized: %w", err) } @@ -171,6 +183,31 @@ func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Alloc cniArgs[ConsulIPTablesConfigEnvVar] = string(iptablesCfg) } + if !created { + // The netns will not be created if it already exists, typically on + // agent restart. If the configuration of a prexisting netns is wrong + // (ex. after a host reboot for docker created netns), networking will + // be broken. CNI's ADD command is not idempotent so we can't simply try + // again. Run CHECK to verify the network is still valid. Older plugins + // have a broken CHECK, so we have to allow the buggy behavior in the + // case of a host reboot with docker-created netns there. + cniVersion, err := version.NewSemver(c.nodeAttrs["plugins.cni.version.bridge"]) + if err == nil && supportsCNICheck.Check(cniVersion) { + err := c.cni.Check(ctx, alloc.ID, spec.Path, + c.nsOpts.withCapabilityPortMap(portMaps.ports), + c.nsOpts.withArgs(cniArgs), + ) + if err != nil { + return nil, fmt.Errorf("%w: %w", ErrCNICheckFailed, err) + } + } else { + c.logger.Debug("network namespace exists but could not check if networking is valid because bridge plugin version was <1.3.0: continuing anyways") + return nil, nil + } + c.logger.Trace("network namespace exists and passed check: skipping setup") + return nil, nil + } + // Depending on the version of bridge cni plugin used, a known race could occure // where two alloc attempt to create the nomad bridge at the same time, resulting // in one of them to fail. This rety attempts to overcome those erroneous failures. diff --git a/client/allocrunner/networking_cni_mock_test.go b/client/allocrunner/networking_cni_mock_test.go index 9d8e67e22a9..2bc06e06819 100644 --- a/client/allocrunner/networking_cni_mock_test.go +++ b/client/allocrunner/networking_cni_mock_test.go @@ -18,6 +18,7 @@ var _ cni.CNI = &mockCNIPlugin{} type mockCNIPlugin struct { counter *testutil.CallCounter setupErrors []string + checkErrors []error } func newMockCNIPlugin() *mockCNIPlugin { @@ -25,7 +26,9 @@ func newMockCNIPlugin() *mockCNIPlugin { callCounts := testutil.NewCallCounter() callCounts.Reset() return &mockCNIPlugin{ - counter: callCounts, + counter: callCounts, + setupErrors: []string{}, + checkErrors: []error{}, } } @@ -65,6 +68,13 @@ func (f *mockCNIPlugin) Remove(ctx context.Context, id, path string, opts ...cni } func (f *mockCNIPlugin) Check(ctx context.Context, id, path string, opts ...cni.NamespaceOpts) error { + if f.counter != nil { + f.counter.Inc("Check") + } + numOfCalls := f.counter.Get()["Check"] + if numOfCalls <= len(f.checkErrors) { + return f.checkErrors[numOfCalls-1] + } return nil } diff --git a/client/allocrunner/networking_cni_test.go b/client/allocrunner/networking_cni_test.go index 798d641adb4..f76f2b2462e 100644 --- a/client/allocrunner/networking_cni_test.go +++ b/client/allocrunner/networking_cni_test.go @@ -296,7 +296,7 @@ func TestSetup(t *testing.T) { } // method under test - result, err := c.Setup(context.Background(), alloc, spec) + result, err := c.Setup(context.Background(), alloc, spec, true) if tc.expectErr == "" { must.NoError(t, err) must.Eq(t, tc.expectResult, result) diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index 2734093f0be..a724f458eb5 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -13,6 +13,19 @@ upgrade. However, specific versions of Nomad may have more details provided for their upgrades as a result of new features or changed behavior. This page is used to document those details separately from the standard upgrade flow. +## Nomad 1.9.5 + +#### CNI plugins + +Nomad 1.9.5 includes a bug fix for restoring allocation networking after a +client host reboot. This fix requires recent versions of the CNI reference +plugins (minimum 1.2.0) and will fallback to the existing behavior if the CNI +reference plugins cannot support the fix. + +We recommend installing the CNI reference plugins from the [CNI project release +page](https://github.com/containernetworking/plugins/releases) rather than your +Linux distribution's package manager. + ## Nomad 1.9.4 #### Security updates to default deny lists diff --git a/website/content/partials/install/install-cni-plugins.mdx b/website/content/partials/install/install-cni-plugins.mdx index d916ba11e68..080482be139 100644 --- a/website/content/partials/install/install-cni-plugins.mdx +++ b/website/content/partials/install/install-cni-plugins.mdx @@ -4,19 +4,23 @@ that use network namespaces. Refer to the [CNI Plugins external guide](https://www.cni.dev/plugins/current/) for details on individual plugins. The following series of commands determines your operating system architecture, -downloads the [CNI 1.5.1 -release](https://github.com/containernetworking/plugins/releases/tag/v1.5.1), +downloads the [CNI 1.6.1 +release](https://github.com/containernetworking/plugins/releases/tag/v1.6.1), and then extracts the CNI plugin binaries into the `/opt/cni/bin` directory. Update the `CNI_PLUGIN_VERSION` value to use a different release version. ```shell-session $ export ARCH_CNI=$( [ $(uname -m) = aarch64 ] && echo arm64 || echo amd64) -$ export CNI_PLUGIN_VERSION=v1.5.1 +$ export CNI_PLUGIN_VERSION=v1.6.1 $ curl -L -o cni-plugins.tgz "https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGIN_VERSION}/cni-plugins-linux-${ARCH_CNI}-${CNI_PLUGIN_VERSION}".tgz && \ sudo mkdir -p /opt/cni/bin && \ sudo tar -C /opt/cni/bin -xzf cni-plugins.tgz ``` +Your Linux distribution's package manager may provide the CNI reference plugins +but we recommend installing the most recent stable version to ensure you have +fixes for known bugs shipping in those versions. + Nomad looks for CNI plugin binaries by default in the `/opt/cni/bin` directory. However, you may install in the binaries in a different directory and then configure using the [`cni_path`](/nomad/docs/configuration/client#cni_path)