Merge pull request kubernetes#26364 from jupblb/b232589040
Add optional presubmit with 5k nodes for perf-tests repo
k8s-ci-robot authored Jun 14, 2022
2 parents ed7ecaa + 86342b0 commit ad0ae57
Showing 7 changed files with 99 additions and 16 deletions.
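
The new clusterloader2 job is registered with always_run: false, so it never starts automatically; as with any optional Prow presubmit, it has to be requested explicitly on a kubernetes/perf-tests pull request, e.g. by commenting:

    /test pull-perf-tests-clusterloader2-e2e-gce-scale-performance-manual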
@@ -128,7 +128,7 @@ presubmits:
      - --extract=local
      - --flush-mem-after-build=true
      - --gcp-nodes=500
-     - --gcp-project=k8s-presubmit-scale
+     - --gcp-project-type=scalability-presubmit-5k-project
      - --gcp-zone=us-east1-b
      - --provider=gce
      - --stage=gs://kubernetes-release-pull/ci/pull-kubernetes-e2e-gce-big-performance
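
The same swap appears in the next hunk. kubetest's --gcp-project flag pins a job to one fixed GCP project, whereas --gcp-project-type asks Boskos to lease a free project of the named type for the duration of the run and release it afterwards; that is what lets these scale presubmits share the dedicated 5k pool this commit registers with Boskos below.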
@@ -262,7 +262,7 @@ presubmits:
      - --extract=local
      - --flush-mem-after-build=true
      - --gcp-nodes=5000
-     - --gcp-project=k8s-presubmit-scale
+     - --gcp-project-type=scalability-presubmit-5k-project
      - --gcp-zone=us-east1-b
      - --provider=gce
      - --stage=gs://kubernetes-release-pull/ci/pull-kubernetes-e2e-gce-scale-performance-manual
@@ -672,6 +672,95 @@ presubmits:
      securityContext:
        privileged: true

+# Fork of kubernetes/kubernetes: pull-kubernetes-e2e-gce-scale-performance-manual
+- name: pull-perf-tests-clusterloader2-e2e-gce-scale-performance-manual
+  always_run: false
+  max_concurrency: 1
+  branches:
+  - master
+  decorate: true
+  path_alias: k8s.io/perf-tests
+  decoration_config:
+    timeout: 450m
+  extra_refs:
+  - org: kubernetes
+    repo: release
+    base_ref: master
+    path_alias: k8s.io/release
+  labels:
+    preset-service-account: "true"
+    preset-k8s-ssh: "true"
+    preset-dind-enabled: "true"
+    preset-e2e-scalability-common: "true"
+    preset-e2e-scalability-presubmits: "true"
+  annotations:
+    testgrid-dashboards: presubmits-kubernetes-scalability
+    testgrid-tab-name: pull-perf-tests-clusterloader2-e2e-gce-scale-performance
+  spec:
+    containers:
+    - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20220514-17efd5d2c3-master
+      command:
+      - runner.sh
+      - /workspace/scenarios/kubernetes_e2e.py
+      args:
+      - --cluster=
+      - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
+      # TODO(mborsz): Adjust or remove this change once we understand the coredns
+      # memory usage regression.
+      - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
+      - --extract=ci/latest-fast
+      - --extract-ci-bucket=k8s-release-dev
+      - --gcp-nodes=5000
+      - --gcp-project-type=scalability-presubmit-5k-project
+      - --gcp-zone=us-east1-b
+      - --provider=gce
+      - --metadata-sources=cl2-metadata.json
+      - --env=CL2_LOAD_TEST_THROUGHPUT=50
+      - --env=CL2_DELETE_TEST_THROUGHPUT=50
+      # Overrides CONTROLLER_MANAGER_TEST_ARGS from preset-e2e-scalability-periodics.
+      - --env=CONTROLLER_MANAGER_TEST_ARGS=--profiling --kube-api-qps=100 --kube-api-burst=100 --endpointslice-updates-batch-period=500ms --endpoint-updates-batch-period=500ms
+      # Overrides SCHEDULER_TEST_ARGS from preset-e2e-scalability-periodics.
+      # TODO(#1311): Clean this up after the experiment - it should let us
+      # hugely decrease pod-startup-latency across the whole test.
+      # Given that individual controllers have separate QPS limits, we allow
+      # the scheduler to keep up with the load from the deployment, daemonset and
+      # job controllers performing pod creations at once.
+      - --env=SCHEDULER_TEST_ARGS=--profiling --kube-api-qps=300 --kube-api-burst=300
+      # With APF, only the sum of --max-requests-inflight and --max-mutating-requests-inflight matters, so set --max-mutating-requests-inflight to 0.
+      - --env=APISERVER_TEST_ARGS=--max-requests-inflight=640 --max-mutating-requests-inflight=0
+      - --env=CL2_ENABLE_API_AVAILABILITY_MEASUREMENT=true
+      - --env=CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD=99.5
+      - --test=false
+      - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
+      - --test-cmd-args=cluster-loader2
+      - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
+      - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
+      - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
+      - --test-cmd-args=--nodes=5000
+      - --test-cmd-args=--prometheus-scrape-node-exporter
+      - --test-cmd-args=--provider=gce
+      - --test-cmd-args=--report-dir=$(ARTIFACTS)
+      - --test-cmd-args=--testconfig=testing/load/config.yaml
+      - --test-cmd-args=--testconfig=testing/huge-service/config.yaml
+      - --test-cmd-args=--testconfig=testing/access-tokens/config.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/enable_restart_count_check.yaml
+      - --test-cmd-args=--testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml
+      - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
+      - --test-cmd-name=ClusterLoaderV2
+      - --timeout=420m
+      - --use-logexporter
+      - --logexporter-gcs-path=gs://sig-scalability-logs/$(JOB_NAME)/$(BUILD_ID)
+      resources:
+        limits:
+          # Using 6 CPUs speeds up the bazel build phase (4 would be enough for the test itself).
+          cpu: 6
+          memory: "16Gi"
+        requests:
+          cpu: 6
+          memory: "16Gi"
+      securityContext:
+        privileged: true
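
For reference, because --test=false is set, scenarios/kubernetes_e2e.py skips the regular e2e suite and runs the configured --test-cmd with the accumulated --test-cmd-args instead; reconstructed from the args above (not copied from job output), the invocation is roughly:

    $GOPATH/src/k8s.io/perf-tests/run-e2e.sh cluster-loader2 \
      --experimental-gcp-snapshot-prometheus-disk=true \
      --experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID) \
      --experimental-prometheus-snapshot-to-report-dir=true \
      --nodes=5000 --prometheus-scrape-node-exporter --provider=gce \
      --report-dir=$(ARTIFACTS) \
      --testconfig=testing/load/config.yaml \
      --testconfig=testing/huge-service/config.yaml \
      --testconfig=testing/access-tokens/config.yaml \
      --testoverrides=./testing/experiments/enable_restart_count_check.yaml \
      --testoverrides=./testing/experiments/ignore_known_gce_container_restarts.yaml \
      --testoverrides=./testing/overrides/5000_nodes.yaml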

- name: pull-perf-tests-util-images
  always_run: false
  skip_report: false
2 changes: 1 addition & 1 deletion config/prow/cluster/build/boskos-janitor.yaml
@@ -22,7 +22,7 @@ spec:
        image: gcr.io/k8s-staging-boskos/janitor:v20220516-d007e44
        args:
        - --boskos-url=http://boskos.test-pods.svc.cluster.local.
-       - --resource-type=gce-project,gpu-project,ingress-project,istio-project,scalability-presubmit-project,scalability-project,node-e2e-project
+       - --resource-type=gce-project,gpu-project,ingress-project,istio-project,scalability-presubmit-project,scalability-presubmit-5k-project,scalability-project,node-e2e-project
        - --pool-size=20
        - --
        - --hours=0
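Both the janitor above and the reaper below now watch the new scalability-presubmit-5k-project type. The janitor's job is to repeatedly lease dirty resources of each listed type, clean them, and hand them back as free. A minimal Go sketch of that lease cycle against the Boskos client library (illustrative only — clean-project.sh is a hypothetical stand-in for the real GCP cleanup step, not the janitor's actual source):

package main

import (
	"fmt"
	"os/exec"

	"sigs.k8s.io/boskos/client"
	"sigs.k8s.io/boskos/common"
)

func main() {
	c, err := client.NewClient("janitor", "http://boskos.test-pods.svc.cluster.local.", "", "")
	if err != nil {
		panic(err)
	}
	for {
		// Lease the next dirty project of the new type, marking it "cleaning".
		res, err := c.Acquire("scalability-presubmit-5k-project", common.Dirty, common.Cleaning)
		if err != nil {
			break // no dirty resource available right now
		}
		// Hypothetical cleanup step standing in for the real janitor script.
		if err := exec.Command("clean-project.sh", res.Name).Run(); err != nil {
			fmt.Println("cleanup failed:", err)
			_ = c.ReleaseOne(res.Name, common.Dirty) // put it back for a retry
			continue
		}
		_ = c.ReleaseOne(res.Name, common.Free)
	}
}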
2 changes: 1 addition & 1 deletion config/prow/cluster/build/boskos-reaper_deployment.yaml
@@ -21,4 +21,4 @@ spec:
        image: gcr.io/k8s-staging-boskos/reaper:v20220516-d007e44
        args:
        - --boskos-url=http://boskos.test-pods.svc.cluster.local.
-       - --resource-type=gce-project,gpu-project,ingress-project,istio-project,scalability-presubmit-project,scalability-project,aws-account,node-e2e-project
+       - --resource-type=gce-project,gpu-project,ingress-project,istio-project,scalability-presubmit-project,scalability-presubmit-5k-project,scalability-project,aws-account,node-e2e-project
@@ -326,6 +326,10 @@ resources:
    - k8s-presubmit-scale-45
    state: dirty
    type: scalability-presubmit-project
+ - names:
+   - k8s-presubmit-scale
+   state: dirty
+   type: scalability-presubmit-5k-project
  - names:
    - k8s-jkns-gke-ubuntu
    - k8s-jkns-gke-ubuntu-1-6
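With this pool entry, k8s-presubmit-scale stops being a hard-coded project (it is dropped from the static lists in ci-janitor and kubernetes_janitor.py below) and becomes a Boskos-managed resource of the new type; it enters the pool dirty, so the janitor cleans it before handing out the first lease.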
1 change: 1 addition & 0 deletions config/prow/cluster/monitoring/mixins/lib/config.libsonnet
@@ -73,6 +73,7 @@ local config = {
{job: "k8s-prow-builds-new-boskos", type: "scalability-project", friendly: "Scalability project"},
{job: "k8s-infra-prow-builds-boskos", type: "scalability-project", friendly: "Scalability project (k8s-infra)"},
{job: "k8s-prow-builds-new-boskos", type: "scalability-presubmit-project", friendly: "Scalability presubmit project"}
{job: "k8s-prow-builds-new-boskos", type: "scalability-presubmit-5k-project", friendly: "Scalability presubmit project (5k)"}
],

// How long we go during work hours without seeing a webhook before alerting.
Expand Down
1 change: 0 additions & 1 deletion experiment/ci-janitor/main.go
@@ -47,7 +47,6 @@ var (
"k8s-jkns-pr-kubemark",
"k8s-jkns-pr-node-e2e",
"k8s-jkns-pr-gce-gpus",
"k8s-presubmit-scale",
// k8s-infra projects, can't be cleaned by k8s-prow serviceaccounts
"k8s-infra-e2e-scale-5k-project",
"k8s-infra-e2e-gpu-project",
12 changes: 1 addition & 11 deletions scenarios/kubernetes_janitor.py
@@ -99,12 +99,6 @@ def clean_project(project, hours=24, dryrun=False, ratelimit=None, filt=None):
    'k8s-jkns-pr-gce-gpus': 3,
}

-SCALE_PROJECT = {
-    # cleans up resources older than 12h
-    # for scale presubmit job we need to give jobs enough time to finish.
-    'k8s-presubmit-scale': 12,
-}
-
def check_predefine_jobs(jobs, ratelimit):
    """Handle predefined jobs"""
    for project, expire in jobs.iteritems():
@@ -131,8 +125,6 @@ def check_ci_jobs():
        if any(b in project for b in EXEMPT_PROJECTS):
            print >>sys.stderr, 'Project %r is exempted in ci-janitor' % project
            continue
-        if project in PR_PROJECTS or project in SCALE_PROJECT:
-            continue  # CI janitor skips all PR jobs
        found = project
    if found:
        clean_project(found, clean_hours)
@@ -142,8 +134,6 @@ def main(mode, ratelimit, projects, age, artifacts, filt):
"""Run janitor for each project."""
if mode == 'pr':
check_predefine_jobs(PR_PROJECTS, ratelimit)
elif mode == 'scale':
check_predefine_jobs(SCALE_PROJECT, ratelimit)
elif mode == 'custom':
projs = str.split(projects, ',')
for proj in projs:
@@ -183,7 +173,7 @@ def main(mode, ratelimit, projects, age, artifacts, filt):
    VERBOSE = False
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument(
-        '--mode', default='ci', choices=['ci', 'pr', 'scale', 'custom'],
+        '--mode', default='ci', choices=['ci', 'pr', 'custom'],
        help='Which type of projects to clear')
    PARSER.add_argument(
        '--ratelimit', type=int,
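
With Boskos now responsible for cleaning the 5k project, the janitor's dedicated scale mode is dead code and is removed end to end: the SCALE_PROJECT table, the mode == 'scale' branch, and the CLI choice. Note that any caller still passing --mode=scale will now fail argparse validation with an 'invalid choice' error, so invocations have to be updated together with this change.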