diff --git a/README.md b/README.md index 3d7cdeb1f9b..db5231425b1 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,7 @@ Katib supports Katib is the project which is agnostic to machine learning (ML) frameworks. It can tune hyperparameters of applications written in any language of the users’ choice and natively supports many ML frameworks, such as -[TensorFlow](https://www.tensorflow.org/), [Apache MXNet](https://mxnet.apache.org/), -[PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/latest/), and others. +[TensorFlow](https://www.tensorflow.org/), [PyTorch](https://pytorch.org/), [XGBoost](https://xgboost.readthedocs.io/en/latest/), and others. Katib can perform training jobs using any Kubernetes [Custom Resources](https://www.kubeflow.org/docs/components/katib/trial-template/) diff --git a/docs/images-location.md b/docs/images-location.md index c20b60a43e4..ed330ac7bfe 100644 --- a/docs/images-location.md +++ b/docs/images-location.md @@ -271,17 +271,6 @@ The following table shows images for training containers which are used in the Dockerfile - - - docker.io/bytepsimage/mxnet - - - Distributed BytePS example for MXJob - - - Dockerfile - - docker.io/kubeflowkatib/xgboost-lightgbm diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md index f971ae29862..063d762da6f 100644 --- a/examples/v1beta1/README.md +++ b/examples/v1beta1/README.md @@ -122,8 +122,6 @@ Check the following examples for the various distributed operators: - [PyTorchJob MNIST](./kubeflow-training-operator/pytorchjob-mnist.yaml) -- [MXJob BytePS](./kubeflow-training-operator/mxjob-byteps.yaml) - - [XGBoostJob LightGBM](./kubeflow-training-operator/xgboostjob-lightgbm.yaml) - [MPIJob Horovod](./kubeflow-training-operator/mpijob-horovod.yaml) diff --git a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb index 422cc1ff90a..8866f621ad5 100644 --- a/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb +++ b/examples/v1beta1/kubeflow-pipelines/early-stopping.ipynb @@ -9,7 +9,7 @@ "In this notebook you will:\n", "- Create Katib Experiment using random algorithm.\n", "- Use median stopping rule as an early stopping algorithm.\n", - "- Use Kubernetes Job with mxnet mnist training container as a Trial template.\n", + "- Use Kubernetes Job with pytorch mnist training container as a Trial template.\n", "- Create Pipeline to get the optimal hyperparameters.\n", "\n", "Reference documentation:\n", diff --git a/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml b/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml deleted file mode 100644 index 1d2fa656a62..00000000000 --- a/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml +++ /dev/null @@ -1,86 +0,0 @@ ---- -apiVersion: kubeflow.org/v1beta1 -kind: Experiment -metadata: - namespace: kubeflow - name: mxjob-byteps -spec: - objective: - type: maximize - goal: 0.99 - objectiveMetricName: Train-accuracy - algorithm: - algorithmName: random - parallelTrialCount: 1 - maxTrialCount: 4 - maxFailedTrialCount: 3 - parameters: - - name: lr - parameterType: double - feasibleSpace: - min: "0.1" - max: "0.11" - trialTemplate: - primaryContainerName: mxnet - # In this example we can collect metrics only from the Worker pods. - primaryPodLabels: - training.kubeflow.org/replica-type: worker - trialParameters: - - name: learningRate - description: Learning rate for the training model - reference: lr - trialSpec: - apiVersion: kubeflow.org/v1 - kind: MXJob - spec: - jobMode: MXTrain - runPolicy: - cleanPodPolicy: None - mxReplicaSpecs: - Scheduler: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: docker.io/bytepsimage/mxnet - command: ["bpslaunch"] - Server: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: docker.io/bytepsimage/mxnet - command: ["bpslaunch"] - Worker: - replicas: 1 - restartPolicy: Never - template: - spec: - containers: - - name: mxnet - image: docker.io/bytepsimage/mxnet - command: ["bpslaunch"] - args: - [ - "python3", - "/usr/local/byteps/example/mxnet/train_imagenet_byteps.py", - "--benchmark", - "1", - "--lr=${trialParameters.learningRate}", - "--num-examples=1000", - "--num-epochs=4", - ] - volumeMounts: - - mountPath: /dev/shm - name: dshm - resources: - limits: - nvidia.com/gpu: 1 - volumes: - - name: dshm - emptyDir: - medium: Memory diff --git a/hack/gen-python-sdk/post_gen.py b/hack/gen-python-sdk/post_gen.py index 1803bb20430..f61f6c8d227 100644 --- a/hack/gen-python-sdk/post_gen.py +++ b/hack/gen-python-sdk/post_gen.py @@ -59,9 +59,6 @@ def _rewrite_helper(input_file, output_file, rewrite_rules): lines.append( "from kubeflow.katib.constants.constants import BASE_IMAGE_PYTORCH\n" ) - lines.append( - "from kubeflow.katib.constants.constants import BASE_IMAGE_MXNET\n" - ) # Add Kubernetes models to proper deserialization of Katib models. if output_file == "sdk/python/v1beta1/kubeflow/katib/models/__init__.py": diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml index e93fbd031cf..9eec0229645 100644 --- a/manifests/v1beta1/components/controller/rbac.yaml +++ b/manifests/v1beta1/components/controller/rbac.yaml @@ -97,7 +97,6 @@ rules: - pytorchjobs - mpijobs - xgboostjobs - - mxjobs verbs: - "get" - "list" diff --git a/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml b/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml index c7be5cd231f..11b97ce95bc 100644 --- a/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-cert-manager/katib-config.yaml @@ -10,7 +10,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/manifests/v1beta1/installs/katib-external-db/katib-config.yaml b/manifests/v1beta1/installs/katib-external-db/katib-config.yaml index 2aaf0496b10..3f6af8ba506 100644 --- a/manifests/v1beta1/installs/katib-external-db/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-external-db/katib-config.yaml @@ -12,7 +12,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/manifests/v1beta1/installs/katib-leader-election/katib-config.yaml b/manifests/v1beta1/installs/katib-leader-election/katib-config.yaml index 875859cd75b..e027fefc4dc 100644 --- a/manifests/v1beta1/installs/katib-leader-election/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-leader-election/katib-config.yaml @@ -13,7 +13,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/manifests/v1beta1/installs/katib-openshift/katib-config.yaml b/manifests/v1beta1/installs/katib-openshift/katib-config.yaml index c7be5cd231f..11b97ce95bc 100644 --- a/manifests/v1beta1/installs/katib-openshift/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-openshift/katib-config.yaml @@ -10,7 +10,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/manifests/v1beta1/installs/katib-standalone-postgres/katib-config.yaml b/manifests/v1beta1/installs/katib-standalone-postgres/katib-config.yaml index 2aaf0496b10..3f6af8ba506 100644 --- a/manifests/v1beta1/installs/katib-standalone-postgres/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-standalone-postgres/katib-config.yaml @@ -12,7 +12,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/manifests/v1beta1/installs/katib-standalone/katib-config.yaml b/manifests/v1beta1/installs/katib-standalone/katib-config.yaml index 2aaf0496b10..3f6af8ba506 100644 --- a/manifests/v1beta1/installs/katib-standalone/katib-config.yaml +++ b/manifests/v1beta1/installs/katib-standalone/katib-config.yaml @@ -12,7 +12,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org runtime: metricsCollectors: - kind: StdOut diff --git a/pkg/apis/controller/experiments/v1beta1/constants.go b/pkg/apis/controller/experiments/v1beta1/constants.go index e595ce14ca4..d2135fba017 100644 --- a/pkg/apis/controller/experiments/v1beta1/constants.go +++ b/pkg/apis/controller/experiments/v1beta1/constants.go @@ -45,7 +45,6 @@ var ( "TFJob": true, "PyTorchJob": true, "XGBoostJob": true, - "MXJob": true, "MPIJob": true, } ) diff --git a/pkg/util/v1beta1/katibconfig/config_test.go b/pkg/util/v1beta1/katibconfig/config_test.go index 67d24c10d71..9f7d41c2aed 100644 --- a/pkg/util/v1beta1/katibconfig/config_test.go +++ b/pkg/util/v1beta1/katibconfig/config_test.go @@ -401,7 +401,6 @@ init: - PyTorchJob.v1.kubeflow.org - MPIJob.v1.kubeflow.org - XGBoostJob.v1.kubeflow.org - - MXJob.v1.kubeflow.org webhookPort: 18443 enableLeaderElection: true leaderElectionID: xyz0123 @@ -456,7 +455,6 @@ runtime: "PyTorchJob.v1.kubeflow.org", "MPIJob.v1.kubeflow.org", "XGBoostJob.v1.kubeflow.org", - "MXJob.v1.kubeflow.org", }, WebhookPort: &customizedWebhookPort, EnableLeaderElection: true, diff --git a/sdk/python/v1beta1/kubeflow/katib/__init__.py b/sdk/python/v1beta1/kubeflow/katib/__init__.py index bafe7befea3..c6ca7dda3b9 100644 --- a/sdk/python/v1beta1/kubeflow/katib/__init__.py +++ b/sdk/python/v1beta1/kubeflow/katib/__init__.py @@ -81,4 +81,3 @@ from kubeflow.katib.constants.constants import BASE_IMAGE_TENSORFLOW from kubeflow.katib.constants.constants import BASE_IMAGE_TENSORFLOW_GPU from kubeflow.katib.constants.constants import BASE_IMAGE_PYTORCH -from kubeflow.katib.constants.constants import BASE_IMAGE_MXNET diff --git a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py index 1e0478f48f8..8cc8db35bb6 100644 --- a/sdk/python/v1beta1/kubeflow/katib/constants/constants.py +++ b/sdk/python/v1beta1/kubeflow/katib/constants/constants.py @@ -57,7 +57,6 @@ BASE_IMAGE_TENSORFLOW = "docker.io/tensorflow/tensorflow:2.13.0" BASE_IMAGE_TENSORFLOW_GPU = "docker.io/tensorflow/tensorflow:2.13.0-gpu" BASE_IMAGE_PYTORCH = "docker.io/pytorch/pytorch:2.2.1-cuda12.1-cudnn8-runtime" -BASE_IMAGE_MXNET = "docker.io/mxnet/python:1.9.1_native_py3" DEFAULT_DB_MANAGER_ADDRESS = "katib-db-manager.kubeflow:6789"