Skip to content

Commit

Permalink
feat: add machine spec generator/reader for model weight request
Browse files Browse the repository at this point in the history
Signed-off-by: Sunyanan Choochotkaew <[email protected]>
  • Loading branch information
sunya-ch committed Aug 26, 2024
1 parent 31f106e commit d8a6c14
Show file tree
Hide file tree
Showing 112 changed files with 12,438 additions and 15 deletions.
6 changes: 6 additions & 0 deletions cmd/exporter/exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ type AppConfig struct {
ApiserverEnabled bool
RedfishCredFilePath string
ExposeEstimatedIdlePower bool
MachineSpecFilePath string
DisablePowerMeter bool
}

Expand All @@ -80,6 +81,7 @@ func newAppConfig() *AppConfig {
flag.BoolVar(&_config.ApiserverEnabled, "apiserver", true, "if apiserver is disabled, we collect pod information from kubelet")
flag.StringVar(&_config.RedfishCredFilePath, "redfish-cred-file-path", "", "path to the redfish credential file")
flag.BoolVar(&_config.ExposeEstimatedIdlePower, "expose-estimated-idle-power", false, "estimated idle power is meaningful only if Kepler is running on bare-metal or when there is only one virtual machine on the node")
flag.StringVar(&_config.MachineSpecFilePath, "machine-spec", "", "path to the machine spec file in json format")
flag.BoolVar(&_config.DisablePowerMeter, "disable-power-meter", false, "whether manually disable power meter read and forcefully apply the estimator for node powers")

return _config
Expand Down Expand Up @@ -134,6 +136,10 @@ func main() {
config.SetRedfishCredFilePath(appConfig.RedfishCredFilePath)
}

if appConfig.MachineSpecFilePath != "" {
config.SetMachineSpecFilePath(appConfig.MachineSpecFilePath)
}

config.LogConfigs()

components.InitPowerImpl()
Expand Down
4 changes: 4 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require (
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.20.0
github.com/prometheus/prometheus v0.54.0
github.com/shirou/gopsutil v3.21.11+incompatible
github.com/sirupsen/logrus v1.9.3
golang.org/x/exp v0.0.0-20240719175910-8a7402abbf56
golang.org/x/sys v0.23.0
Expand Down Expand Up @@ -72,6 +73,9 @@ require (
github.com/prometheus/procfs v0.15.1 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/stretchr/testify v1.9.0 // indirect
github.com/tklauser/go-sysconf v0.3.14 // indirect
github.com/tklauser/numcpus v0.8.0 // indirect
github.com/yusufpapurcu/wmi v1.2.4 // indirect
golang.org/x/crypto v0.26.0 // indirect
golang.org/x/net v0.28.0 // indirect
golang.org/x/oauth2 v0.21.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,8 @@ github.com/prometheus/prometheus v0.54.0 h1:6+VmEkohHcofl3W5LyRlhw1Lfm575w/aX6ZF
github.com/prometheus/prometheus v0.54.0/go.mod h1:xlLByHhk2g3ycakQGrMaU8K7OySZx98BzeCR99991NY=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI=
github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
Expand All @@ -154,8 +156,14 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tklauser/go-sysconf v0.3.14 h1:g5vzr9iPFFz24v2KZXs/pvpvh8/V9Fw6vQK5ZZb78yU=
github.com/tklauser/go-sysconf v0.3.14/go.mod h1:1ym4lWMLUOhuBOPGtRcJm7tEGX4SCYNEEEtghGG/8uY=
github.com/tklauser/numcpus v0.8.0 h1:Mx4Wwe/FjZLeQsK/6kt2EOepwwSl7SmJrK5bV/dXYgY=
github.com/tklauser/numcpus v0.8.0/go.mod h1:ZJZlAY+dmR4eut8epnzf0u/VwodKmryxR8txiloSqBE=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE=
go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
Expand Down
11 changes: 11 additions & 0 deletions hack/build-manifest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ declare PROMETHEUS_DEPLOY=false
declare HIGH_GRANULARITY=false
declare DCGM_DEPLOY=false
declare HABANA_DEPLOY=false
declare MACHINE_SPEC_DEPLOY=false

ensure_all_tools() {
header "Ensuring all tools are installed"
Expand Down Expand Up @@ -203,6 +204,16 @@ deploy_habana() {
uncomment_patch habana "${MANIFESTS_OUT_DIR}"/exporter/kustomization.yaml
ok "Habana deployment configured"
}
# deploy_machine_spec runs the uncomment helpers for the machine spec
# configmap entry and the machine-spec patch entry of the generated exporter
# kustomization.yaml. It only prints a skip message and returns when
# MACHINE_SPEC_DEPLOY is not true.
deploy_machine_spec() {
	header "Machine Spec Deployment"

	if ! $MACHINE_SPEC_DEPLOY; then
		skip "skipping machine spec deployment"
		return 0
	fi

	# both helpers operate on the same generated kustomization file
	local kustomization_file="${MANIFESTS_OUT_DIR}"/exporter/kustomization.yaml
	uncomment machine_spec_configmap "${kustomization_file}"
	uncomment_patch machine-spec "${kustomization_file}"

	ok "Machine spec deployment configured"
}
build_manifest() {
info "Building manifests ..."
for deploy in $(declare -F | cut -f3 -d ' ' | grep 'deploy_'); do
Expand Down
4 changes: 4 additions & 0 deletions manifests/k8s/config/exporter/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ resources:
# - prometheus_common_service_monitor.yaml
# - prometheus_common_rules.yaml
# - prometheus_high_granularity_rules.yaml
# uncomment this line for default machine spec configmap
# - machine_spec_configmap.yaml

patchesStrategicMerge: []
# add this line to allow ci
Expand All @@ -26,6 +28,8 @@ patchesStrategicMerge: []
# - ./patch/patch-dcgmi.yaml
# add this line for habana patch
# - ./patch/patch-habana.yaml
# add this line for machine spec patch
# - ./patch/patch-machine-spec.yaml

secretGenerator:
- name: redfish
Expand Down
18 changes: 18 additions & 0 deletions manifests/k8s/config/exporter/machine_spec_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: kepler-machine-spec
namespace: system
data:
m5.metal: |
{"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500}
i3.metal: |
{"processor": "intel_xeon_e5_2686v4", "cores": 72, "chips": 2, "memory": 503, "frequency": 3000}
c5.metal: |
{"processor": "intel_xeon_platinum_8275cl", "cores": 96, "chips": 2, "memory": 188, "frequency": 3900}
r5.metal: |
{"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 755, "frequency": 3500}
m5zn.metal: |
{"processor": "intel_xeon_platinum_8252c", "cores": 48, "chips": 2, "memory": 188, "frequency": 4500}
m7i.metal-24xl: |
{"processor": "intel_xeon_platinum_8488c", "cores": 96, "chips": 1, "memory": 377, "frequency": 3800}
21 changes: 21 additions & 0 deletions manifests/k8s/config/exporter/patch/patch-machine-spec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Strategic-merge patch for the kepler-exporter DaemonSet: mounts one entry of
# the kepler-machine-spec ConfigMap as /etc/kepler/models/machine/spec.json.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kepler-exporter
  namespace: system
spec:
  template:
    spec:
      containers:
        - name: kepler-exporter
          volumeMounts:
            # must match the name of the volume declared under `volumes`
            - name: machine-spec
              mountPath: /etc/kepler/models/machine
              readOnly: true
      volumes:
        - name: machine-spec
          configMap:
            name: kepler-machine-spec
            items:
              # key selects the target machine; it must be one of the keys
              # defined in the kepler-machine-spec ConfigMap
              - key: m5.metal
                path: spec.json
22 changes: 20 additions & 2 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,10 @@ var (
redfishSkipSSLVerify = getBoolConfig("REDFISH_SKIP_SSL_VERIFY", true)

////////////////////////////////////
ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false)
ModelServerEndpoint = SetModelServerReqEndpoint()
DefaultMachineSpecFilePath = "/etc/kepler/models/machine/spec.json"
machineSpecFilePath string
ModelServerEnable = getBoolConfig("MODEL_SERVER_ENABLE", false)
ModelServerEndpoint = SetModelServerReqEndpoint()
// for model config
ModelConfigValues map[string]string
// model_parameter_prefix
Expand Down Expand Up @@ -245,6 +247,22 @@ func GetMockACPIPowerPath() string {
return MockACPIPowerPath
}

// SetMachineSpecFilePath sets the path of the machine spec file that
// GetMachineSpec reads, replacing any previously set path.
func SetMachineSpecFilePath(specFilePath string) {
machineSpecFilePath = specFilePath
}

// GetMachineSpec returns the machine spec to use.
//
// If a spec file path has been set with SetMachineSpecFilePath, the spec is
// read from that file. If no path is set, or the file cannot be read (a
// warning is logged in that case), the result of getDefaultMachineSpec is
// returned instead.
func GetMachineSpec() *MachineSpec {
	if machineSpecFilePath == "" {
		return getDefaultMachineSpec()
	}
	spec, err := readMachineSpec(machineSpecFilePath)
	if err != nil {
		// A bad or missing spec file is not fatal: fall back to the default.
		klog.Warningf("failed to read spec from %s: %v, use default machine spec", machineSpecFilePath, err)
		return getDefaultMachineSpec()
	}
	return spec
}

// InitModelConfigMap initializes map of config from MODEL_CONFIG
func InitModelConfigMap() {
ModelConfigValues = GetModelConfigMap()
Expand Down
24 changes: 24 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package config

import (
"encoding/json"
"os"
"runtime"

Expand Down Expand Up @@ -56,6 +57,16 @@ func createTempFile(contents string) (filename string, reterr error) {
return f.Name(), nil
}

// saveToFile writes spec to the file at path as JSON, creating the file or
// truncating it if it already exists.
//
// It returns the first error encountered while creating, encoding to, or
// closing the file.
func (spec *MachineSpec) saveToFile(path string) (err error) {
	file, err := os.Create(path)
	if err != nil {
		return err
	}
	// Close can report a write failure that Encode did not see, so surface
	// its error unless an earlier error is already being returned.
	defer func() {
		if cerr := file.Close(); err == nil {
			err = cerr
		}
	}()
	return json.NewEncoder(file).Encode(spec)
}

var _ = Describe("Test Configuration", func() {
It("Test cgroup version", func() {
file, err := createTempFile("")
Expand Down Expand Up @@ -113,4 +124,17 @@ var _ = Describe("Test Configuration", func() {
// no test
}
})
It("Test machine spec generation and read", func() {
	// Use a uniquely named file in the OS temp directory rather than a
	// fixed path in the working directory.
	tmpFile, err := os.CreateTemp("", "test_spec")
	Expect(err).To(BeNil())
	tmpPath := tmpFile.Name()
	// Deferred so the file is removed even if an assertion below aborts the spec.
	defer func() {
		// best-effort cleanup; a removal failure must not mask the spec result
		_ = os.Remove(tmpPath)
	}()
	Expect(tmpFile.Close()).To(BeNil())

	// generate spec and write it out
	spec := generateSpec()
	Expect(spec).NotTo(BeNil())
	err = spec.saveToFile(tmpPath)
	Expect(err).To(BeNil())

	// read the spec back and compare it with the generated one
	readSpec, err := readMachineSpec(tmpPath)
	Expect(err).To(BeNil())
	Expect(*spec).To(BeEquivalentTo(*readSpec))
})
})
Loading

0 comments on commit d8a6c14

Please sign in to comment.