diff --git a/operator/.errcheck_excludes.txt b/operator/.errcheck_excludes.txt new file mode 100644 index 0000000000..5c8b739e9e --- /dev/null +++ b/operator/.errcheck_excludes.txt @@ -0,0 +1,2 @@ +fmt.Fprintln +fmt.Fprint \ No newline at end of file diff --git a/operator/.golangci.yml b/operator/.golangci.yml new file mode 100644 index 0000000000..f199f38a3a --- /dev/null +++ b/operator/.golangci.yml @@ -0,0 +1,46 @@ +# options for analysis running +run: + # timeout for analysis, e.g. 30s, 5m, default is 1m + deadline: 5m + + # exit code when at least one issue was found, default is 1 + issues-exit-code: 1 + + # which dirs to skip: they won't be analyzed; + # can use regexp here: generated.*, regexp is applied on full path; + # default value is empty list, but next dirs are always skipped independently + # from this option's value: + # vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ + skip-dirs: vendor + +# output configuration options +output: + # colored-line-number|line-number|json|tab|checkstyle, default is "colored-line-number" + format: colored-line-number + + # print lines of code with issue, default is true + print-issued-lines: true + + # print linter name in the end of issue text, default is true + print-linter-name: true + +linters: + disable-all: true + enable: + # Sorted alphabetically. + - errcheck + - exportloopref + - goimports # Also includes gofmt style formatting + - gosimple + - govet + - misspell + - staticcheck + - structcheck + - typecheck + - varcheck + +linters-settings: + errcheck: + exclude: ./.errcheck_excludes.txt + goconst: + min-occurrences: 5 diff --git a/operator/Makefile b/operator/Makefile index 5475a5751e..8d7c0e0c06 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -55,6 +55,10 @@ fmt: ## Run go fmt against code. vet: ## Run go vet against code. go vet ./... +.PHONY: lint +lint: ## Run go linters against code. + golangci-lint run --fix + .PHONY: test test: manifests generate fmt vet envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" go test ./... -coverprofile cover.out diff --git a/operator/apis/mlops/v1alpha1/explainer_types.go b/operator/apis/mlops/v1alpha1/explainer_types.go index f66707f65c..75ec5e143a 100644 --- a/operator/apis/mlops/v1alpha1/explainer_types.go +++ b/operator/apis/mlops/v1alpha1/explainer_types.go @@ -23,8 +23,8 @@ import ( // ExplainerSpec defines the desired state of Explainer type ExplainerSpec struct { ModelSpec `json:",inline"` - BlackBox *BlackBox `json:"blackbox,omitempty"` - WhiteBox *WhiteBox `json:"whitebox,omitempty"` + BlackBox *BlackBox `json:"blackbox,omitempty"` + WhiteBox *WhiteBox `json:"whitebox,omitempty"` } // Either ModelRef or PipelineRef is required @@ -41,7 +41,7 @@ type BlackBox struct { type WhiteBox struct { // Storage URI for the model repository // +optional - InferenceArtifactSpec`json:",inline"` + InferenceArtifactSpec `json:",inline"` } // ExplainerStatus defines the observed state of Explainer diff --git a/operator/apis/mlops/v1alpha1/model_types.go b/operator/apis/mlops/v1alpha1/model_types.go index 59c7ab1726..a914c2010b 100644 --- a/operator/apis/mlops/v1alpha1/model_types.go +++ b/operator/apis/mlops/v1alpha1/model_types.go @@ -106,6 +106,3 @@ type ModelList struct { func init() { SchemeBuilder.Register(&Model{}, &ModelList{}) } - - - diff --git a/operator/apis/mlops/v1alpha1/server_types.go b/operator/apis/mlops/v1alpha1/server_types.go index 225aec15b7..7ddfca5477 100644 --- a/operator/apis/mlops/v1alpha1/server_types.go +++ b/operator/apis/mlops/v1alpha1/server_types.go @@ -25,12 +25,12 @@ import ( // TODO do we need autoscaling spec? type ServerSpec struct { // Server definition - Server ServerDefn `json:"server,omitempty"` + Server ServerDefn `json:"server,omitempty"` // Either Models or Mesh needs to be provides // Preloaded models for non mesh server - Models []PreLoadedModelSpec `json:"models,omitempty"` + Models []PreLoadedModelSpec `json:"models,omitempty"` // Seldon mesh specifications for mesh servers that can load models dynamically - Mesh *MeshDefn `json:"mesh"` + Mesh *MeshDefn `json:"mesh"` // PodSpec overrides PodOverride PodSpec `json:"podSpec,omitempty"` // Number of replicas - defaults to 1 @@ -40,7 +40,7 @@ type ServerSpec struct { type PreLoadedModelSpec struct { // Name override // +optional - Name *string `json:"name,omitempty"` + Name *string `json:"name,omitempty"` InferenceArtifactSpec `json:",inline"` } @@ -59,10 +59,10 @@ type MeshDefn struct { type ServerDefn struct { // Server type - mlserver, triton or left out if custom container - Type *string `json:"type,omitempty"` + Type *string `json:"type,omitempty"` // +optional RuntimeVersion *string `json:"runtimeVersion,omitempty"` - // Conatiner overrides for server + // Container overrides for server Container *v1.Container `json:"container,omitempty"` } diff --git a/operator/apis/mlops/v1alpha1/zz_generated.deepcopy.go b/operator/apis/mlops/v1alpha1/zz_generated.deepcopy.go index f72b03c342..6e7edc8e14 100644 --- a/operator/apis/mlops/v1alpha1/zz_generated.deepcopy.go +++ b/operator/apis/mlops/v1alpha1/zz_generated.deepcopy.go @@ -1,4 +1,3 @@ -//go:build !ignore_autogenerated // +build !ignore_autogenerated /* @@ -118,14 +117,7 @@ func (in *ExplainerList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ExplainerSpec) DeepCopyInto(out *ExplainerSpec) { *out = *in - in.InferenceArtifactSpec.DeepCopyInto(&out.InferenceArtifactSpec) - if in.Resources != nil { - in, out := &in.Resources, &out.Resources - *out = make(v1.ResourceList, len(*in)) - for key, val := range *in { - (*out)[key] = val.DeepCopy() - } - } + in.ModelSpec.DeepCopyInto(&out.ModelSpec) if in.BlackBox != nil { in, out := &in.BlackBox, &out.BlackBox *out = new(BlackBox) @@ -134,7 +126,7 @@ func (in *ExplainerSpec) DeepCopyInto(out *ExplainerSpec) { if in.WhiteBox != nil { in, out := &in.WhiteBox, &out.WhiteBox *out = new(WhiteBox) - **out = **in + (*in).DeepCopyInto(*out) } } @@ -166,11 +158,31 @@ func (in *ExplainerStatus) DeepCopy() *ExplainerStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InferenceArtifactSpec) DeepCopyInto(out *InferenceArtifactSpec) { *out = *in + if in.ModelType != nil { + in, out := &in.ModelType, &out.ModelType + *out = new(string) + **out = **in + } if in.StorageURI != nil { in, out := &in.StorageURI, &out.StorageURI *out = new(string) **out = **in } + if in.SchemaURI != nil { + in, out := &in.SchemaURI, &out.SchemaURI + *out = new(string) + **out = **in + } + if in.ServiceAccountName != nil { + in, out := &in.ServiceAccountName, &out.ServiceAccountName + *out = new(string) + **out = **in + } + if in.SecretName != nil { + in, out := &in.SecretName, &out.SecretName + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceArtifactSpec. @@ -186,17 +198,11 @@ func (in *InferenceArtifactSpec) DeepCopy() *InferenceArtifactSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InferenceStep) DeepCopyInto(out *InferenceStep) { *out = *in - if in.Next != nil { - in, out := &in.Next, &out.Next + if in.Preceding != nil { + in, out := &in.Preceding, &out.Preceding *out = make([]InferenceNextStep, len(*in)) copy(*out, *in) } - if in.NextAsync != nil { - in, out := &in.NextAsync, &out.NextAsync - *out = make([]InferenceNextStep, len(*in)) - copy(*out, *in) - } - in.Spec.DeepCopyInto(&out.Spec) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InferenceStep. @@ -209,6 +215,31 @@ func (in *InferenceStep) DeepCopy() *InferenceStep { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *LoggingSpec) DeepCopyInto(out *LoggingSpec) { + *out = *in + if in.Uri != nil { + in, out := &in.Uri, &out.Uri + *out = new(string) + **out = **in + } + if in.Percent != nil { + in, out := &in.Percent, &out.Percent + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new LoggingSpec. +func (in *LoggingSpec) DeepCopy() *LoggingSpec { + if in == nil { + return nil + } + out := new(LoggingSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MeshDefn) DeepCopyInto(out *MeshDefn) { *out = *in @@ -322,6 +353,26 @@ func (in *ModelSpec) DeepCopyInto(out *ModelSpec) { *out = new(string) **out = **in } + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } + if in.MinReplicas != nil { + in, out := &in.MinReplicas, &out.MinReplicas + *out = new(int32) + **out = **in + } + if in.MaxReplicas != nil { + in, out := &in.MaxReplicas, &out.MaxReplicas + *out = new(int32) + **out = **in + } + if in.Logger != nil { + in, out := &in.Logger, &out.Logger + *out = new(LoggingSpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelSpec. @@ -586,6 +637,27 @@ func (in *PodSpec) DeepCopy() *PodSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *PreLoadedModelSpec) DeepCopyInto(out *PreLoadedModelSpec) { + *out = *in + if in.Name != nil { + in, out := &in.Name, &out.Name + *out = new(string) + **out = **in + } + in.InferenceArtifactSpec.DeepCopyInto(&out.InferenceArtifactSpec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PreLoadedModelSpec. +func (in *PreLoadedModelSpec) DeepCopy() *PreLoadedModelSpec { + if in == nil { + return nil + } + out := new(PreLoadedModelSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Server) DeepCopyInto(out *Server) { *out = *in @@ -621,6 +693,11 @@ func (in *ServerDefn) DeepCopyInto(out *ServerDefn) { *out = new(string) **out = **in } + if in.RuntimeVersion != nil { + in, out := &in.RuntimeVersion, &out.RuntimeVersion + *out = new(string) + **out = **in + } if in.Container != nil { in, out := &in.Container, &out.Container *out = new(v1.Container) @@ -676,7 +753,7 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) { in.Server.DeepCopyInto(&out.Server) if in.Models != nil { in, out := &in.Models, &out.Models - *out = make([]InferenceArtifactSpec, len(*in)) + *out = make([]PreLoadedModelSpec, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -687,6 +764,11 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) { (*in).DeepCopyInto(*out) } in.PodOverride.DeepCopyInto(&out.PodOverride) + if in.Replicas != nil { + in, out := &in.Replicas, &out.Replicas + *out = new(int32) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerSpec. @@ -717,6 +799,7 @@ func (in *ServerStatus) DeepCopy() *ServerStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *WhiteBox) DeepCopyInto(out *WhiteBox) { *out = *in + in.InferenceArtifactSpec.DeepCopyInto(&out.InferenceArtifactSpec) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WhiteBox. diff --git a/operator/config/crd/bases/mlops.seldon.io_explainers.yaml b/operator/config/crd/bases/mlops.seldon.io_explainers.yaml index d55b64088d..dec447d51f 100644 --- a/operator/config/crd/bases/mlops.seldon.io_explainers.yaml +++ b/operator/config/crd/bases/mlops.seldon.io_explainers.yaml @@ -49,6 +49,20 @@ spec: description: Reference to Pipeline step - must include pipeline type: string type: object + dedicated: + description: Dedicated server exclusive to this model Default false + type: boolean + logger: + description: Payload logging + properties: + percent: + description: Percentage of payloads to log Defaults to 100% + format: int32 + type: integer + uri: + description: URI to logging endpoint. + type: string + type: object maxReplicas: description: Max number of replicas - default equal to replicas format: int32 diff --git a/operator/config/crd/bases/mlops.seldon.io_models.yaml b/operator/config/crd/bases/mlops.seldon.io_models.yaml index 01f859c436..a7ae6dfc1e 100644 --- a/operator/config/crd/bases/mlops.seldon.io_models.yaml +++ b/operator/config/crd/bases/mlops.seldon.io_models.yaml @@ -36,6 +36,20 @@ spec: spec: description: ModelSpec defines the desired state of Model properties: + dedicated: + description: Dedicated server exclusive to this model Default false + type: boolean + logger: + description: Payload logging + properties: + percent: + description: Percentage of payloads to log Defaults to 100% + format: int32 + type: integer + uri: + description: URI to logging endpoint. + type: string + type: object maxReplicas: description: Max number of replicas - default equal to replicas format: int32 diff --git a/operator/config/crd/bases/mlops.seldon.io_pipelines.yaml b/operator/config/crd/bases/mlops.seldon.io_pipelines.yaml index 129ad6360d..deeb0baccc 100644 --- a/operator/config/crd/bases/mlops.seldon.io_pipelines.yaml +++ b/operator/config/crd/bases/mlops.seldon.io_pipelines.yaml @@ -40,6 +40,9 @@ spec: description: The steps of this inference graph pipeline items: properties: + async: + description: Whether this step is asynchronous + type: boolean if: description: Condition specification to satisfy for this step to be run diff --git a/operator/config/crd/bases/mlops.seldon.io_servers.yaml b/operator/config/crd/bases/mlops.seldon.io_servers.yaml index cef5e12951..1fecd3d392 100644 --- a/operator/config/crd/bases/mlops.seldon.io_servers.yaml +++ b/operator/config/crd/bases/mlops.seldon.io_servers.yaml @@ -8906,7 +8906,7 @@ spec: description: Server definition properties: container: - description: Conatiner overrides for server + description: Container overrides for server properties: args: description: 'Arguments to the entrypoint. The docker image''s diff --git a/scheduler/.errcheck_excludes.txt b/scheduler/.errcheck_excludes.txt new file mode 100644 index 0000000000..5c8b739e9e --- /dev/null +++ b/scheduler/.errcheck_excludes.txt @@ -0,0 +1,2 @@ +fmt.Fprintln +fmt.Fprint \ No newline at end of file diff --git a/scheduler/.golangci.yml b/scheduler/.golangci.yml new file mode 100644 index 0000000000..f199f38a3a --- /dev/null +++ b/scheduler/.golangci.yml @@ -0,0 +1,46 @@ +# options for analysis running +run: + # timeout for analysis, e.g. 30s, 5m, default is 1m + deadline: 5m + + # exit code when at least one issue was found, default is 1 + issues-exit-code: 1 + + # which dirs to skip: they won't be analyzed; + # can use regexp here: generated.*, regexp is applied on full path; + # default value is empty list, but next dirs are always skipped independently + # from this option's value: + # vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ + skip-dirs: vendor + +# output configuration options +output: + # colored-line-number|line-number|json|tab|checkstyle, default is "colored-line-number" + format: colored-line-number + + # print lines of code with issue, default is true + print-issued-lines: true + + # print linter name in the end of issue text, default is true + print-linter-name: true + +linters: + disable-all: true + enable: + # Sorted alphabetically. + - errcheck + - exportloopref + - goimports # Also includes gofmt style formatting + - gosimple + - govet + - misspell + - staticcheck + - structcheck + - typecheck + - varcheck + +linters-settings: + errcheck: + exclude: ./.errcheck_excludes.txt + goconst: + min-occurrences: 5 diff --git a/scheduler/Makefile b/scheduler/Makefile index d3da14cf57..2ff52db8c2 100644 --- a/scheduler/Makefile +++ b/scheduler/Makefile @@ -14,6 +14,9 @@ build-agent: test build: build-scheduler build-agent +lint: ## Run go linters against code. + golangci-lint run --fix + test: go test ./pkg/... diff --git a/scheduler/apis/mesh/seldon.go b/scheduler/apis/mesh/seldon.go index 131669080e..ee78281cb1 100644 --- a/scheduler/apis/mesh/seldon.go +++ b/scheduler/apis/mesh/seldon.go @@ -1,7 +1,7 @@ package mesh type SeldonConfig struct { - Name string `yaml:"name"` + Name string `yaml:"name"` SeldonSpec `yaml:"spec"` } @@ -11,17 +11,17 @@ type SeldonSpec struct { } type StaticServer struct { - Name string `yaml:"name"` + Name string `yaml:"name"` Replicas []Replica `yaml:"replicas"` } type Replica struct { - Address string `yaml:"address"` - Port uint32 `yaml:"port"` + Address string `yaml:"address"` + Port uint32 `yaml:"port"` } type StaticModel struct { - Name string `yaml:"name"` + Name string `yaml:"name"` ModelServer string `yaml:"modelServer"` - Servers []int `yaml:"servers"` + Servers []int `yaml:"servers"` } diff --git a/scheduler/cmd/agent/main.go b/scheduler/cmd/agent/main.go index 5c4d193570..3d33752877 100644 --- a/scheduler/cmd/agent/main.go +++ b/scheduler/cmd/agent/main.go @@ -2,9 +2,7 @@ package main import ( "flag" - "github.com/cenkalti/backoff/v4" "io/ioutil" - "k8s.io/client-go/util/homedir" "math/rand" "os" "path/filepath" @@ -12,22 +10,24 @@ import ( "strings" "time" + "github.com/cenkalti/backoff/v4" + "k8s.io/client-go/util/homedir" + "github.com/seldonio/seldon-core/scheduler/pkg/agent" log "github.com/sirupsen/logrus" ) var ( - l log.FieldLogger - serverName string - replicaIdx uint - schedulerHost string - schedulerPort int - rcloneHost string - rclonePort int - inferenceHost string - inferencePort int - modelRepository string - kubeconfig string + serverName string + replicaIdx uint + schedulerHost string + schedulerPort int + rcloneHost string + rclonePort int + inferenceHost string + inferencePort int + modelRepository string + kubeconfig string namespace string replicaConfigStr string inferenceSvcName string @@ -35,16 +35,16 @@ var ( const ( EnvMLServerHttpPort = "MLSERVER_HTTP_PORT" - EnvServerName = "SELDON_SERVER_NAME" - EnvServerIdx = "POD_NAME" - EnvSchedulerHost = "SELDON_SCHEDULER_HOST" - EnvSchedulerPort = "SELDON_SCHEDULER_PORT" - EnvReplicaConfig = "SELDON_REPLICA_CONFIG" + EnvServerName = "SELDON_SERVER_NAME" + EnvServerIdx = "POD_NAME" + EnvSchedulerHost = "SELDON_SCHEDULER_HOST" + EnvSchedulerPort = "SELDON_SCHEDULER_PORT" + EnvReplicaConfig = "SELDON_REPLICA_CONFIG" FlagSchedulerHost = "scheduler-host" FlagSchedulerPort = "scheduler-port" - FlagServerName = "server-name" - FlagServerIdx = "server-idx" + FlagServerName = "server-name" + FlagServerIdx = "server-idx" FlagInferencePort = "inference-port" FlagReplicaConfig = "replica-config" ) @@ -80,73 +80,70 @@ func isFlagPassed(name string) bool { return found } - - func updateFlagsFromEnv() { if !isFlagPassed(FlagInferencePort) { port := os.Getenv(EnvMLServerHttpPort) if port != "" { - log.Infof("Got %s from %s setting to %s",FlagInferencePort, EnvMLServerHttpPort, port) + log.Infof("Got %s from %s setting to %s", FlagInferencePort, EnvMLServerHttpPort, port) var err error - inferencePort,err = strconv.Atoi(port) + inferencePort, err = strconv.Atoi(port) if err != nil { - log.WithError(err).Fatalf("Failed to parse %s with value %s",EnvMLServerHttpPort,port) + log.WithError(err).Fatalf("Failed to parse %s with value %s", EnvMLServerHttpPort, port) } } } if !isFlagPassed(FlagServerName) { val := os.Getenv(EnvServerName) if val != "" { - log.Infof("Got %s from %s setting to %s",FlagServerName, EnvServerName, val) + log.Infof("Got %s from %s setting to %s", FlagServerName, EnvServerName, val) serverName = val } } if !isFlagPassed(FlagServerIdx) { podName := os.Getenv(EnvServerIdx) if podName != "" { - lastDashIdx := strings.LastIndex(podName,"-") + lastDashIdx := strings.LastIndex(podName, "-") if lastDashIdx == -1 { log.Info("Can't decypher pod name to find last dash and index") return } val := podName[lastDashIdx+1:] var err error - idxAsInt,err := strconv.Atoi(val) + idxAsInt, err := strconv.Atoi(val) if err != nil { - log.WithError(err).Fatalf("Failed to parse %s with value %s",EnvServerIdx,val) + log.WithError(err).Fatalf("Failed to parse %s with value %s", EnvServerIdx, val) } replicaIdx = uint(idxAsInt) - log.Infof("Got %s from %s with value %s setting with %d",FlagServerIdx, EnvServerIdx, podName, replicaIdx) + log.Infof("Got %s from %s with value %s setting with %d", FlagServerIdx, EnvServerIdx, podName, replicaIdx) } } if !isFlagPassed(FlagSchedulerHost) { val := os.Getenv(EnvSchedulerHost) if val != "" { - log.Infof("Got %s from %s setting to %s",FlagSchedulerHost, EnvSchedulerHost, val) + log.Infof("Got %s from %s setting to %s", FlagSchedulerHost, EnvSchedulerHost, val) schedulerHost = val } } if !isFlagPassed(FlagSchedulerPort) { port := os.Getenv(EnvSchedulerPort) if port != "" { - log.Infof("Got %s from %s setting to %s",FlagSchedulerPort, EnvSchedulerPort, port) + log.Infof("Got %s from %s setting to %s", FlagSchedulerPort, EnvSchedulerPort, port) var err error - schedulerPort,err = strconv.Atoi(port) + schedulerPort, err = strconv.Atoi(port) if err != nil { - log.WithError(err).Fatalf("Failed to parse %s with value %s",EnvSchedulerPort,port) + log.WithError(err).Fatalf("Failed to parse %s with value %s", EnvSchedulerPort, port) } } } if !isFlagPassed(FlagReplicaConfig) { val := os.Getenv(EnvReplicaConfig) if val != "" { - log.Infof("Got %s from %s setting to %s",FlagReplicaConfig, EnvReplicaConfig, val) + log.Infof("Got %s from %s setting to %s", FlagReplicaConfig, EnvReplicaConfig, val) replicaConfigStr = val } } } - func getNamespace() string { nsBytes, err := ioutil.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") if err != nil { @@ -165,10 +162,9 @@ func setInferenceSvcName() { } else { inferenceSvcName = inferenceHost } - log.Infof("Setting inference svc name to %s",inferenceSvcName) + log.Infof("Setting inference svc name to %s", inferenceSvcName) } - func main() { logger := log.New() log.SetLevel(log.DebugLevel) @@ -178,7 +174,7 @@ func main() { replicaConfig, err := agent.ParseReplicConfig(replicaConfigStr) if err != nil { - log.Fatalf("Failed to parse replica config %s",replicaConfigStr) + log.Fatalf("Failed to parse replica config %s", replicaConfigStr) } rcloneClient := agent.NewRCloneClient(rcloneHost, rclonePort, modelRepository, logger) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index aca1b045f3..c8dc887ec5 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -17,17 +17,18 @@ package main import ( "context" "flag" + "io/ioutil" + "math/rand" + "path/filepath" + "time" + "github.com/seldonio/seldon-core/scheduler/pkg/agent" "github.com/seldonio/seldon-core/scheduler/pkg/envoy/processor" "github.com/seldonio/seldon-core/scheduler/pkg/scheduler/filters" "github.com/seldonio/seldon-core/scheduler/pkg/scheduler/sorters" server2 "github.com/seldonio/seldon-core/scheduler/pkg/server" "github.com/seldonio/seldon-core/scheduler/pkg/store" - "io/ioutil" "k8s.io/client-go/util/homedir" - "math/rand" - "path/filepath" - "time" "github.com/envoyproxy/go-control-plane/pkg/cache/v3" serverv3 "github.com/envoyproxy/go-control-plane/pkg/server/v3" @@ -37,11 +38,11 @@ import ( ) var ( - envoyPort uint - agentPort uint - schedulerPort uint - kubeconfig string - namespace string + envoyPort uint + agentPort uint + schedulerPort uint + kubeconfig string + namespace string nodeID string ) @@ -104,11 +105,9 @@ func main() { []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}) as := agent.NewAgentServer(logger, ss, es, sched) - go as.ListenForSyncs() // Start agent syncs go es.ListenForSyncs() // Start envoy syncs - s := server2.NewSchedulerServer(logger, ss, sched, as) go func() { err := s.StartGrpcServer(schedulerPort) @@ -119,7 +118,7 @@ func main() { err := as.StartGrpcServer(agentPort) if err != nil { - log.Fatalf("Failed to start agent grpc server %s",err.Error()) + log.Fatalf("Failed to start agent grpc server %s", err.Error()) } as.StopAgentSync() diff --git a/scheduler/pkg/agent/client.go b/scheduler/pkg/agent/client.go index d7b224a22f..3edb5b0eb3 100644 --- a/scheduler/pkg/agent/client.go +++ b/scheduler/pkg/agent/client.go @@ -3,6 +3,11 @@ package agent import ( "context" "fmt" + "io" + "math" + "sync" + "time" + backoff "github.com/cenkalti/backoff/v4" grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry" "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" @@ -10,30 +15,26 @@ import ( log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/protobuf/encoding/protojson" - "io" - "math" - "sync" - "time" ) type Client struct { - mu sync.RWMutex + mu sync.RWMutex schedulerHost string schedulerPort int - serverName string - replicaIdx uint32 - conn *grpc.ClientConn - callOptions []grpc.CallOption - logger log.FieldLogger - RCloneClient *RCloneClient - V2Client *V2Client + serverName string + replicaIdx uint32 + conn *grpc.ClientConn + callOptions []grpc.CallOption + logger log.FieldLogger + RCloneClient *RCloneClient + V2Client *V2Client replicaConfig *agent.ReplicaConfig - loadedModels map[string]*pbs.ModelDetails + loadedModels map[string]*pbs.ModelDetails } func ParseReplicConfig(json string) (*agent.ReplicaConfig, error) { config := agent.ReplicaConfig{} - err := protojson.Unmarshal([]byte(json),&config) + err := protojson.Unmarshal([]byte(json), &config) if err != nil { return nil, err } @@ -48,7 +49,7 @@ func NewClient(serverName string, rcloneClient *RCloneClient, v2Client *V2Client, replicaConfig *agent.ReplicaConfig, - inferenceSvcName string) (*Client, error) { + inferenceSvcName string) (*Client, error) { replicaConfig.InferenceSvc = inferenceSvcName replicaConfig.AvailableMemoryBytes = replicaConfig.MemoryBytes @@ -60,19 +61,19 @@ func NewClient(serverName string, return &Client{ schedulerHost: schedulerHost, schedulerPort: schedulerPort, - serverName: serverName, - replicaIdx: replicaIdx, - callOptions: opts, - logger: logger.WithField("Name","Client"), - RCloneClient: rcloneClient, - V2Client: v2Client, + serverName: serverName, + replicaIdx: replicaIdx, + callOptions: opts, + logger: logger.WithField("Name", "Client"), + RCloneClient: rcloneClient, + V2Client: v2Client, replicaConfig: replicaConfig, - loadedModels: make(map[string]*pbs.ModelDetails), + loadedModels: make(map[string]*pbs.ModelDetails), }, nil } func (c *Client) CreateConnection() error { - c.logger.Infof("Creating connection to %s:%d",c.schedulerHost,c.schedulerPort) + c.logger.Infof("Creating connection to %s:%d", c.schedulerHost, c.schedulerPort) conn, err := getConnection(c.schedulerHost, c.schedulerPort) if err != nil { return err @@ -81,7 +82,7 @@ func (c *Client) CreateConnection() error { return nil } -func ( c *Client) WaitReady() error { +func (c *Client) WaitReady() error { logFailure := func(err error, delay time.Duration) { c.logger.WithError(err).Errorf("Rclone not ready") } @@ -116,21 +117,20 @@ func getConnection(host string, port int) (*grpc.ClientConn, error) { return conn, nil } - func (c *Client) Start() error { c.logger.Infof("Call subscribe to scheduler") grpcClient := agent.NewAgentServiceClient(c.conn) var loadedModels []*pbs.ModelDetails - for _,v := range c.loadedModels { - loadedModels = append(loadedModels,v) + for _, v := range c.loadedModels { + loadedModels = append(loadedModels, v) } stream, err := grpcClient.Subscribe(context.Background(), &agent.AgentSubscribeRequest{ - ServerName: c.serverName, - ReplicaIdx: c.replicaIdx, + ServerName: c.serverName, + ReplicaIdx: c.replicaIdx, ReplicaConfig: c.replicaConfig, - LoadedModels: loadedModels, - Shared: true, - },grpc_retry.WithMax(100)) + LoadedModels: loadedModels, + Shared: true, + }, grpc_retry.WithMax(100)) if err != nil { return err } @@ -146,7 +146,7 @@ func (c *Client) Start() error { switch operation.Operation { case agent.ModelOperationMessage_LOAD_MODEL: c.logger.Infof("calling load model") - err := c.LoadModel(operation) + err := c.LoadModel(operation) if err != nil { c.logger.WithError(err).Errorf("Failed to handle load model") } @@ -164,17 +164,17 @@ func (c *Client) Start() error { func (c *Client) sendModelEventError(modelName string, modelVersion string, event agent.ModelEventMessage_Event, err error) error { grpcClient := agent.NewAgentServiceClient(c.conn) _, err = grpcClient.AgentEvent(context.Background(), &agent.ModelEventMessage{ - ServerName: c.serverName, - ReplicaIdx: c.replicaIdx, - ModelName: modelName, - Event: event, - Message: err.Error(), + ServerName: c.serverName, + ReplicaIdx: c.replicaIdx, + ModelName: modelName, + Event: event, + Message: err.Error(), AvailableMemoryBytes: c.replicaConfig.AvailableMemoryBytes, }) return err } -func (c *Client) LoadModel(request *agent.ModelOperationMessage) error { +func (c *Client) LoadModel(request *agent.ModelOperationMessage) error { c.mu.Lock() defer c.mu.Unlock() if request == nil || request.Details == nil { @@ -182,7 +182,7 @@ func (c *Client) LoadModel(request *agent.ModelOperationMessage) error { } modelName := request.Details.Name if request.Details.GetMemoryBytes() > c.replicaConfig.AvailableMemoryBytes { - err := fmt.Errorf("Not enough memory on replica for model %s available %d requested %d",modelName,c.replicaConfig.AvailableMemoryBytes, request.Details.GetMemoryBytes()) + err := fmt.Errorf("Not enough memory on replica for model %s available %d requested %d", modelName, c.replicaConfig.AvailableMemoryBytes, request.Details.GetMemoryBytes()) err2 := c.sendModelEventError(modelName, request.Details.GetVersion(), agent.ModelEventMessage_LOAD_FAILED, err) if err2 != nil { c.logger.WithError(err2).Errorf("Failed to send error back on load model") @@ -216,11 +216,11 @@ func (c *Client) LoadModel(request *agent.ModelOperationMessage) error { c.replicaConfig.AvailableMemoryBytes = c.replicaConfig.AvailableMemoryBytes - request.Details.GetMemoryBytes() grpcClient := agent.NewAgentServiceClient(c.conn) _, err = grpcClient.AgentEvent(context.Background(), &agent.ModelEventMessage{ - ServerName: c.serverName, - ReplicaIdx: c.replicaIdx, - ModelName: modelName, - ModelVersion: request.Details.GetVersion(), - Event: agent.ModelEventMessage_LOADED, + ServerName: c.serverName, + ReplicaIdx: c.replicaIdx, + ModelName: modelName, + ModelVersion: request.Details.GetVersion(), + Event: agent.ModelEventMessage_LOADED, AvailableMemoryBytes: c.replicaConfig.AvailableMemoryBytes, }) if err != nil { @@ -259,11 +259,11 @@ func (c *Client) UnloadModel(request *agent.ModelOperationMessage) error { c.replicaConfig.AvailableMemoryBytes = c.replicaConfig.AvailableMemoryBytes + loadedModel.GetMemoryBytes() grpcClient := agent.NewAgentServiceClient(c.conn) _, err = grpcClient.AgentEvent(context.Background(), &agent.ModelEventMessage{ - ServerName: c.serverName, - ReplicaIdx: c.replicaIdx, - ModelName: modelName, - ModelVersion: loadedModel.GetVersion(), - Event: agent.ModelEventMessage_UNLOADED, + ServerName: c.serverName, + ReplicaIdx: c.replicaIdx, + ModelName: modelName, + ModelVersion: loadedModel.GetVersion(), + Event: agent.ModelEventMessage_UNLOADED, AvailableMemoryBytes: c.replicaConfig.AvailableMemoryBytes, }) if err != nil { diff --git a/scheduler/pkg/agent/client_test.go b/scheduler/pkg/agent/client_test.go index 098220df7c..06707ca2ff 100644 --- a/scheduler/pkg/agent/client_test.go +++ b/scheduler/pkg/agent/client_test.go @@ -2,6 +2,9 @@ package agent import ( "context" + "net" + "testing" + "github.com/jarcoal/httpmock" . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" @@ -9,18 +12,16 @@ import ( log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/test/bufconn" - "net" - "testing" ) type mockAgentV2Server struct { models []string pb.UnimplementedAgentServiceServer - loadedEvents int - loadFailedEvents int - unloadedEvents int + loadedEvents int + loadFailedEvents int + unloadedEvents int unloadFailedEvents int - otherEvents int + otherEvents int } func dialerv2(mockAgentV2Server *mockAgentV2Server) func(context.Context, string) (net.Conn, error) { @@ -58,19 +59,21 @@ func (m *mockAgentV2Server) AgentEvent(ctx context.Context, message *pb.ModelEve } func (m mockAgentV2Server) Subscribe(request *pb.AgentSubscribeRequest, server pb.AgentService_SubscribeServer) error { - for _,model := range m.models { - server.Send(&pb.ModelOperationMessage{ + for _, model := range m.models { + err := server.Send(&pb.ModelOperationMessage{ Operation: pb.ModelOperationMessage_LOAD_MODEL, Details: &pbs.ModelDetails{ Name: model, - Uri: "gs://model", + Uri: "gs://model", }, }) + if err != nil { + return err + } } return nil } - func TestClientCreate(t *testing.T) { t.Logf("Started") logger := log.New() @@ -78,21 +81,21 @@ func TestClientCreate(t *testing.T) { g := NewGomegaWithT(t) type test struct { - models []string + models []string replicaConfig *pb.ReplicaConfig - v2Status int - rsStatus int + v2Status int + rsStatus int } tests := []test{ {models: []string{"model"}, replicaConfig: &pb.ReplicaConfig{}, v2Status: 200, rsStatus: 200}, {models: []string{"model"}, replicaConfig: &pb.ReplicaConfig{}, v2Status: 400, rsStatus: 200}, } - for _,test := range tests { + for _, test := range tests { httpmock.Activate() v2Client := createTestV2Client(test.models, test.v2Status) rcloneClient := createTestRCloneClient(test.rsStatus) - client, err := NewClient("mlserver",1, "scheduler",9002,logger,rcloneClient, v2Client, test.replicaConfig,"0.0.0.0") + client, err := NewClient("mlserver", 1, "scheduler", 9002, logger, rcloneClient, v2Client, test.replicaConfig, "0.0.0.0") g.Expect(err).To(BeNil()) mockAgentV2Server := &mockAgentV2Server{models: test.models} conn, err := grpc.DialContext(context.Background(), "", grpc.WithInsecure(), grpc.WithContextDialer(dialerv2(mockAgentV2Server))) @@ -120,46 +123,46 @@ func TestLoadModel(t *testing.T) { g := NewGomegaWithT(t) type test struct { - models []string - replicaConfig *pb.ReplicaConfig - op *pb.ModelOperationMessage + models []string + replicaConfig *pb.ReplicaConfig + op *pb.ModelOperationMessage expectedAvailableMemory uint64 - v2Status int - rsStatus int - success bool + v2Status int + rsStatus int + success bool } smallMemory := uint64(500) largeMemory := uint64(2000) tests := []test{ {models: []string{"iris"}, - op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, + op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, expectedAvailableMemory: 500, - v2Status: 200, - rsStatus: 200, - success: true}, // Success + v2Status: 200, + rsStatus: 200, + success: true}, // Success {models: []string{"iris"}, - op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, + op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, expectedAvailableMemory: 1000, - v2Status: 400, - rsStatus: 200, - success: false}, // Fail as V2 fail + v2Status: 400, + rsStatus: 200, + success: false}, // Fail as V2 fail {models: []string{"iris"}, - op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &largeMemory}}, - replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, + op: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &largeMemory}}, + replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, expectedAvailableMemory: 500, - v2Status: 200, - rsStatus: 200, - success: false}, // Fail due to too much memory required + v2Status: 200, + rsStatus: 200, + success: false}, // Fail due to too much memory required } - for tidx,test := range tests { - t.Logf("Test #%d",tidx) + for tidx, test := range tests { + t.Logf("Test #%d", tidx) httpmock.Activate() v2Client := createTestV2Client(test.models, test.v2Status) rcloneClient := createTestRCloneClient(test.rsStatus) - client, err := NewClient("mlserver",1, "scheduler",9002,logger,rcloneClient, v2Client, test.replicaConfig, "0.0.0.0") + client, err := NewClient("mlserver", 1, "scheduler", 9002, logger, rcloneClient, v2Client, test.replicaConfig, "0.0.0.0") g.Expect(err).To(BeNil()) mockAgentV2Server := &mockAgentV2Server{models: []string{}} conn, cerr := grpc.DialContext(context.Background(), "", grpc.WithInsecure(), grpc.WithContextDialer(dialerv2(mockAgentV2Server))) @@ -191,38 +194,38 @@ func TestUnloadModel(t *testing.T) { g := NewGomegaWithT(t) type test struct { - models []string - replicaConfig *pb.ReplicaConfig - loadOp *pb.ModelOperationMessage - unloadOp *pb.ModelOperationMessage + models []string + replicaConfig *pb.ReplicaConfig + loadOp *pb.ModelOperationMessage + unloadOp *pb.ModelOperationMessage expectedAvailableMemory uint64 - v2Status int - success bool + v2Status int + success bool } smallMemory := uint64(500) tests := []test{ {models: []string{"iris"}, - loadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - unloadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, + loadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + unloadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, expectedAvailableMemory: 1000, - v2Status: 200, - success: true}, // Success + v2Status: 200, + success: true}, // Success {models: []string{"iris"}, - loadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - unloadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris2", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, - replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, + loadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + unloadOp: &pb.ModelOperationMessage{Details: &pbs.ModelDetails{Name: "iris2", Uri: "gs://models/iris", MemoryBytes: &smallMemory}}, + replicaConfig: &pb.ReplicaConfig{MemoryBytes: 1000}, expectedAvailableMemory: 500, - v2Status: 200, - success: false}, // Fail to unload unknown model + v2Status: 200, + success: false}, // Fail to unload unknown model } - for tidx,test := range tests { - t.Logf("Test #%d",tidx) + for tidx, test := range tests { + t.Logf("Test #%d", tidx) httpmock.Activate() v2Client := createTestV2Client(test.models, test.v2Status) rcloneClient := createTestRCloneClient(200) - client, err := NewClient("mlserver",1, "scheduler",9002,logger,rcloneClient, v2Client, test.replicaConfig, "0.0.0.0") + client, err := NewClient("mlserver", 1, "scheduler", 9002, logger, rcloneClient, v2Client, test.replicaConfig, "0.0.0.0") g.Expect(err).To(BeNil()) mockAgentV2Server := &mockAgentV2Server{models: []string{}} conn, cerr := grpc.DialContext(context.Background(), "", grpc.WithInsecure(), grpc.WithContextDialer(dialerv2(mockAgentV2Server))) @@ -251,4 +254,4 @@ func TestUnloadModel(t *testing.T) { err = conn.Close() g.Expect(err).To(BeNil()) } -} \ No newline at end of file +} diff --git a/scheduler/pkg/agent/rclone.go b/scheduler/pkg/agent/rclone.go index 799c351855..c02353b5b3 100644 --- a/scheduler/pkg/agent/rclone.go +++ b/scheduler/pkg/agent/rclone.go @@ -4,26 +4,26 @@ import ( "bytes" "encoding/json" "fmt" - log "github.com/sirupsen/logrus" "io/ioutil" "net" "net/http" "net/url" "strconv" + + log "github.com/sirupsen/logrus" ) const ( ContentTypeJSON = "application/json" - ContentType = "Content-Type" + ContentType = "Content-Type" ) - type RCloneClient struct { - host string - port int - localPath string - httpClient *http.Client - logger log.FieldLogger + host string + port int + localPath string + httpClient *http.Client + logger log.FieldLogger } type Noop struct { @@ -31,25 +31,22 @@ type Noop struct { } type RcloneCopy struct { - SrcFs string `json:"srcFs"` - DstFs string `json:"dstFs"` - CreateEmptySrcDirs bool `json:"createEmptySrcDirs"` + SrcFs string `json:"srcFs"` + DstFs string `json:"dstFs"` + CreateEmptySrcDirs bool `json:"createEmptySrcDirs"` } - func NewRCloneClient(host string, port int, localPath string, logger log.FieldLogger) *RCloneClient { - logger.Infof("Rclone server %s:%d with model-repository:%s",host, port, localPath) + logger.Infof("Rclone server %s:%d with model-repository:%s", host, port, localPath) return &RCloneClient{ - host: host, - port: port, - localPath: localPath, + host: host, + port: port, + localPath: localPath, httpClient: http.DefaultClient, - logger: logger.WithField("Source","RCloneClient"), + logger: logger.WithField("Source", "RCloneClient"), } } - - func (r *RCloneClient) call(op []byte, path string) error { rcloneUrl := url.URL{ Scheme: "http", @@ -74,9 +71,9 @@ func (r *RCloneClient) call(op []byte, path string) error { if err != nil { return err } - r.logger.Printf("rclone response: %s",b) + r.logger.Printf("rclone response: %s", b) if response.StatusCode != http.StatusOK { - return fmt.Errorf("Failed rclone request to host:%s port:%d path:%s",r.host,r.port,path) + return fmt.Errorf("Failed rclone request to host:%s port:%d path:%s", r.host, r.port, path) } return nil } @@ -87,21 +84,20 @@ func (r *RCloneClient) Ready() error { if err != nil { return err } - return r.call(b ,"/rc/noop") + return r.call(b, "/rc/noop") } - func (r *RCloneClient) Copy(modelName string, src string) error { - dst := fmt.Sprintf("%s/%s",r.localPath,modelName) + dst := fmt.Sprintf("%s/%s", r.localPath, modelName) copy := RcloneCopy{ - SrcFs: src, - DstFs: dst, + SrcFs: src, + DstFs: dst, CreateEmptySrcDirs: true, } - r.logger.Infof("Copy from %s to %s",src, dst) + r.logger.Infof("Copy from %s to %s", src, dst) b, err := json.Marshal(copy) if err != nil { return err } - return r.call(b,"/sync/copy") + return r.call(b, "/sync/copy") } \ No newline at end of file diff --git a/scheduler/pkg/agent/rclone_test.go b/scheduler/pkg/agent/rclone_test.go index 3a33ce417f..97b44414e9 100644 --- a/scheduler/pkg/agent/rclone_test.go +++ b/scheduler/pkg/agent/rclone_test.go @@ -2,23 +2,24 @@ package agent import ( "fmt" + "testing" + "github.com/jarcoal/httpmock" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" - "testing" ) func createTestRCloneMockResponders(host string, port int, status int) { - httpmock.RegisterResponder("POST", fmt.Sprintf("=~http://%s:%d/",host, port), + httpmock.RegisterResponder("POST", fmt.Sprintf("=~http://%s:%d/", host, port), httpmock.NewStringResponder(status, `{}`)) } -func createTestRCloneClient(status int) *RCloneClient{ +func createTestRCloneClient(status int) *RCloneClient { logger := log.New() log.SetLevel(log.DebugLevel) host := "rclone-server" port := 5572 - r := NewRCloneClient(host, port, "/tmp/rclone", logger) + r := NewRCloneClient(host, port, "/tmp/rclone", logger) createTestRCloneMockResponders(host, port, status) return r } @@ -39,14 +40,14 @@ func TestRcloneCopy(t *testing.T) { g := NewGomegaWithT(t) type test struct { modelName string - uri string - status int + uri string + status int } - tests := []test { - {modelName: "iris", uri:"gs://seldon-models/sklearn/iris-0.23.2/lr_model", status: 200}, - {modelName: "iris", uri:"gs://seldon-models/sklearn/iris-0.23.2/lr_model", status: 400}, + tests := []test{ + {modelName: "iris", uri: "gs://seldon-models/sklearn/iris-0.23.2/lr_model", status: 200}, + {modelName: "iris", uri: "gs://seldon-models/sklearn/iris-0.23.2/lr_model", status: 400}, } - for _,test := range tests { + for _, test := range tests { httpmock.Activate() r := createTestRCloneClient(test.status) err := r.Copy(test.modelName, test.uri) diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index f8a07c81f1..d598dac167 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -3,6 +3,9 @@ package agent import ( "context" "fmt" + "net" + "sync" + pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" "github.com/seldonio/seldon-core/scheduler/pkg/envoy/processor" "github.com/seldonio/seldon-core/scheduler/pkg/scheduler" @@ -11,8 +14,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "net" - "sync" ) type ServerKey struct { @@ -25,14 +26,14 @@ type AgentHandler interface { } type Server struct { - mutext sync.RWMutex + mu sync.RWMutex pb.UnimplementedAgentServiceServer - logger log.FieldLogger - agents map[ServerKey]*AgentSubscriber - store store.SchedulerStore + logger log.FieldLogger + agents map[ServerKey]*AgentSubscriber + store store.SchedulerStore envoyHandler processor.EnvoyHandler - source chan string - scheduler scheduler.Scheduler + source chan string + scheduler scheduler.Scheduler } type SchedulerAgent interface { @@ -41,8 +42,8 @@ type SchedulerAgent interface { type AgentSubscriber struct { finished chan<- bool - mutext sync.Mutex // grpc streams are not thread safe for sendMsg https://github.com/grpc/grpc-go/issues/2355 - stream pb.AgentService_SubscribeServer + mu sync.Mutex // grpc streams are not thread safe for sendMsg https://github.com/grpc/grpc-go/issues/2355 + stream pb.AgentService_SubscribeServer } func NewAgentServer(logger log.FieldLogger, @@ -50,12 +51,12 @@ func NewAgentServer(logger log.FieldLogger, envoyHandler processor.EnvoyHandler, scheduler scheduler.Scheduler) *Server { return &Server{ - logger: logger.WithField("source","AgentServer"), - agents: make(map[ServerKey]*AgentSubscriber), - store: store, + logger: logger.WithField("source", "AgentServer"), + agents: make(map[ServerKey]*AgentSubscriber), + store: store, envoyHandler: envoyHandler, - source: make(chan string, 1), - scheduler: scheduler, + source: make(chan string, 1), + scheduler: scheduler, } } @@ -69,12 +70,11 @@ func (s *Server) StopAgentSync() { func (s *Server) ListenForSyncs() { for modelName := range s.source { - s.logger.Infof("Received sync for model %s",modelName) + s.logger.Infof("Received sync for model %s", modelName) go s.Sync(modelName) } } - func (s *Server) StartGrpcServer(agentPort uint) error { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", agentPort)) if err != nil { @@ -84,13 +84,13 @@ func (s *Server) StartGrpcServer(agentPort uint) error { grpcServer := grpc.NewServer(opts...) pb.RegisterAgentServiceServer(grpcServer, s) s.logger.Printf("Agent server running on %d", agentPort) - return grpcServer.Serve(lis) + return grpcServer.Serve(lis) } -func (s *Server) Sync(modelName string) { - logger := s.logger.WithField("func","Sync") - s.mutext.RLock() - defer s.mutext.RUnlock() +func (s *Server) Sync(modelName string) { + logger := s.logger.WithField("func", "Sync") + s.mu.RLock() + defer s.mu.RUnlock() model, err := s.store.GetModel(modelName) if err != nil { @@ -98,34 +98,36 @@ func (s *Server) Sync(modelName string) { return } if model == nil { - logger.Errorf("Model %s not found",modelName) + logger.Errorf("Model %s not found", modelName) return } // Handle any load requests for latest version latestModel := model.GetLatest() if latestModel != nil { - for _,replicaIdx := range latestModel.GetReplicaForState(store.LoadRequested) { + for _, replicaIdx := range latestModel.GetReplicaForState(store.LoadRequested) { logger.Infof("Sending load model request for %s", modelName) as, ok := s.agents[ServerKey{serverName: latestModel.Server(), replicaIdx: uint32(replicaIdx)}] if !ok { - logger.Errorf("Failed to find server replica for %s:%d",latestModel.Server(), replicaIdx) + logger.Errorf("Failed to find server replica for %s:%d", latestModel.Server(), replicaIdx) continue } - + as.mu.Lock() err = as.stream.Send(&pb.ModelOperationMessage{ Operation: pb.ModelOperationMessage_LOAD_MODEL, - Details: latestModel.Details(), + Details: latestModel.Details(), }) if err != nil { - logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d",modelName, replicaIdx) + as.mu.Unlock() + logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) continue } + as.mu.Unlock() err := s.store.UpdateModelState(latestModel.Key(), latestModel.GetVersion(), latestModel.Server(), replicaIdx, nil, store.Loading) if err != nil { - logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d",modelName,replicaIdx) + logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue } } @@ -133,34 +135,35 @@ func (s *Server) Sync(modelName string) { // Loop through all versions and unload any requested for _, modelVersion := range model.Versions { - for _,replicaIdx := range modelVersion.GetReplicaForState(store.UnloadRequested) { + for _, replicaIdx := range modelVersion.GetReplicaForState(store.UnloadRequested) { s.logger.Infof("Sending unload model request for %s", modelName) as, ok := s.agents[ServerKey{serverName: modelVersion.Server(), replicaIdx: uint32(replicaIdx)}] if !ok { - logger.Errorf("Failed to find server replica for %s:%d",modelVersion.Server(), replicaIdx) + logger.Errorf("Failed to find server replica for %s:%d", modelVersion.Server(), replicaIdx) continue } + as.mu.Lock() err = as.stream.Send(&pb.ModelOperationMessage{ Operation: pb.ModelOperationMessage_UNLOAD_MODEL, - Details: modelVersion.Details(), + Details: modelVersion.Details(), }) if err != nil { - logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d",modelName, replicaIdx) + as.mu.Unlock() + logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) continue } + as.mu.Unlock() err := s.store.UpdateModelState(modelVersion.Key(), modelVersion.GetVersion(), modelVersion.Server(), replicaIdx, nil, store.Unloading) if err != nil { - logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d",modelName,replicaIdx) + logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue } } } } - - func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) (*pb.ModelEventResponse, error) { - logger := s.logger.WithField("func","AgentEvent") + logger := s.logger.WithField("func", "AgentEvent") var state store.ModelReplicaState switch message.Event { case pb.ModelEventMessage_LOADED: @@ -173,10 +176,10 @@ func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) default: state = store.ModelReplicaStateUnknown } - logger.Infof("Updating state for model %s to %s",message.ModelName, state.String()) + logger.Infof("Updating state for model %s to %s", message.ModelName, state.String()) err := s.store.UpdateModelState(message.ModelName, message.GetModelVersion(), message.ServerName, int(message.ReplicaIdx), &message.AvailableMemoryBytes, state) if err != nil { - logger.Infof("Failed Updating state for model %s",message.ModelName) + logger.Infof("Failed Updating state for model %s", message.ModelName) return nil, status.Error(codes.Internal, err.Error()) } s.envoyHandler.SendEnvoySync(message.ModelName) @@ -184,17 +187,17 @@ func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) } func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentService_SubscribeServer) error { - logger := s.logger.WithField("func","Subscribe") - logger.Infof("Received subscribe request from %s:%d",request.ServerName, request.ReplicaIdx) + logger := s.logger.WithField("func", "Subscribe") + logger.Infof("Received subscribe request from %s:%d", request.ServerName, request.ReplicaIdx) fin := make(chan bool) - s.mutext.Lock() + s.mu.Lock() s.agents[ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}] = &AgentSubscriber{ finished: fin, - stream: stream, + stream: stream, } - s.mutext.Unlock() + s.mu.Unlock() err := s.syncMessage(request, stream) if err != nil { @@ -208,20 +211,20 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer case <-fin: logger.Infof("Closing stream for replica: %s:%d", request.ServerName, request.ReplicaIdx) return nil - case <- ctx.Done(): + case <-ctx.Done(): logger.Infof("Client replica %s:%d has disconnected", request.ServerName, request.ReplicaIdx) - s.mutext.Lock() - delete(s.agents,ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}) - s.mutext.Unlock() + s.mu.Lock() + delete(s.agents, ServerKey{serverName: request.ServerName, replicaIdx: request.ReplicaIdx}) + s.mu.Unlock() modelsChanged, err := s.store.RemoveServerReplica(request.ServerName, int(request.ReplicaIdx)) if err != nil { - logger.WithError(err).Errorf("Failed to remove replica and redeploy models for %s:%d",request.ServerName, request.ReplicaIdx) + logger.WithError(err).Errorf("Failed to remove replica and redeploy models for %s:%d", request.ServerName, request.ReplicaIdx) } s.logger.Debugf("Models changed by disconnect %v", modelsChanged) - for _,modelName := range modelsChanged { + for _, modelName := range modelsChanged { err = s.scheduler.Schedule(modelName) if err != nil { - logger.Debugf("Failed to reschedule model %s when server %s replica %d disconnected",modelName, request.ServerName, request.ReplicaIdx) + logger.Debugf("Failed to reschedule model %s when server %s replica %d disconnected", modelName, request.ServerName, request.ReplicaIdx) } else { s.SendAgentSync(modelName) } @@ -233,8 +236,8 @@ func (s *Server) Subscribe(request *pb.AgentSubscribeRequest, stream pb.AgentSer } func (s *Server) syncMessage(request *pb.AgentSubscribeRequest, stream pb.AgentService_SubscribeServer) error { - s.mutext.Lock() - defer s.mutext.Unlock() + s.mu.Lock() + defer s.mu.Unlock() err := s.store.AddServerReplica(request) if err != nil { @@ -244,11 +247,8 @@ func (s *Server) syncMessage(request *pb.AgentSubscribeRequest, stream pb.AgentS if err != nil { return err } - for _,updatedModels := range updatedModels { + for _, updatedModels := range updatedModels { s.SendAgentSync(updatedModels) } return nil } - - - diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 5aa09d24e2..62162e88be 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -2,13 +2,14 @@ package agent import ( "context" + "testing" + "time" + . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" "github.com/seldonio/seldon-core/scheduler/pkg/store" log "github.com/sirupsen/logrus" "google.golang.org/grpc/metadata" - "testing" - "time" ) type MockAgentServer struct { @@ -37,7 +38,7 @@ func (m MockAgentServer) SetTrailer(md metadata.MD) { } func (m MockAgentServer) Context() context.Context { - ctx, _ := context.WithTimeout(context.Background(), time.Duration(time.Millisecond*80)) + ctx, _ := context.WithTimeout(context.Background(), time.Millisecond*80) // nolint return ctx } @@ -83,10 +84,9 @@ func setupTestAgent() (*Server, *store.MemoryStore) { func TestSubscribe(t *testing.T) { g := NewGomegaWithT(t) - as,_ := setupTestAgent() + as, _ := setupTestAgent() mockStream := NewMockAgentServer() err := as.Subscribe(&pb.AgentSubscribeRequest{ServerName: "test", ReplicaIdx: 0, ReplicaConfig: &pb.ReplicaConfig{Capabilities: []string{"sklearn"}, MemoryBytes: 1000}}, mockStream) g.Expect(err).To(BeNil()) g.Expect(mockStream.sentMessages).To(Equal(0)) } - diff --git a/scheduler/pkg/agent/v2.go b/scheduler/pkg/agent/v2.go index 3823624a55..59dda58085 100644 --- a/scheduler/pkg/agent/v2.go +++ b/scheduler/pkg/agent/v2.go @@ -5,19 +5,20 @@ import ( "encoding/json" "errors" "fmt" - log "github.com/sirupsen/logrus" "io/ioutil" "net" "net/http" "net/url" "strconv" + + log "github.com/sirupsen/logrus" ) type V2Client struct { - host string - port int - httpClient *http.Client - logger log.FieldLogger + host string + port int + httpClient *http.Client + logger log.FieldLogger } type V2Error struct { @@ -27,12 +28,12 @@ type V2Error struct { var V2BadRequestErr = errors.New("V2 Bad Request") func NewV2Client(host string, port int, logger log.FieldLogger) *V2Client { - logger.Infof("V2 Inference Server %s:%d",host, port) + logger.Infof("V2 Inference Server %s:%d", host, port) return &V2Client{ - host: host, - port: port, + host: host, + port: port, httpClient: http.DefaultClient, - logger: logger.WithField("Source","V2Client"), + logger: logger.WithField("Source", "V2Client"), } } @@ -62,7 +63,7 @@ func (v *V2Client) call(path string) error { if err != nil { return err } - v.logger.Infof("v2 server response: %s",b) + v.logger.Infof("v2 server response: %s", b) if response.StatusCode != http.StatusOK { if response.StatusCode == http.StatusBadRequest { v2Error := V2Error{} @@ -70,7 +71,7 @@ func (v *V2Client) call(path string) error { if err != nil { return err } - return fmt.Errorf("%s. %w",v2Error.Error, V2BadRequestErr) + return fmt.Errorf("%s. %w", v2Error.Error, V2BadRequestErr) } else { return err } @@ -79,18 +80,18 @@ func (v *V2Client) call(path string) error { } func (v *V2Client) LoadModel(name string) error { - path := fmt.Sprintf("v2/repository/models/%s/load",name) - v.logger.Infof("Load request: %s",path) + path := fmt.Sprintf("v2/repository/models/%s/load", name) + v.logger.Infof("Load request: %s", path) return v.call(path) } func (v *V2Client) UnloadModel(name string) error { - path := fmt.Sprintf("v2/repository/models/%s/unload",name) - v.logger.Infof("Unload request: %s",path) + path := fmt.Sprintf("v2/repository/models/%s/unload", name) + v.logger.Infof("Unload request: %s", path) return v.call(path) } func (v *V2Client) Ready() error { - _,err := http.Get(v.getUrl("v2/health/ready").String()) + _, err := http.Get(v.getUrl("v2/health/ready").String()) return err } \ No newline at end of file diff --git a/scheduler/pkg/agent/v2_test.go b/scheduler/pkg/agent/v2_test.go index 104569ee9f..e28f17dbaf 100644 --- a/scheduler/pkg/agent/v2_test.go +++ b/scheduler/pkg/agent/v2_test.go @@ -3,16 +3,17 @@ package agent import ( "errors" "fmt" + "testing" + "github.com/jarcoal/httpmock" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" - "testing" ) func createTestV2ClientMockResponders(host string, port int, modelName string, status int) { - httpmock.RegisterResponder("POST", fmt.Sprintf("http://%s:%d/v2/repository/models/%s/load",host, port, modelName), + httpmock.RegisterResponder("POST", fmt.Sprintf("http://%s:%d/v2/repository/models/%s/load", host, port, modelName), httpmock.NewStringResponder(status, `{}`)) - httpmock.RegisterResponder("POST", fmt.Sprintf("http://%s:%d/v2/repository/models/%s/unload",host, port, modelName), + httpmock.RegisterResponder("POST", fmt.Sprintf("http://%s:%d/v2/repository/models/%s/unload", host, port, modelName), httpmock.NewStringResponder(status, `{}`)) } @@ -21,7 +22,7 @@ func createTestV2Client(models []string, status int) *V2Client { log.SetLevel(log.DebugLevel) host := "model-server" port := 8080 - v2 := NewV2Client(host, port, logger) + v2 := NewV2Client(host, port, logger) for _, model := range models { createTestV2ClientMockResponders(host, port, model, status) } @@ -33,16 +34,16 @@ func TestLoad(t *testing.T) { type test struct { models []string status int - err error + err error } - tests := []test { + tests := []test{ {models: []string{"iris"}, status: 200, err: nil}, {models: []string{"iris"}, status: 400, err: V2BadRequestErr}, } - for _,test := range tests { + for _, test := range tests { httpmock.Activate() r := createTestV2Client(test.models, test.status) - for _,model := range test.models { + for _, model := range test.models { err := r.LoadModel(model) if test.status == 200 { g.Expect(err).To(BeNil()) @@ -63,16 +64,16 @@ func TestUnload(t *testing.T) { type test struct { models []string status int - err error + err error } - tests := []test { + tests := []test{ {models: []string{"iris"}, status: 200, err: nil}, {models: []string{"iris"}, status: 400, err: V2BadRequestErr}, } - for _,test := range tests { + for _, test := range tests { httpmock.Activate() r := createTestV2Client(test.models, test.status) - for _,model := range test.models { + for _, model := range test.models { err := r.UnloadModel(model) if test.status == 200 { g.Expect(err).To(BeNil()) @@ -87,4 +88,3 @@ func TestUnload(t *testing.T) { httpmock.DeactivateAndReset() } } - diff --git a/scheduler/pkg/envoy/processor/hash.go b/scheduler/pkg/envoy/processor/hash.go index 5bca41040a..b2d4b4e1a0 100644 --- a/scheduler/pkg/envoy/processor/hash.go +++ b/scheduler/pkg/envoy/processor/hash.go @@ -17,10 +17,7 @@ func computeHashKeyForList(list []int) string { buffer.WriteString(",") } h := sha256.New() - h.Write([]byte(buffer.String())) + _, _ = h.Write(buffer.Bytes()) b := h.Sum(nil) return base64.StdEncoding.EncodeToString(b) } - - - diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index ebb7e2e6d0..4ccda88e38 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -3,6 +3,11 @@ package processor import ( "context" "fmt" + "math" + "math/rand" + "strconv" + "sync" + "github.com/envoyproxy/go-control-plane/pkg/cache/types" "github.com/envoyproxy/go-control-plane/pkg/cache/v3" rsrc "github.com/envoyproxy/go-control-plane/pkg/resource/v3" @@ -10,10 +15,6 @@ import ( "github.com/seldonio/seldon-core/scheduler/pkg/envoy/xdscache" "github.com/seldonio/seldon-core/scheduler/pkg/store" "github.com/sirupsen/logrus" - "math" - "math/rand" - "strconv" - "sync" ) type EnvoyHandler interface { @@ -25,11 +26,11 @@ type IncrementalProcessor struct { nodeID string // snapshotVersion holds the current version of the snapshot. snapshotVersion int64 - logger logrus.FieldLogger - xdsCache xdscache.SeldonXDSCache - mu sync.RWMutex - store store.SchedulerStore - source chan string + logger logrus.FieldLogger + xdsCache xdscache.SeldonXDSCache + mu sync.RWMutex + store store.SchedulerStore + source chan string } func NewIncrementalProcessor(cache cache.SnapshotCache, nodeID string, log logrus.FieldLogger, store store.SchedulerStore) *IncrementalProcessor { @@ -37,14 +38,14 @@ func NewIncrementalProcessor(cache cache.SnapshotCache, nodeID string, log logru cache: cache, nodeID: nodeID, snapshotVersion: rand.Int63n(1000), - logger: log.WithField("source","EnvoyServer"), + logger: log.WithField("source", "EnvoyServer"), xdsCache: xdscache.SeldonXDSCache{ Listeners: make(map[string]resources.Listener), Clusters: make(map[string]resources.Cluster), Routes: make(map[string]resources.Route), Endpoints: make(map[string]resources.Endpoint), }, - store: store, + store: store, source: make(chan string, 1), } ip.SetListener("seldon_http") @@ -60,9 +61,9 @@ func (s *IncrementalProcessor) StopEnvoySync() { } func (s *IncrementalProcessor) ListenForSyncs() { - logger := s.logger.WithField("func","ListenForSyncs") + logger := s.logger.WithField("func", "ListenForSyncs") for msg := range s.source { - logger.Debugf("Received sync for model %s",msg) + logger.Debugf("Received sync for model %s", msg) err := s.Sync(msg) if err != nil { logger.Errorf("Failed to process sync") @@ -70,7 +71,6 @@ func (s *IncrementalProcessor) ListenForSyncs() { } } - func (p *IncrementalProcessor) SetListener(listenerName string) { p.mu.Lock() defer p.mu.Unlock() @@ -92,14 +92,14 @@ func (p *IncrementalProcessor) newSnapshotVersion() string { } func (p *IncrementalProcessor) updateEnvoy() error { - logger := p.logger.WithField("func","updateEnvoy") + logger := p.logger.WithField("func", "updateEnvoy") // Create the snapshot that we'll serve to Envoy - snapshot,err := cache.NewSnapshot( + snapshot, err := cache.NewSnapshot( p.newSnapshotVersion(), // version map[rsrc.Type][]types.Resource{ - rsrc.ClusterType: p.xdsCache.ClusterContents(), // clusters - rsrc.RouteType: p.xdsCache.RouteContents(), // routes - rsrc.ListenerType: p.xdsCache.ListenerContents(), // listeners + rsrc.ClusterType: p.xdsCache.ClusterContents(), // clusters + rsrc.RouteType: p.xdsCache.RouteContents(), // routes + rsrc.ListenerType: p.xdsCache.ListenerContents(), // listeners }) if err != nil { return err @@ -125,28 +125,28 @@ func (p *IncrementalProcessor) removeModelForServerInEnvoy(modelName string) err } func (p *IncrementalProcessor) Sync(modelName string) error { - logger := p.logger.WithField("func","Sync") + logger := p.logger.WithField("func", "Sync") model, err := p.store.GetModel(modelName) if err != nil { logger.WithError(err).Errorf("Failed to sync model %s", modelName) return p.removeModelForServerInEnvoy(modelName) } if model == nil { - logger.Debugf("sync: No model - removing for %s",modelName) + logger.Debugf("sync: No model - removing for %s", modelName) return p.removeModelForServerInEnvoy(modelName) } latestModel := model.GetLatest() if latestModel == nil { - logger.Debugf("sync: No latest model - removing for %s",modelName) + logger.Debugf("sync: No latest model - removing for %s", modelName) return p.removeModelForServerInEnvoy(modelName) } if latestModel.NoLiveReplica() { - logger.Debugf("sync: No live model - removing for %s",modelName) + logger.Debugf("sync: No live model - removing for %s", modelName) return p.removeModelForServerInEnvoy(modelName) } server, err := p.store.GetServer(latestModel.Server()) - if server == nil { - logger.Debugf("sync: No server - removing for %s",modelName) + if err != nil || server == nil { + logger.Debugf("sync: No server - removing for %s", modelName) return p.removeModelForServerInEnvoy(modelName) } @@ -156,26 +156,26 @@ func (p *IncrementalProcessor) Sync(modelName string) error { clusterNameBase := server.Name + "_" + computeHashKeyForList(assignment) httpClusterName := clusterNameBase + "_http" grpcClusterName := clusterNameBase + "_grpc" - p.xdsCache.AddRoute(modelName,modelName,httpClusterName, grpcClusterName, latestModel.Details().LogPayloads) + p.xdsCache.AddRoute(modelName, modelName, httpClusterName, grpcClusterName, latestModel.Details().LogPayloads) if !p.xdsCache.HasCluster(httpClusterName) { p.xdsCache.AddCluster(httpClusterName, modelName, false) - for _,serverIdx := range assignment { + for _, serverIdx := range assignment { replica, ok := server.Replicas[serverIdx] if !ok { - return fmt.Errorf("Invalid replica index %d for server %s",serverIdx, server.Name) + return fmt.Errorf("Invalid replica index %d for server %s", serverIdx, server.Name) } p.xdsCache.AddEndpoint(httpClusterName, replica.GetInferenceSvc(), uint32(replica.GetInferenceHttpPort())) } } if !p.xdsCache.HasCluster(grpcClusterName) { p.xdsCache.AddCluster(grpcClusterName, modelName, true) - for _,serverIdx := range assignment { + for _, serverIdx := range assignment { replica, ok := server.Replicas[serverIdx] if !ok { - return fmt.Errorf("Invalid replica index %d for server %s",serverIdx, server.Name) + return fmt.Errorf("Invalid replica index %d for server %s", serverIdx, server.Name) } p.xdsCache.AddEndpoint(grpcClusterName, replica.GetInferenceSvc(), uint32(replica.GetInferenceGrpcPort())) } } return p.updateEnvoy() -} \ No newline at end of file +} diff --git a/scheduler/pkg/envoy/resources/cache.go b/scheduler/pkg/envoy/resources/cache.go index 99a3ff84fe..37e2b887f4 100644 --- a/scheduler/pkg/envoy/resources/cache.go +++ b/scheduler/pkg/envoy/resources/cache.go @@ -1,14 +1,14 @@ package resources type Listener struct { - Name string - Address string - Port uint32 + Name string + Address string + Port uint32 //RouteNames []string } type Route struct { - Name string + Name string Host string HttpCluster string GrpcCluster string @@ -19,7 +19,7 @@ type Cluster struct { Name string Grpc bool Endpoints map[string]Endpoint - Routes map[string]bool + Routes map[string]bool } type Endpoint struct { diff --git a/scheduler/pkg/envoy/resources/resource.go b/scheduler/pkg/envoy/resources/resource.go index f49a846efc..474145a3c4 100644 --- a/scheduler/pkg/envoy/resources/resource.go +++ b/scheduler/pkg/envoy/resources/resource.go @@ -15,11 +15,12 @@ package resources import ( + "time" + matcher "github.com/envoyproxy/go-control-plane/envoy/config/common/matcher/v3" envoy_extensions_common_tap_v3 "github.com/envoyproxy/go-control-plane/envoy/extensions/common/tap/v3" "google.golang.org/protobuf/types/known/anypb" "google.golang.org/protobuf/types/known/durationpb" - "time" cluster "github.com/envoyproxy/go-control-plane/envoy/config/cluster/v3" core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" @@ -37,7 +38,7 @@ import ( const ( RouteConfigurationName = "listener_0" SeldonLoggingHeader = "Seldon-Logging" - EnvoyLogPathPrefix = "/tmp/request-log" + EnvoyLogPathPrefix = "/tmp/request-log" ) func MakeCluster(clusterName string, eps []Endpoint, isGrpc bool) *cluster.Cluster { @@ -56,13 +57,13 @@ func MakeCluster(clusterName string, eps []Endpoint, isGrpc bool) *cluster.Clust panic(err) } return &cluster.Cluster{ - Name: clusterName, - ConnectTimeout: durationpb.New(5 * time.Second), - ClusterDiscoveryType: &cluster.Cluster_Type{Type: cluster.Cluster_STRICT_DNS}, - LbPolicy: cluster.Cluster_ROUND_ROBIN, - LoadAssignment: MakeEndpoint(clusterName, eps), - DnsLookupFamily: cluster.Cluster_V4_ONLY, - TypedExtensionProtocolOptions: map[string]*anypb.Any{"envoy.extensions.upstreams.http.v3.HttpProtocolOptions":hpoMarshalled}, + Name: clusterName, + ConnectTimeout: durationpb.New(5 * time.Second), + ClusterDiscoveryType: &cluster.Cluster_Type{Type: cluster.Cluster_STRICT_DNS}, + LbPolicy: cluster.Cluster_ROUND_ROBIN, + LoadAssignment: MakeEndpoint(clusterName, eps), + DnsLookupFamily: cluster.Cluster_V4_ONLY, + TypedExtensionProtocolOptions: map[string]*anypb.Any{"envoy.extensions.upstreams.http.v3.HttpProtocolOptions": hpoMarshalled}, } } return &cluster.Cluster{ @@ -70,8 +71,8 @@ func MakeCluster(clusterName string, eps []Endpoint, isGrpc bool) *cluster.Clust ConnectTimeout: durationpb.New(5 * time.Second), ClusterDiscoveryType: &cluster.Cluster_Type{Type: cluster.Cluster_STRICT_DNS}, LbPolicy: cluster.Cluster_ROUND_ROBIN, - LoadAssignment: MakeEndpoint(clusterName, eps), - DnsLookupFamily: cluster.Cluster_V4_ONLY, + LoadAssignment: MakeEndpoint(clusterName, eps), + DnsLookupFamily: cluster.Cluster_V4_ONLY, } } @@ -117,7 +118,7 @@ func MakeRoute(routes []Route) *route.RouteConfiguration { for _, r := range routes { rt := &route.Route{ - Name: r.Name+"_http", // Seems optional + Name: r.Name + "_http", // Seems optional Match: &route.RouteMatch{ PathSpecifier: &route.RouteMatch_Prefix{ Prefix: "/v2", @@ -149,12 +150,12 @@ func MakeRoute(routes []Route) *route.RouteConfiguration { } if r.LogPayloads { rt.ResponseHeadersToAdd = []*core.HeaderValueOption{ - {Header: &core.HeaderValue{Key: SeldonLoggingHeader,Value: "true"}}, + {Header: &core.HeaderValue{Key: SeldonLoggingHeader, Value: "true"}}, } } - rts = append(rts,rt) + rts = append(rts, rt) rt = &route.Route{ - Name: r.Name+"_grpc", // Seems optional + Name: r.Name + "_grpc", // Seems optional Match: &route.RouteMatch{ PathSpecifier: &route.RouteMatch_Prefix{ Prefix: "/inference.GRPCInferenceService", @@ -186,10 +187,10 @@ func MakeRoute(routes []Route) *route.RouteConfiguration { } if r.LogPayloads { rt.ResponseHeadersToAdd = []*core.HeaderValueOption{ - {Header: &core.HeaderValue{Key: SeldonLoggingHeader,Value: "true"}}, + {Header: &core.HeaderValue{Key: SeldonLoggingHeader, Value: "true"}}, } } - rts = append(rts,rt) + rts = append(rts, rt) } return &route.RouteConfiguration{ @@ -291,7 +292,6 @@ func MakeHTTPListener(listenerName, address string, port uint32) *listener.Liste panic(err) } - return &listener.Listener{ Name: listenerName, Address: &core.Address{ diff --git a/scheduler/pkg/envoy/xdscache/seldoncache.go b/scheduler/pkg/envoy/xdscache/seldoncache.go index 7a05b0fe28..c3b1bbaa62 100644 --- a/scheduler/pkg/envoy/xdscache/seldoncache.go +++ b/scheduler/pkg/envoy/xdscache/seldoncache.go @@ -2,13 +2,14 @@ package xdscache import ( "fmt" + "github.com/envoyproxy/go-control-plane/pkg/cache/types" "github.com/seldonio/seldon-core/scheduler/pkg/envoy/resources" ) const ( - DefaultListenerAddress = "0.0.0.0" - DefaultListenerPort uint32 = 9000 + DefaultListenerAddress = "0.0.0.0" + DefaultListenerPort uint32 = 9000 ) type SeldonXDSCache struct { @@ -23,7 +24,7 @@ func (xds *SeldonXDSCache) ClusterContents() []types.Resource { for _, c := range xds.Clusters { endpoints := make([]resources.Endpoint, 0, len(c.Endpoints)) - for _, value := range c.Endpoints { // Likely to be small (<100?) as is number of model replicas + for _, value := range c.Endpoints { // Likely to be small (<100?) as is number of model replicas endpoints = append(endpoints, value) } r = append(r, resources.MakeCluster(c.Name, endpoints, c.Grpc)) @@ -58,7 +59,7 @@ func (xds *SeldonXDSCache) EndpointsContents() []types.Resource { for _, c := range xds.Clusters { endpoints := make([]resources.Endpoint, 0, len(c.Endpoints)) - for _, value := range c.Endpoints { + for _, value := range c.Endpoints { endpoints = append(endpoints, value) } r = append(r, resources.MakeEndpoint(c.Name, endpoints)) @@ -67,12 +68,11 @@ func (xds *SeldonXDSCache) EndpointsContents() []types.Resource { return r } - func (xds *SeldonXDSCache) AddListener(name string) { xds.Listeners[name] = resources.Listener{ - Name: name, - Address: DefaultListenerAddress, - Port: DefaultListenerPort, + Name: name, + Address: DefaultListenerAddress, + Port: DefaultListenerPort, } } @@ -95,17 +95,16 @@ func (xds *SeldonXDSCache) AddCluster(name string, route string, isGrpc bool) { cluster, ok := xds.Clusters[name] if !ok { cluster = resources.Cluster{ - Name: name, + Name: name, Endpoints: make(map[string]resources.Endpoint), - Routes: make(map[string]bool), - Grpc: isGrpc, + Routes: make(map[string]bool), + Grpc: isGrpc, } } cluster.Routes[route] = true xds.Clusters[name] = cluster } - func (xds *SeldonXDSCache) RemoveRoute(modelName string) { route, ok := xds.Routes[modelName] if !ok { @@ -126,7 +125,7 @@ func (xds *SeldonXDSCache) RemoveRoute(modelName string) { func (xds *SeldonXDSCache) AddEndpoint(clusterName, upstreamHost string, upstreamPort uint32) { cluster := xds.Clusters[clusterName] - k := fmt.Sprintf("%s:%d",upstreamHost,upstreamPort) + k := fmt.Sprintf("%s:%d", upstreamHost, upstreamPort) cluster.Endpoints[k] = resources.Endpoint{ UpstreamHost: upstreamHost, UpstreamPort: upstreamPort, diff --git a/scheduler/pkg/scheduler/filters/replicamemory.go b/scheduler/pkg/scheduler/filters/replicamemory.go index ce8cb8362e..61349c8935 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory.go +++ b/scheduler/pkg/scheduler/filters/replicamemory.go @@ -7,6 +7,3 @@ type AvailableMemoryFilter struct{} func (r AvailableMemoryFilter) Filter(model *store.ModelVersion, replica *store.ServerReplica) bool { return model.GetRequiredMemory() <= replica.GetAvailableMemory() } - - - diff --git a/scheduler/pkg/scheduler/filters/replicamemory_test.go b/scheduler/pkg/scheduler/filters/replicamemory_test.go index c07adbb94e..05ff0c3468 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/filters/replicamemory_test.go @@ -1,48 +1,48 @@ package filters - import ( + "testing" + . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/pkg/store" - "testing" ) func getTestModelWithMemory(requiredmemory *uint64) *store.ModelVersion { return store.NewModelVersion( &pb.ModelDetails{ - Name: "model1", + Name: "model1", Requirements: []string{}, - MemoryBytes: requiredmemory, + MemoryBytes: requiredmemory, }, "server", - map[int]store.ModelReplicaState{3:store.Loading}, + map[int]store.ModelReplicaState{3: store.Loading}, false, store.ModelProgressing) } func getTestServerReplicaWithMemory(availableMemory uint64) *store.ServerReplica { - return store.NewServerReplica("svc", 8080, 5001, 1,nil, []string{},availableMemory,availableMemory,nil,true) + return store.NewServerReplica("svc", 8080, 5001, 1, nil, []string{}, availableMemory, availableMemory, nil, true) } func TestReplicaMemoryFilter(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string - model *store.ModelVersion - server *store.ServerReplica + name string + model *store.ModelVersion + server *store.ServerReplica expected bool } memory := uint64(100) - tests := []test { - {name: "EnoughMemory", model: getTestModelWithMemory(&memory), server: getTestServerReplicaWithMemory(100),expected: true}, - {name: "NoMemorySpecified", model: getTestModelWithMemory(nil), server: getTestServerReplicaWithMemory(200),expected: true}, - {name: "NotEnoughMemory", model: getTestModelWithMemory(&memory), server: getTestServerReplicaWithMemory(50),expected: false}, + tests := []test{ + {name: "EnoughMemory", model: getTestModelWithMemory(&memory), server: getTestServerReplicaWithMemory(100), expected: true}, + {name: "NoMemorySpecified", model: getTestModelWithMemory(nil), server: getTestServerReplicaWithMemory(200), expected: true}, + {name: "NotEnoughMemory", model: getTestModelWithMemory(&memory), server: getTestServerReplicaWithMemory(50), expected: false}, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { filter := AvailableMemoryFilter{} ok := filter.Filter(test.model, test.server) diff --git a/scheduler/pkg/scheduler/filters/replicarequirements.go b/scheduler/pkg/scheduler/filters/replicarequirements.go index 0275c97f99..70f2514a30 100644 --- a/scheduler/pkg/scheduler/filters/replicarequirements.go +++ b/scheduler/pkg/scheduler/filters/replicarequirements.go @@ -5,7 +5,7 @@ import "github.com/seldonio/seldon-core/scheduler/pkg/store" type RequirementsReplicaFilter struct{} func (s RequirementsReplicaFilter) Filter(model *store.ModelVersion, replica *store.ServerReplica) bool { - for _,requirement := range model.GetRequirements() { + for _, requirement := range model.GetRequirements() { requirementFound := false for _, capability := range replica.GetCapabilities() { if requirement == capability { @@ -14,11 +14,8 @@ func (s RequirementsReplicaFilter) Filter(model *store.ModelVersion, replica *st } } if !requirementFound { - return false - } + return false + } } return true } - - - diff --git a/scheduler/pkg/scheduler/filters/replicarequirements_test.go b/scheduler/pkg/scheduler/filters/replicarequirements_test.go index 4fe8ace6e3..910114f545 100644 --- a/scheduler/pkg/scheduler/filters/replicarequirements_test.go +++ b/scheduler/pkg/scheduler/filters/replicarequirements_test.go @@ -1,49 +1,48 @@ package filters - import ( + "testing" + . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/pkg/store" - "testing" ) func TestReplicaRequirementsFilter(t *testing.T) { g := NewGomegaWithT(t) - getTestModelWithRequirements := func (requirements []string) *store.ModelVersion { + getTestModelWithRequirements := func(requirements []string) *store.ModelVersion { return store.NewModelVersion( - &pb.ModelDetails{ - Name: "model1", - Requirements: requirements, - }, - "server", - map[int]store.ModelReplicaState{3:store.Loading}, - false, - store.ModelProgressing) + &pb.ModelDetails{ + Name: "model1", + Requirements: requirements, + }, + "server", + map[int]store.ModelReplicaState{3: store.Loading}, + false, + store.ModelProgressing) } getTestServerReplicaWithCaps := func(capabilities []string) *store.ServerReplica { - return store.NewServerReplica("svc", 8080, 5001, 1,nil, capabilities,100,100,nil,true) + return store.NewServerReplica("svc", 8080, 5001, 1, nil, capabilities, 100, 100, nil, true) } - type test struct { - name string - model *store.ModelVersion - server *store.ServerReplica + name string + model *store.ModelVersion + server *store.ServerReplica expected bool } - tests := []test { - {name: "Match", model: getTestModelWithRequirements([]string{"sklearn"}), server: getTestServerReplicaWithCaps([]string{"sklearn"}),expected: true}, - {name: "Mismatch", model: getTestModelWithRequirements([]string{"sklearn"}), server: getTestServerReplicaWithCaps([]string{"xgboost"}),expected: false}, - {name: "PartialMatch", model: getTestModelWithRequirements([]string{"sklearn", "xgboost"}), server: getTestServerReplicaWithCaps([]string{"xgboost"}),expected: false}, - {name: "MultiMatch", model: getTestModelWithRequirements([]string{"sklearn", "xgboost"}), server: getTestServerReplicaWithCaps([]string{"xgboost", "sklearn", "tensorflow"}),expected: true}, - {name: "Duplicates", model: getTestModelWithRequirements([]string{"sklearn", "xgboost", "sklearn"}), server: getTestServerReplicaWithCaps([]string{"xgboost", "sklearn", "tensorflow"}),expected: true}, + tests := []test{ + {name: "Match", model: getTestModelWithRequirements([]string{"sklearn"}), server: getTestServerReplicaWithCaps([]string{"sklearn"}), expected: true}, + {name: "Mismatch", model: getTestModelWithRequirements([]string{"sklearn"}), server: getTestServerReplicaWithCaps([]string{"xgboost"}), expected: false}, + {name: "PartialMatch", model: getTestModelWithRequirements([]string{"sklearn", "xgboost"}), server: getTestServerReplicaWithCaps([]string{"xgboost"}), expected: false}, + {name: "MultiMatch", model: getTestModelWithRequirements([]string{"sklearn", "xgboost"}), server: getTestServerReplicaWithCaps([]string{"xgboost", "sklearn", "tensorflow"}), expected: true}, + {name: "Duplicates", model: getTestModelWithRequirements([]string{"sklearn", "xgboost", "sklearn"}), server: getTestServerReplicaWithCaps([]string{"xgboost", "sklearn", "tensorflow"}), expected: true}, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { filter := RequirementsReplicaFilter{} ok := filter.Filter(test.model, test.server) diff --git a/scheduler/pkg/scheduler/filters/sharing.go b/scheduler/pkg/scheduler/filters/sharing.go index 5236768331..f4ea7a7ca1 100644 --- a/scheduler/pkg/scheduler/filters/sharing.go +++ b/scheduler/pkg/scheduler/filters/sharing.go @@ -6,7 +6,5 @@ type SharingServerFilter struct{} func (e SharingServerFilter) Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool { requestedServer := model.GetRequestedServer() - return (requestedServer == nil && server.Shared) || (requestedServer!= nil && *requestedServer == server.Name) + return (requestedServer == nil && server.Shared) || (requestedServer != nil && *requestedServer == server.Name) } - - diff --git a/scheduler/pkg/scheduler/filters/sharing_test.go b/scheduler/pkg/scheduler/filters/sharing_test.go index 1822bf8f27..b63d37eef7 100644 --- a/scheduler/pkg/scheduler/filters/sharing_test.go +++ b/scheduler/pkg/scheduler/filters/sharing_test.go @@ -1,49 +1,50 @@ package filters import ( + "testing" + . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/pkg/store" - "testing" ) func TestSharingFilter(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string - model *store.ModelVersion - server *store.ServerSnapshot + name string + model *store.ModelVersion + server *store.ServerSnapshot expected bool } serverName := "server1" modelExplicitServer := store.NewModelVersion( &pb.ModelDetails{ - Name: "model1", + Name: "model1", Server: &serverName, }, serverName, - map[int]store.ModelReplicaState{3:store.Loading}, + map[int]store.ModelReplicaState{3: store.Loading}, false, store.ModelProgressing) modelSharedServer := store.NewModelVersion( &pb.ModelDetails{ - Name: "model1", + Name: "model1", Server: nil, }, serverName, - map[int]store.ModelReplicaState{3:store.Loading}, + map[int]store.ModelReplicaState{3: store.Loading}, false, store.ModelProgressing) - tests := []test { - {name: "ModelAndServerMatchNotShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: false},expected: true}, - {name: "ModelAndServerMatchShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: true},expected: true}, - {name: "ModelAndServerDontMatch", model: modelExplicitServer, server: &store.ServerSnapshot{Name: "foo", Shared: true},expected: false}, - {name: "SharedModelAnyServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: true},expected: true}, - {name: "SharedModelNotSharedServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: false},expected: false}, + tests := []test{ + {name: "ModelAndServerMatchNotShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: false}, expected: true}, + {name: "ModelAndServerMatchShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: true}, expected: true}, + {name: "ModelAndServerDontMatch", model: modelExplicitServer, server: &store.ServerSnapshot{Name: "foo", Shared: true}, expected: false}, + {name: "SharedModelAnyServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: true}, expected: true}, + {name: "SharedModelNotSharedServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: false}, expected: false}, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { filter := SharingServerFilter{} ok := filter.Filter(test.model, test.server) diff --git a/scheduler/pkg/scheduler/interface.go b/scheduler/pkg/scheduler/interface.go index 1247300a63..85b18b4086 100644 --- a/scheduler/pkg/scheduler/interface.go +++ b/scheduler/pkg/scheduler/interface.go @@ -16,4 +16,3 @@ type ReplicaFilter interface { type ServerFilter interface { Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool } - diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index 6b197c51a4..1c166f475a 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -2,22 +2,23 @@ package scheduler import ( "fmt" + "sort" + "sync" + "github.com/seldonio/seldon-core/scheduler/pkg/scheduler/sorters" store "github.com/seldonio/seldon-core/scheduler/pkg/store" log "github.com/sirupsen/logrus" - "sort" - "sync" ) type SimpleScheduler struct { - mu sync.RWMutex - store store.SchedulerStore - logger log.FieldLogger - serverFilters []ServerFilter - serverSorts []sorters.ServerSorter + mu sync.RWMutex + store store.SchedulerStore + logger log.FieldLogger + serverFilters []ServerFilter + serverSorts []sorters.ServerSorter replicaFilters []ReplicaFilter - replicaSorts []sorters.ReplicaSorter - failedModels map[string]bool + replicaSorts []sorters.ReplicaSorter + failedModels map[string]bool } func NewSimpleScheduler(logger log.FieldLogger, @@ -27,13 +28,13 @@ func NewSimpleScheduler(logger log.FieldLogger, serverSorts []sorters.ServerSorter, replicaSorts []sorters.ReplicaSorter) *SimpleScheduler { return &SimpleScheduler{ - store: store, - logger: logger, - serverFilters: serverFilters, - serverSorts: serverSorts, + store: store, + logger: logger, + serverFilters: serverFilters, + serverSorts: serverSorts, replicaFilters: replicaFilters, - replicaSorts: replicaSorts, - failedModels: make(map[string]bool), + replicaSorts: replicaSorts, + failedModels: make(map[string]bool), } } @@ -53,10 +54,10 @@ func (s *SimpleScheduler) ScheduleFailedModels() ([]string, error) { s.mu.RLock() defer s.mu.RUnlock() var updatedModels []string - for modelName,_ := range s.failedModels { + for modelName := range s.failedModels { err := s.scheduleToServer(modelName) if err != nil { - s.logger.Debugf("Failed to schedule failed model %s",modelName) + s.logger.Debugf("Failed to schedule failed model %s", modelName) } else { updatedModels = append(updatedModels, modelName) } @@ -66,25 +67,25 @@ func (s *SimpleScheduler) ScheduleFailedModels() ([]string, error) { //TODO - clarify non shared models should not be scheduled func (s *SimpleScheduler) scheduleToServer(modelKey string) error { - s.logger.Debugf("Schedule model %s",modelKey) + s.logger.Debugf("Schedule model %s", modelKey) // Get Model model, err := s.store.GetModel(modelKey) if err != nil { return err } if model == nil { - fmt.Errorf("Can't find model with key %s",modelKey) + return fmt.Errorf("Can't find model with key %s", modelKey) } latestModel := model.GetLatest() if latestModel == nil { return fmt.Errorf("No latest model for %s", modelKey) } - if model.Deleted && latestModel.HasServer(){ - s.logger.Debugf("Model %s is deleted ensuring removed",modelKey) + if model.Deleted && latestModel.HasServer() { + s.logger.Debugf("Model %s is deleted ensuring removed", modelKey) err = s.store.UpdateLoadedModels(modelKey, latestModel.GetVersion(), latestModel.Server(), []*store.ServerReplica{}) if err != nil { - s.logger.Warnf("Failed to unschedule model replicas for model %s on server %s",modelKey,latestModel.Server()) + s.logger.Warnf("Failed to unschedule model replicas for model %s on server %s", modelKey, latestModel.Server()) } } else { // Get all servers @@ -96,9 +97,9 @@ func (s *SimpleScheduler) scheduleToServer(modelKey string) error { filteredServers := s.filterServers(latestModel, servers) s.sortServers(latestModel, filteredServers) ok := false - s.logger.Debugf("Model %s candidate servers %v",modelKey, filteredServers) + s.logger.Debugf("Model %s candidate servers %v", modelKey, filteredServers) // For each server filter and sort replicas and attempt schedule if enough replicas - for _,candidateServer := range filteredServers { + for _, candidateServer := range filteredServers { candidateReplicas := s.filterReplicas(latestModel, candidateServer) if len(candidateReplicas.ChosenReplicas) < latestModel.DesiredReplicas() { continue @@ -107,7 +108,7 @@ func (s *SimpleScheduler) scheduleToServer(modelKey string) error { err = s.store.UpdateLoadedModels(modelKey, latestModel.GetVersion(), candidateServer.Name, candidateReplicas.ChosenReplicas[0:latestModel.DesiredReplicas()]) if err != nil { - s.logger.Warnf("Failed to update model replicas for model %s on server %s",modelKey,candidateServer.Name) + s.logger.Warnf("Failed to update model replicas for model %s on server %s", modelKey, candidateServer.Name) } else { ok = true break @@ -123,23 +124,26 @@ func (s *SimpleScheduler) scheduleToServer(modelKey string) error { } func (s *SimpleScheduler) sortServers(model *store.ModelVersion, server []*store.ServerSnapshot) { - for _,sorter := range s.serverSorts { - sort.SliceStable(server, func(i, j int) bool { return sorter.IsLess(&sorters.CandidateServer{Model: model, Server: server[i]}, &sorters.CandidateServer{Model: model, Server: server[j]})}) + for _, sorter := range s.serverSorts { + sort.SliceStable(server, func(i, j int) bool { + return sorter.IsLess(&sorters.CandidateServer{Model: model, Server: server[i]}, &sorters.CandidateServer{Model: model, Server: server[j]}) + }) } } func (s *SimpleScheduler) sortReplicas(candidateServer *sorters.CandidateServer) { - for _,sorter := range s.replicaSorts { + for _, sorter := range s.replicaSorts { sort.SliceStable(candidateServer.ChosenReplicas, func(i, j int) bool { return sorter.IsLess(&sorters.CandidateReplica{Model: candidateServer.Model, Server: candidateServer.Server, Replica: candidateServer.ChosenReplicas[i]}, - &sorters.CandidateReplica{Model: candidateServer.Model, Server: candidateServer.Server, Replica: candidateServer.ChosenReplicas[j]})}) + &sorters.CandidateReplica{Model: candidateServer.Model, Server: candidateServer.Server, Replica: candidateServer.ChosenReplicas[j]}) + }) } } // Filter servers for this model func (s *SimpleScheduler) filterServers(model *store.ModelVersion, servers []*store.ServerSnapshot) []*store.ServerSnapshot { var filteredServers []*store.ServerSnapshot - for _,server := range servers { + for _, server := range servers { ok := true for _, serverFilter := range s.serverFilters { if !serverFilter.Filter(model, server) { @@ -154,12 +158,11 @@ func (s *SimpleScheduler) filterServers(model *store.ModelVersion, servers []*st return filteredServers } - func (s *SimpleScheduler) filterReplicas(model *store.ModelVersion, server *store.ServerSnapshot) *sorters.CandidateServer { candidateServer := sorters.CandidateServer{Model: model, Server: server} - for _,replica := range server.Replicas { + for _, replica := range server.Replicas { ok := true - for _,replicaFilter := range s.replicaFilters { + for _, replicaFilter := range s.replicaFilters { if !replicaFilter.Filter(model, replica) { ok = false break @@ -171,5 +174,3 @@ func (s *SimpleScheduler) filterReplicas(model *store.ModelVersion, server *stor } return &candidateServer } - - diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index d00f5bdda9..fd9395d6dc 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -1,6 +1,8 @@ package scheduler import ( + "testing" + . "github.com/onsi/gomega" "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" @@ -8,15 +10,12 @@ import ( "github.com/seldonio/seldon-core/scheduler/pkg/scheduler/sorters" "github.com/seldonio/seldon-core/scheduler/pkg/store" log "github.com/sirupsen/logrus" - "testing" ) - - type mockStore struct { - models map[string]*store.ModelSnapshot - servers []*store.ServerSnapshot - scheduledServer string + models map[string]*store.ModelSnapshot + servers []*store.ServerSnapshot + scheduledServer string scheduledReplicas []int } @@ -43,7 +42,7 @@ func (f mockStore) GetServer(serverKey string) (*store.ServerSnapshot, error) { func (f *mockStore) UpdateLoadedModels(modelKey string, version string, serverKey string, replicas []*store.ServerReplica) error { f.scheduledServer = serverKey var replicaIdxs []int - for _,rep := range(replicas) { + for _, rep := range replicas { replicaIdxs = append(replicaIdxs, rep.GetReplicaIdx()) } f.scheduledReplicas = replicaIdxs @@ -62,110 +61,105 @@ func (f mockStore) RemoveServerReplica(serverName string, replicaIdx int) ([]str panic("implement me") } - - func TestScheduler(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) newTestModel := func(name string, requiredMemory uint64, requirements []string, server *string, replicas uint32, loadedModels []int, deleted bool, scheduledServer string) *store.ModelSnapshot { config := &pb.ModelDetails{ - Name: name, - MemoryBytes: &requiredMemory, + Name: name, + MemoryBytes: &requiredMemory, Requirements: requirements, - Server: server, - Replicas: replicas, + Server: server, + Replicas: replicas, } rmap := make(map[int]store.ModelReplicaState) - for _,ridx := range loadedModels { + for _, ridx := range loadedModels { rmap[ridx] = store.Loaded } return &store.ModelSnapshot{ - Name: name, + Name: name, Versions: []*store.ModelVersion{store.NewModelVersion(config, scheduledServer, rmap, false, store.ModelProgressing)}, - Deleted: deleted, + Deleted: deleted, } } gsr := func(replicaIdx int, availableMemory uint64, capabilities []string) *store.ServerReplica { - return store.NewServerReplica("svc", 8080, 5001, replicaIdx,nil, capabilities,availableMemory,availableMemory,nil,true) + return store.NewServerReplica("svc", 8080, 5001, replicaIdx, nil, capabilities, availableMemory, availableMemory, nil, true) } - type test struct { - name string - model *store.ModelSnapshot - servers []*store.ServerSnapshot - serverFilters []ServerFilter - replicaFilters []ReplicaFilter - serverSorts []sorters.ServerSorter - replicaSort []sorters.ReplicaSorter - scheduled bool - scheduledServer string + name string + model *store.ModelSnapshot + servers []*store.ServerSnapshot + serverFilters []ServerFilter + replicaFilters []ReplicaFilter + serverSorts []sorters.ServerSorter + replicaSort []sorters.ReplicaSorter + scheduled bool + scheduledServer string scheduledReplicas []int } - tests := []test { + tests := []test{ { name: "SmokeTest", model: newTestModel("model1", 100, []string{"sklearn"}, nil, 1, []int{}, false, ""), servers: []*store.ServerSnapshot{ { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0:gsr(0, 200, []string{"sklearn"})}, - Shared: true, + Name: "server1", + Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"})}, + Shared: true, }, }, - serverFilters: []ServerFilter{filters.SharingServerFilter{}}, - replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, - serverSorts: []sorters.ServerSorter{}, - replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, - scheduled: true, - scheduledServer: "server1", + serverFilters: []ServerFilter{filters.SharingServerFilter{}}, + replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, + serverSorts: []sorters.ServerSorter{}, + replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, + scheduled: true, + scheduledServer: "server1", scheduledReplicas: []int{0}, - }, { name: "ReplicasTwo", model: newTestModel("model1", 100, []string{"sklearn"}, nil, 2, []int{}, false, ""), servers: []*store.ServerSnapshot{ { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0:gsr(0, 200, []string{"sklearn"})}, - Shared: true, + Name: "server1", + Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"})}, + Shared: true, }, { Name: "server2", Replicas: map[int]*store.ServerReplica{ - 0:gsr(0, 200, []string{"sklearn"}), - 1:gsr(1, 200, []string{"sklearn"}), + 0: gsr(0, 200, []string{"sklearn"}), + 1: gsr(1, 200, []string{"sklearn"}), }, Shared: true, }, }, - serverFilters: []ServerFilter{filters.SharingServerFilter{}}, - replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, - serverSorts: []sorters.ServerSorter{}, - replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{0,1}, - + serverFilters: []ServerFilter{filters.SharingServerFilter{}}, + replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, + serverSorts: []sorters.ServerSorter{}, + replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, + scheduled: true, + scheduledServer: "server2", + scheduledReplicas: []int{0, 1}, }, { name: "NotEnoughReplicas", model: newTestModel("model1", 100, []string{"sklearn"}, nil, 2, []int{}, false, ""), servers: []*store.ServerSnapshot{ { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0:gsr(0, 200, []string{"sklearn"})}, - Shared: true, + Name: "server1", + Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"})}, + Shared: true, }, { Name: "server2", Replicas: map[int]*store.ServerReplica{ - 0:gsr(0, 200, []string{"sklearn"}), - 1:gsr(1, 200, []string{"foo"}), + 0: gsr(0, 200, []string{"sklearn"}), + 1: gsr(1, 200, []string{"foo"}), }, Shared: true, }, @@ -174,60 +168,58 @@ func TestScheduler(t *testing.T) { replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, serverSorts: []sorters.ServerSorter{}, replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, - scheduled: false, + scheduled: false, }, { name: "MemoryOneServer", model: newTestModel("model1", 100, []string{"sklearn"}, nil, 1, []int{}, false, ""), servers: []*store.ServerSnapshot{ { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0:gsr(0, 50, []string{"sklearn"})}, - Shared: true, + Name: "server1", + Replicas: map[int]*store.ServerReplica{0: gsr(0, 50, []string{"sklearn"})}, + Shared: true, }, { Name: "server2", Replicas: map[int]*store.ServerReplica{ - 0:gsr(0, 200, []string{"sklearn"}), + 0: gsr(0, 200, []string{"sklearn"}), }, Shared: true, }, }, - serverFilters: []ServerFilter{filters.SharingServerFilter{}}, - replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, - serverSorts: []sorters.ServerSorter{}, - replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, - scheduled: true, - scheduledServer: "server2", + serverFilters: []ServerFilter{filters.SharingServerFilter{}}, + replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, + serverSorts: []sorters.ServerSorter{}, + replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, + scheduled: true, + scheduledServer: "server2", scheduledReplicas: []int{0}, - }, { name: "ModelsLoaded", model: newTestModel("model1", 100, []string{"sklearn"}, nil, 2, []int{1}, false, ""), servers: []*store.ServerSnapshot{ { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0:gsr(0, 50, []string{"sklearn"})}, - Shared: true, + Name: "server1", + Replicas: map[int]*store.ServerReplica{0: gsr(0, 50, []string{"sklearn"})}, + Shared: true, }, { Name: "server2", Replicas: map[int]*store.ServerReplica{ - 0:gsr(0, 200, []string{"sklearn"}), - 1:gsr(1, 200, []string{"sklearn"}), + 0: gsr(0, 200, []string{"sklearn"}), + 1: gsr(1, 200, []string{"sklearn"}), }, Shared: true, }, }, - serverFilters: []ServerFilter{filters.SharingServerFilter{}}, - replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, - serverSorts: []sorters.ServerSorter{}, - replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{1,0}, - + serverFilters: []ServerFilter{filters.SharingServerFilter{}}, + replicaFilters: []ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, + serverSorts: []sorters.ServerSorter{}, + replicaSort: []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}, + scheduled: true, + scheduledServer: "server2", + scheduledReplicas: []int{1, 0}, }, { name: "ModelUnLoaded", @@ -236,21 +228,19 @@ func TestScheduler(t *testing.T) { { Name: "server2", Replicas: map[int]*store.ServerReplica{ - 0:gsr(0, 200, []string{"sklearn"}), - 1:gsr(1, 200, []string{"sklearn"}), + 0: gsr(0, 200, []string{"sklearn"}), + 1: gsr(1, 200, []string{"sklearn"}), }, Shared: true, }, }, - serverFilters: []ServerFilter{}, - replicaFilters: []ReplicaFilter{}, - serverSorts: []sorters.ServerSorter{}, - replicaSort: []sorters.ReplicaSorter{}, - scheduled: true, - scheduledServer: "server2", + serverFilters: []ServerFilter{}, + replicaFilters: []ReplicaFilter{}, + serverSorts: []sorters.ServerSorter{}, + replicaSort: []sorters.ReplicaSorter{}, + scheduled: true, + scheduledServer: "server2", scheduledReplicas: nil, - - }, } @@ -258,12 +248,12 @@ func TestScheduler(t *testing.T) { modelMap := make(map[string]*store.ModelSnapshot) modelMap[model.Name] = model return &mockStore{ - models: modelMap, - servers: servers, - } + models: modelMap, + servers: servers, + } } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { //t.Logf("Running schedule test %d",tidx) mockStore := newMockStore(test.model, test.servers) @@ -278,7 +268,6 @@ func TestScheduler(t *testing.T) { } }) - } } \ No newline at end of file diff --git a/scheduler/pkg/scheduler/sorters/interface.go b/scheduler/pkg/scheduler/sorters/interface.go index dd23e2e942..0b733156fd 100644 --- a/scheduler/pkg/scheduler/sorters/interface.go +++ b/scheduler/pkg/scheduler/sorters/interface.go @@ -20,4 +20,4 @@ type ServerSorter interface { type ReplicaSorter interface { IsLess(i *CandidateReplica, j *CandidateReplica) bool -} \ No newline at end of file +} diff --git a/scheduler/pkg/scheduler/sorters/loaded.go b/scheduler/pkg/scheduler/sorters/loaded.go index 202777b381..af6a94ed5f 100644 --- a/scheduler/pkg/scheduler/sorters/loaded.go +++ b/scheduler/pkg/scheduler/sorters/loaded.go @@ -7,6 +7,3 @@ func (m ModelAlreadyLoadedSorter) IsLess(i *CandidateReplica, j *CandidateReplic jIsLoading := j.Model.IsLoading(j.Replica.GetReplicaIdx()) return iIsLoading && !jIsLoading } - - - diff --git a/scheduler/pkg/scheduler/sorters/loaded_test.go b/scheduler/pkg/scheduler/sorters/loaded_test.go index 85a5b395af..400bc6ad07 100644 --- a/scheduler/pkg/scheduler/sorters/loaded_test.go +++ b/scheduler/pkg/scheduler/sorters/loaded_test.go @@ -1,17 +1,18 @@ package sorters import ( - . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/scheduler/pkg/store" "sort" "testing" + + . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/scheduler/pkg/store" ) func TestModelAlreadyLoadedSort(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string + name string replicas []*CandidateReplica ordering []int } @@ -19,22 +20,22 @@ func TestModelAlreadyLoadedSort(t *testing.T) { model := store.NewModelVersion( nil, "server1", - map[int]store.ModelReplicaState{3:store.Loading}, + map[int]store.ModelReplicaState{3: store.Loading}, false, store.ModelProgressing) - tests := []test { + tests := []test{ { name: "OneLoadedModel", replicas: []*CandidateReplica{ - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 2,nil,[]string{},100,100,map[string]bool{},true)}, - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 1,nil,[]string{},100,100,map[string]bool{},true)}, - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 3,nil,[]string{},100,100,map[string]bool{},true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 2, nil, []string{}, 100, 100, map[string]bool{}, true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 1, nil, []string{}, 100, 100, map[string]bool{}, true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 3, nil, []string{}, 100, 100, map[string]bool{}, true)}, }, - ordering: []int{3,2,1}, + ordering: []int{3, 2, 1}, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { sorter := ModelAlreadyLoadedSorter{} sort.SliceStable(test.replicas, func(i, j int) bool { return sorter.IsLess(test.replicas[i], test.replicas[j]) }) diff --git a/scheduler/pkg/scheduler/sorters/replicamemory_test.go b/scheduler/pkg/scheduler/sorters/replicamemory_test.go index 80baf776eb..c6f15bca0f 100644 --- a/scheduler/pkg/scheduler/sorters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/sorters/replicamemory_test.go @@ -1,18 +1,18 @@ package sorters - import ( - . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/scheduler/pkg/store" "sort" "testing" + + . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/scheduler/pkg/store" ) func TestMemoryReplicaFilter(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string + name string replicas []*CandidateReplica ordering []int } @@ -20,26 +20,26 @@ func TestMemoryReplicaFilter(t *testing.T) { model := store.NewModelVersion( nil, "server1", - map[int]store.ModelReplicaState{3:store.Loading}, + map[int]store.ModelReplicaState{3: store.Loading}, false, store.ModelProgressing) - tests := []test { + tests := []test{ { name: "ThreeReplicasDifferentMemory", replicas: []*CandidateReplica{ - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 1,nil,[]string{},100,100,map[string]bool{},true)}, - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 2,nil,[]string{},100,200,map[string]bool{},true)}, - {Model: model, Replica: store.NewServerReplica("",8080, 5001, 3,nil,[]string{},100,150,map[string]bool{},true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 1, nil, []string{}, 100, 100, map[string]bool{}, true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 2, nil, []string{}, 100, 200, map[string]bool{}, true)}, + {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 3, nil, []string{}, 100, 150, map[string]bool{}, true)}, }, - ordering: []int{2,3,1}, + ordering: []int{2, 3, 1}, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { sorter := AvailableMemorySorter{} - sort.SliceStable(test.replicas, func(i, j int) bool {return sorter.IsLess(test.replicas[i], test.replicas[j])}) - for idx,expected := range test.ordering { + sort.SliceStable(test.replicas, func(i, j int) bool { return sorter.IsLess(test.replicas[i], test.replicas[j]) }) + for idx, expected := range test.ordering { g.Expect(test.replicas[idx].Replica.GetReplicaIdx()).To(Equal(expected)) } }) diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index 420519511f..879a1eab72 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -3,6 +3,8 @@ package server import ( "context" "fmt" + "net" + pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/pkg/agent" scheduler2 "github.com/seldonio/seldon-core/scheduler/pkg/scheduler" @@ -11,7 +13,6 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "net" ) var ( @@ -20,7 +21,7 @@ var ( type SchedulerServer struct { pb.UnimplementedSchedulerServer - logger log.FieldLogger + logger log.FieldLogger store store.SchedulerStore scheduler scheduler2.Scheduler agentHandler agent.AgentHandler @@ -30,7 +31,7 @@ func (s SchedulerServer) SubscribeModelEvents(req *pb.ModelSubscriptionRequest, panic("implement me") } -func(s SchedulerServer) StartGrpcServer(schedulerPort uint) error { +func (s SchedulerServer) StartGrpcServer(schedulerPort uint) error { lis, err := net.Listen("tcp", fmt.Sprintf(":%d", schedulerPort)) if err != nil { log.Fatalf("failed to create listener: %v", err) @@ -39,23 +40,23 @@ func(s SchedulerServer) StartGrpcServer(schedulerPort uint) error { grpcServer := grpc.NewServer(opts...) pb.RegisterSchedulerServer(grpcServer, s) s.logger.Printf("Scheduler server running on %d", schedulerPort) - return grpcServer.Serve(lis) + return grpcServer.Serve(lis) } func NewSchedulerServer(logger log.FieldLogger, store store.SchedulerStore, scheduler scheduler2.Scheduler, agentHandler agent.AgentHandler) *SchedulerServer { s := &SchedulerServer{ - logger: logger, - store: store, - scheduler: scheduler, + logger: logger, + store: store, + scheduler: scheduler, agentHandler: agentHandler, } return s } func (s SchedulerServer) LoadModel(ctx context.Context, req *pb.LoadModelRequest) (*pb.LoadModelResponse, error) { - logger := s.logger.WithField("func","LoadModel") - logger.Debugf("Load model %s",req.GetModel().GetName()) + logger := s.logger.WithField("func", "LoadModel") + logger.Debugf("Load model %s", req.GetModel().GetName()) err := s.store.UpdateModel(req.GetModel()) if err != nil { return nil, status.Errorf(codes.FailedPrecondition, err.Error()) @@ -69,8 +70,8 @@ func (s SchedulerServer) LoadModel(ctx context.Context, req *pb.LoadModelRequest } func (s SchedulerServer) UnloadModel(ctx context.Context, reference *pb.ModelReference) (*pb.UnloadModelResponse, error) { - logger := s.logger.WithField("func","UnloadModel") - logger.Debugf("Unload model %s",reference.Name) + logger := s.logger.WithField("func", "UnloadModel") + logger.Debugf("Unload model %s", reference.Name) err := s.store.RemoveModel(reference.Name) if err != nil { return nil, status.Errorf(codes.FailedPrecondition, err.Error()) @@ -89,16 +90,16 @@ func (s SchedulerServer) ModelStatus(ctx context.Context, reference *pb.ModelRef return nil, status.Errorf(codes.FailedPrecondition, err.Error()) } if model == nil { - return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find model %s",reference.Name)) + return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find model %s", reference.Name)) } latestModel := model.GetLatest() if latestModel == nil { - return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find model %s",reference.Name)) + return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find model %s", reference.Name)) } return &pb.ModelStatusResponse{ - ModelName: reference.Name, - Version: latestModel.GetVersion(), - ServerName: latestModel.Server(), + ModelName: reference.Name, + Version: latestModel.GetVersion(), + ServerName: latestModel.Server(), ModelReplicaState: latestModel.ReplicaState(), }, nil } @@ -109,7 +110,7 @@ func (s SchedulerServer) ServerStatus(ctx context.Context, reference *pb.ServerR return nil, status.Errorf(codes.FailedPrecondition, err.Error()) } if server == nil { - return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find server %s",reference.Name)) + return nil, status.Errorf(codes.FailedPrecondition, fmt.Sprintf("Failed to find server %s", reference.Name)) } ss := &pb.ServerStatusResponse{ ServerName: reference.Name, @@ -117,7 +118,7 @@ func (s SchedulerServer) ServerStatus(ctx context.Context, reference *pb.ServerR for _, replica := range server.Replicas { ss.Resources = append(ss.Resources, &pb.ServerResources{ - Memory: replica.GetMemory(), + Memory: replica.GetMemory(), AvailableMemoryBytes: replica.GetAvailableMemory(), }) } diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 90a4eba3f8..d24fdf90c0 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -2,6 +2,8 @@ package server import ( "context" + "testing" + . "github.com/onsi/gomega" pba "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" @@ -12,15 +14,13 @@ import ( log "github.com/sirupsen/logrus" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "testing" ) - type mockAgentHandler struct { numSyncs int } -func (m *mockAgentHandler) SendAgentSync(modelName string){ +func (m *mockAgentHandler) SendAgentSync(modelName string) { m.numSyncs++ } @@ -28,7 +28,7 @@ func TestLoadModel(t *testing.T) { t.Logf("Started") g := NewGomegaWithT(t) - createTestScheduler := func() (*SchedulerServer, *mockAgentHandler){ + createTestScheduler := func() (*SchedulerServer, *mockAgentHandler) { logger := log.New() log.SetLevel(log.DebugLevel) schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore()) @@ -44,49 +44,49 @@ func TestLoadModel(t *testing.T) { } type test struct { - name string - req []*pba.AgentSubscribeRequest + name string + req []*pba.AgentSubscribeRequest model *pb.ModelDetails - code codes.Code + code codes.Code } smallMemory := uint64(100) largeMemory := uint64(2000) - tests := []test { + tests := []test{ { name: "Simple", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 1}, - code: codes.OK}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 1}, + code: codes.OK}, { name: "TooManyReplicas", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 2}, - code: codes.FailedPrecondition}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 2}, + code: codes.FailedPrecondition}, { name: "TooMuchMemory", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&largeMemory, Replicas: 1}, - code: codes.FailedPrecondition}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &largeMemory, Replicas: 1}, + code: codes.FailedPrecondition}, { name: "FailedRequirements", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"xgboost"}, MemoryBytes:&smallMemory, Replicas: 1}, - code: codes.FailedPrecondition}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"xgboost"}, MemoryBytes: &smallMemory, Replicas: 1}, + code: codes.FailedPrecondition}, { name: "MultipleRequirements", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, - ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn","xgboost"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"xgboost","sklearn"}, MemoryBytes:&smallMemory, Replicas: 1}, - code: codes.OK}, + ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn", "xgboost"}}}}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"xgboost", "sklearn"}, MemoryBytes: &smallMemory, Replicas: 1}, + code: codes.OK}, { name: "TwoReplicas", req: []*pba.AgentSubscribeRequest{ @@ -94,8 +94,8 @@ func TestLoadModel(t *testing.T) { ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}, {ServerName: "server1", ReplicaIdx: 1, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 2}, - code: codes.OK}, + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 2}, + code: codes.OK}, { name: "TwoReplicasFail", req: []*pba.AgentSubscribeRequest{ @@ -103,10 +103,10 @@ func TestLoadModel(t *testing.T) { ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}, {ServerName: "server1", ReplicaIdx: 1, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, MemoryBytes: 1000, AvailableMemoryBytes: 1000, Capabilities: []string{"foo"}}}}, - model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 2}, - code: codes.FailedPrecondition}, // schedule to 2 replicas but 1 fails + model: &pb.ModelDetails{Name: "model1", Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 2}, + code: codes.FailedPrecondition}, // schedule to 2 replicas but 1 fails } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { s, mockAgent := createTestScheduler() for _, repReq := range test.req { @@ -135,7 +135,7 @@ func TestUnloadModel(t *testing.T) { t.Logf("Started") g := NewGomegaWithT(t) - createTestScheduler := func() (*SchedulerServer, *mockAgentHandler){ + createTestScheduler := func() (*SchedulerServer, *mockAgentHandler) { logger := log.New() log.SetLevel(log.DebugLevel) schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore()) @@ -146,28 +146,28 @@ func TestUnloadModel(t *testing.T) { []scheduler2.ReplicaFilter{filters.RequirementsReplicaFilter{}, filters.AvailableMemoryFilter{}}, []sorters.ServerSorter{}, []sorters.ReplicaSorter{sorters.ModelAlreadyLoadedSorter{}}) - s := NewSchedulerServer(logger, schedulerStore, scheduler,mockAgent) + s := NewSchedulerServer(logger, schedulerStore, scheduler, mockAgent) return s, mockAgent } type test struct { - name string - req []*pba.AgentSubscribeRequest - model *pb.ModelDetails - code codes.Code + name string + req []*pba.AgentSubscribeRequest + model *pb.ModelDetails + code codes.Code modelReplicaStates map[int]store.ModelReplicaState } modelName := "model1" smallMemory := uint64(100) - tests := []test { + tests := []test{ { name: "Simple", req: []*pba.AgentSubscribeRequest{ {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: modelName, Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 1}, - code: codes.OK, - modelReplicaStates: map[int]store.ModelReplicaState{0:store.UnloadRequested}, + model: &pb.ModelDetails{Name: modelName, Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 1}, + code: codes.OK, + modelReplicaStates: map[int]store.ModelReplicaState{0: store.UnloadRequested}, }, { name: "Multiple", @@ -185,8 +185,8 @@ func TestUnloadModel(t *testing.T) { ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}, {ServerName: "server1", ReplicaIdx: 1, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, - model: &pb.ModelDetails{Name: modelName, Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes:&smallMemory, Replicas: 2}, - code: codes.OK, + model: &pb.ModelDetails{Name: modelName, Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory, Replicas: 2}, + code: codes.OK, modelReplicaStates: map[int]store.ModelReplicaState{0: store.UnloadRequested, 1: store.UnloadRequested}, }, { @@ -195,9 +195,9 @@ func TestUnloadModel(t *testing.T) { {ServerName: "server1", ReplicaIdx: 0, Shared: true, ReplicaConfig: &pba.ReplicaConfig{InferenceSvc: "server1", InferenceHttpPort: 1, AvailableMemoryBytes: 1000, Capabilities: []string{"sklearn"}}}}, model: nil, - code: codes.FailedPrecondition}, + code: codes.FailedPrecondition}, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { s, mockAgent := createTestScheduler() for _, repReq := range test.req { @@ -226,7 +226,7 @@ func TestUnloadModel(t *testing.T) { ms, err := s.store.GetModel(modelName) g.Expect(err).To(BeNil()) g.Expect(mockAgent.numSyncs).To(Equal(2)) - for replicaIdx,state := range test.modelReplicaStates { + for replicaIdx, state := range test.modelReplicaStates { g.Expect(ms.GetLatest().GetModelReplicaState(replicaIdx)).To(Equal(state)) } } diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 8fcf4245f0..bffde3b228 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -2,10 +2,11 @@ package store import ( "fmt" + "sync" + "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" log "github.com/sirupsen/logrus" - "sync" ) type MemoryStore struct { @@ -16,19 +17,18 @@ type MemoryStore struct { func NewMemoryStore(logger log.FieldLogger, store *LocalSchedulerStore) *MemoryStore { return &MemoryStore{ - store: store, - logger: logger.WithField("source","MemoryStore"), + store: store, + logger: logger.WithField("source", "MemoryStore"), } } - -func (m *MemoryStore) updateModelImpl(config *pb.ModelDetails, addAsLatest bool) (*ModelVersion, error) { +func (m *MemoryStore) updateModelImpl(config *pb.ModelDetails, addAsLatest bool) (*ModelVersion, error) { model, ok := m.store.models[config.Name] if !ok { model = NewModel() m.store.models[config.Name] = model } - if existingModelVersion,ok := model.versionMap[config.Version]; !ok { + if existingModelVersion, ok := model.versionMap[config.Version]; !ok { modelVersion := NewDefaultModelVersion(config) model.versionMap[config.Version] = modelVersion if addAsLatest { @@ -39,7 +39,7 @@ func (m *MemoryStore) updateModelImpl(config *pb.ModelDetails, addAsLatest bool) return modelVersion, nil } else { if addAsLatest { - return nil, fmt.Errorf("Model %s version %s already exists. %w",config.Name,config.Version, ModelVersionExistsErr) + return nil, fmt.Errorf("Model %s version %s already exists. %w", config.Name, config.Version, ModelVersionExistsErr) } else { return existingModelVersion, nil } @@ -60,14 +60,14 @@ func (m *MemoryStore) GetModel(key string) (*ModelSnapshot, error) { model, ok := m.store.models[key] if ok { return &ModelSnapshot{ - Name: key, + Name: key, Versions: model.versions, //TODO make a copy for safety? - Deleted: model.deleted, + Deleted: model.deleted, }, nil } else { return &ModelSnapshot{ - Name: key, - Versions:nil, + Name: key, + Versions: nil, }, nil } @@ -81,12 +81,14 @@ func (m *MemoryStore) RemoveModel(modelKey string) error { return nil } - func createServerSnapshot(server *Server) *ServerSnapshot { + if server == nil { + return nil + } return &ServerSnapshot{ - Name: server.name, + Name: server.name, Replicas: server.replicas, - Shared: server.shared, + Shared: server.shared, } } @@ -94,7 +96,7 @@ func (m *MemoryStore) GetServers() ([]*ServerSnapshot, error) { m.mu.RLock() defer m.mu.RUnlock() var servers []*ServerSnapshot - for _,server := range m.store.servers { + for _, server := range m.store.servers { servers = append(servers, createServerSnapshot(server)) } return servers, nil @@ -103,7 +105,10 @@ func (m *MemoryStore) GetServers() ([]*ServerSnapshot, error) { func (m *MemoryStore) GetServer(serverKey string) (*ServerSnapshot, error) { m.mu.RLock() defer m.mu.RUnlock() - server, _ := m.store.servers[serverKey] + server, ok := m.store.servers[serverKey] + if !ok { + return nil, fmt.Errorf("failed to find server %s", serverKey) + } return createServerSnapshot(server), nil } @@ -115,10 +120,10 @@ func (m *MemoryStore) getModelServer(modelKey string, version string, serverKey } modelVersion := model.Latest() if modelVersion == nil { - return nil, nil, nil, fmt.Errorf("No latest version for model %s",modelKey) + return nil, nil, nil, fmt.Errorf("No latest version for model %s", modelKey) } if modelVersion.config.Version != version { - return nil, nil, nil, fmt.Errorf("Model version is not match found %s but was trying to update %s. %w",modelVersion.config.Version,version,ModelNotLatestVersionRejectErr) + return nil, nil, nil, fmt.Errorf("Model version is not match found %s but was trying to update %s. %w", modelVersion.config.Version, version, ModelNotLatestVersionRejectErr) } server, ok := m.store.servers[serverKey] if !ok { @@ -128,7 +133,7 @@ func (m *MemoryStore) getModelServer(modelKey string, version string, serverKey } func (m *MemoryStore) UpdateLoadedModels(modelKey string, version string, serverKey string, replicas []*ServerReplica) error { - logger := m.logger.WithField("func","UpdateLoadedModels") + logger := m.logger.WithField("func", "UpdateLoadedModels") m.mu.Lock() defer m.mu.Unlock() @@ -137,7 +142,7 @@ func (m *MemoryStore) UpdateLoadedModels(modelKey string, version string, server if err != nil { return err } - for _,replica := range replicas { + for _, replica := range replicas { _, ok := server.replicas[replica.GetReplicaIdx()] if !ok { return fmt.Errorf("Failed to reserve replica %d as it does not exist on server %s", replica.GetReplicaIdx(), serverKey) @@ -146,24 +151,24 @@ func (m *MemoryStore) UpdateLoadedModels(modelKey string, version string, server // Update updatedReplicas := make(map[int]bool) - for _,replica := range replicas { + for _, replica := range replicas { existingState := modelVersion.replicas[replica.GetReplicaIdx()] if !existingState.AlreadyLoadingOrLoaded() { - logger.Debugf("Setting model %s on server %s replica %d to LoadRequested",modelKey, serverKey, replica.GetReplicaIdx()) + logger.Debugf("Setting model %s on server %s replica %d to LoadRequested", modelKey, serverKey, replica.GetReplicaIdx()) modelVersion.replicas[replica.GetReplicaIdx()] = LoadRequested } else { - logger.Debugf("model %s on server %s replica %d already loaded",modelKey, serverKey, replica.GetReplicaIdx()) + logger.Debugf("model %s on server %s replica %d already loaded", modelKey, serverKey, replica.GetReplicaIdx()) } updatedReplicas[replica.GetReplicaIdx()] = true } - for replicaIdx,existingState := range modelVersion.replicas { - logger.Debugf("Looking at replicaidx %d with state %s but ignoring processed %v",replicaIdx, existingState.String(), updatedReplicas) - if _,ok := updatedReplicas[replicaIdx]; !ok { + for replicaIdx, existingState := range modelVersion.replicas { + logger.Debugf("Looking at replicaidx %d with state %s but ignoring processed %v", replicaIdx, existingState.String(), updatedReplicas) + if _, ok := updatedReplicas[replicaIdx]; !ok { if !existingState.AlreadyUnloadingOrUnloaded() { - logger.Debugf("Setting model %s on server %s replica %d to UnloadRequested",modelKey, serverKey, replicaIdx) + logger.Debugf("Setting model %s on server %s replica %d to UnloadRequested", modelKey, serverKey, replicaIdx) modelVersion.replicas[replicaIdx] = UnloadRequested } else { - logger.Debugf("model %s on server %s replica %d already unloaded",modelKey, serverKey, replicaIdx) + logger.Debugf("model %s on server %s replica %d already unloaded", modelKey, serverKey, replicaIdx) } } } @@ -172,7 +177,7 @@ func (m *MemoryStore) UpdateLoadedModels(modelKey string, version string, server } func (m *MemoryStore) UpdateModelState(modelKey string, version string, serverKey string, replicaIdx int, availableMemory *uint64, state ModelReplicaState) error { - logger := m.logger.WithField("func","UpdateModelState") + logger := m.logger.WithField("func", "UpdateModelState") m.mu.Lock() defer m.mu.Unlock() @@ -183,7 +188,7 @@ func (m *MemoryStore) UpdateModelState(modelKey string, version string, serverKe } modelVersion.replicas[replicaIdx] = state - logger.Debugf("Setting model %s version %s on server %s replica %d to %s",modelKey, version, serverKey, replicaIdx, state.String()) + logger.Debugf("Setting model %s version %s on server %s replica %d to %s", modelKey, version, serverKey, replicaIdx, state.String()) // Update models loaded onto replica if loaded or unloaded is state if state == Loaded || state == Unloaded { server, ok := m.store.servers[serverKey] @@ -193,7 +198,7 @@ func (m *MemoryStore) UpdateModelState(modelKey string, version string, serverKe if state == Loaded { replica.loadedModels[modelKey] = true } else { - delete(replica.loadedModels,modelKey) + delete(replica.loadedModels, modelKey) } } } @@ -218,7 +223,7 @@ func (m *MemoryStore) AddServerReplica(request *agent.AgentSubscribeRequest) err m.store.servers[request.ServerName] = server } loadedModels := make(map[string]bool) - for _,modelConfig := range request.LoadedModels { + for _, modelConfig := range request.LoadedModels { modelVersion, err := m.updateModelImpl(modelConfig, false) if err != nil { return err @@ -234,18 +239,18 @@ func (m *MemoryStore) AddServerReplica(request *agent.AgentSubscribeRequest) err func (m *MemoryStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { server, ok := m.store.servers[serverName] if !ok { - return nil, fmt.Errorf("Failed to find server %s",serverName) + return nil, fmt.Errorf("Failed to find server %s", serverName) } serverReplica, ok := server.replicas[replicaIdx] if !ok { - return nil, fmt.Errorf("Failed to find replica %d for server %s",replicaIdx,serverName) + return nil, fmt.Errorf("Failed to find replica %d for server %s", replicaIdx, serverName) } - delete(server.replicas,replicaIdx) + delete(server.replicas, replicaIdx) if len(server.replicas) == 0 { delete(m.store.servers, serverName) } var modelNames []string - for modelName,_ := range serverReplica.loadedModels { + for modelName := range serverReplica.loadedModels { model, ok := m.store.models[modelName] if ok { latestVersion := model.Latest() @@ -254,7 +259,7 @@ func (m *MemoryStore) RemoveServerReplica(serverName string, replicaIdx int) ([] delete(latestVersion.replicas, replicaIdx) } } - modelNames = append(modelNames,modelName) + modelNames = append(modelNames, modelName) } return modelNames, nil } diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 6d4832d723..d968d4fb22 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -2,10 +2,11 @@ package store import ( "errors" + "testing" + . "github.com/onsi/gomega" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" log "github.com/sirupsen/logrus" - "testing" ) func TestUpdateModel(t *testing.T) { @@ -13,47 +14,44 @@ func TestUpdateModel(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string - store *LocalSchedulerStore + name string + store *LocalSchedulerStore config *pb.ModelDetails - addAsLatest bool - err error + err error } tests := []test{ { - name: "simple", - store: NewLocalSchedulerStore(), + name: "simple", + store: NewLocalSchedulerStore(), config: &pb.ModelDetails{Name: "model", Version: "1"}, - err: nil, + err: nil, }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1":&ModelVersion{ - config: &pb.ModelDetails{Name: "model", Version: "1"}, + store: &LocalSchedulerStore{models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": &ModelVersion{ + config: &pb.ModelDetails{Name: "model", Version: "1"}, }}, - versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, + versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, }}}, config: &pb.ModelDetails{Name: "model", Version: "1"}, - addAsLatest: true, - err: ModelVersionExistsErr, + err: ModelVersionExistsErr, }, { name: "VersionAdded", - store: &LocalSchedulerStore{models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1":&ModelVersion{ - config: &pb.ModelDetails{Name: "model", Version: "1"}, + store: &LocalSchedulerStore{models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": &ModelVersion{ + config: &pb.ModelDetails{Name: "model", Version: "1"}, }}, - versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, + versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, }}}, config: &pb.ModelDetails{Name: "model", Version: "2"}, - addAsLatest: true, - err: nil, + err: nil, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { ms := NewMemoryStore(logger, test.store) err := ms.UpdateModel(test.config) @@ -68,42 +66,41 @@ func TestUpdateModel(t *testing.T) { } } - func TestGetModel(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) type test struct { - name string - store *LocalSchedulerStore - key string + name string + store *LocalSchedulerStore + key string versions int - err error + err error } tests := []test{ { - name: "NoModel", - store: NewLocalSchedulerStore(), - key: "model", + name: "NoModel", + store: NewLocalSchedulerStore(), + key: "model", versions: 0, - err: nil, + err: nil, }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1":&ModelVersion{ - config: &pb.ModelDetails{Name: "model", Version: "1"}, + store: &LocalSchedulerStore{models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": &ModelVersion{ + config: &pb.ModelDetails{Name: "model", Version: "1"}, }}, - versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, + versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, }}}, - key: "model", + key: "model", versions: 1, - err: nil, + err: nil, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { ms := NewMemoryStore(logger, test.store) model, err := ms.GetModel(test.key) @@ -123,33 +120,33 @@ func TestRemoveModel(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string + name string store *LocalSchedulerStore - key string - err error + key string + err error } tests := []test{ { - name: "NoModel", + name: "NoModel", store: NewLocalSchedulerStore(), - key: "model", - err: nil, + key: "model", + err: nil, }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1":&ModelVersion{ - config: &pb.ModelDetails{Name: "model", Version: "1"}, + store: &LocalSchedulerStore{models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": &ModelVersion{ + config: &pb.ModelDetails{Name: "model", Version: "1"}, }}, - versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, + versions: []*ModelVersion{&ModelVersion{config: &pb.ModelDetails{Name: "model", Version: "1"}}}, }}}, key: "model", err: nil, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { ms := NewMemoryStore(logger, test.store) err := ms.RemoveModel(test.key) @@ -162,39 +159,37 @@ func TestRemoveModel(t *testing.T) { } } - func TestUpdateLoadedModels(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) type test struct { - name string - store *LocalSchedulerStore - modelKey string - version string - serverKey string - replicas []*ServerReplica + name string + store *LocalSchedulerStore + modelKey string + version string + serverKey string + replicas []*ServerReplica expectedStates map[int]ModelReplicaState - err error + err error } tests := []test{ { name: "ModelVersionNotLatest", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, - replicas: map[int]ModelReplicaState{}, - }}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, + replicas: map[int]ModelReplicaState{}, + }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }, { - config: &pb.ModelDetails{Name: "model", Version: "2"}, + config: &pb.ModelDetails{Name: "model", Version: "2"}, replicas: map[int]ModelReplicaState{}, }, }, @@ -205,24 +200,23 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", + modelKey: "model", + version: "1", serverKey: "server", - replicas: nil, - err: ModelNotLatestVersionRejectErr, + replicas: nil, + err: ModelNotLatestVersionRejectErr, }, { name: "UpdatedVersionsOK", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }, }, @@ -237,27 +231,26 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", + modelKey: "model", + version: "1", serverKey: "server", replicas: []*ServerReplica{ - {replicaIdx: 0},{replicaIdx: 1}, + {replicaIdx: 0}, {replicaIdx: 1}, }, - expectedStates: map[int]ModelReplicaState{0:LoadRequested,1:LoadRequested}, - err: nil, + expectedStates: map[int]ModelReplicaState{0: LoadRequested, 1: LoadRequested}, + err: nil, }, { name: "WithAlreadyLoadedModels", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{ 0: Loaded, }, @@ -274,27 +267,26 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", + modelKey: "model", + version: "1", serverKey: "server", replicas: []*ServerReplica{ - {replicaIdx: 0},{replicaIdx: 1}, + {replicaIdx: 0}, {replicaIdx: 1}, }, - expectedStates: map[int]ModelReplicaState{0:Loaded,1:LoadRequested}, - err: nil, + expectedStates: map[int]ModelReplicaState{0: Loaded, 1: LoadRequested}, + err: nil, }, { name: "UnloadModelsNotSelected", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{ 0: Loaded, }, @@ -311,28 +303,27 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", + modelKey: "model", + version: "1", serverKey: "server", replicas: []*ServerReplica{ {replicaIdx: 1}, }, - expectedStates: map[int]ModelReplicaState{0:UnloadRequested,1:LoadRequested}, - err: nil, + expectedStates: map[int]ModelReplicaState{0: UnloadRequested, 1: LoadRequested}, + err: nil, }, { name: "DeletedModel", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, - replicas: map[int]ModelReplicaState{0:Loaded}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, + replicas: map[int]ModelReplicaState{0: Loaded}, }, }, deleted: true, @@ -347,26 +338,25 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", - serverKey: "server", - replicas: []*ServerReplica{}, - expectedStates: map[int]ModelReplicaState{0:UnloadRequested}, - err: nil, + modelKey: "model", + version: "1", + serverKey: "server", + replicas: []*ServerReplica{}, + expectedStates: map[int]ModelReplicaState{0: UnloadRequested}, + err: nil, }, { name: "DeletedModelNoReplicas", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, - replicas: map[int]ModelReplicaState{0:Unloaded}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, + replicas: map[int]ModelReplicaState{0: Unloaded}, }, }, deleted: true, @@ -381,22 +371,22 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", - version: "1", - serverKey: "server", - replicas: []*ServerReplica{}, - expectedStates: map[int]ModelReplicaState{0:Unloaded}, - err: nil, + modelKey: "model", + version: "1", + serverKey: "server", + replicas: []*ServerReplica{}, + expectedStates: map[int]ModelReplicaState{0: Unloaded}, + err: nil, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { ms := NewMemoryStore(logger, test.store) err := ms.UpdateLoadedModels(test.modelKey, test.version, test.serverKey, test.replicas) if test.err == nil { g.Expect(err).To(BeNil()) - for replicaIdx,state := range test.expectedStates { + for replicaIdx, state := range test.expectedStates { g.Expect(test.store.models[test.modelKey].Latest().GetModelReplicaState(replicaIdx)).To(Equal(state)) } } else { @@ -407,38 +397,36 @@ func TestUpdateLoadedModels(t *testing.T) { } } - func TestUpdateModelState(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) type test struct { - name string - store *LocalSchedulerStore - modelKey string - version string - serverKey string - replicaIdx int - state ModelReplicaState + name string + store *LocalSchedulerStore + modelKey string + version string + serverKey string + replicaIdx int + state ModelReplicaState availableMemory uint64 - loaded bool - deleted bool - err error + loaded bool + deleted bool + err error } tests := []test{ { name: "LoadedModel", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }, }, @@ -447,33 +435,32 @@ func TestUpdateModelState(t *testing.T) { "server": { name: "server", replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[string]bool {}}, - 1: {loadedModels: map[string]bool {}}, + 0: {loadedModels: map[string]bool{}}, + 1: {loadedModels: map[string]bool{}}, }, }, }, }, - modelKey: "model", - version: "1", - serverKey: "server", - replicaIdx: 0, - state: Loaded, - loaded: true, + modelKey: "model", + version: "1", + serverKey: "server", + replicaIdx: 0, + state: Loaded, + loaded: true, availableMemory: 20, - err: nil, + err: nil, }, { name: "UnloadedModel", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }, }, @@ -482,33 +469,32 @@ func TestUpdateModelState(t *testing.T) { "server": { name: "server", replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[string]bool {}}, - 1: {loadedModels: map[string]bool {}}, + 0: {loadedModels: map[string]bool{}}, + 1: {loadedModels: map[string]bool{}}, }, }, }, }, - modelKey: "model", - version: "1", - serverKey: "server", - replicaIdx: 0, - state: Unloaded, - loaded: false, + modelKey: "model", + version: "1", + serverKey: "server", + replicaIdx: 0, + state: Unloaded, + loaded: false, availableMemory: 20, - err: nil, + err: nil, }, { name: "DeletedModel", store: &LocalSchedulerStore{ - models: map[string]*Model{"model":&Model{ - versionMap: map[string]*ModelVersion{"1": - { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + models: map[string]*Model{"model": &Model{ + versionMap: map[string]*ModelVersion{"1": { + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }}, versions: []*ModelVersion{ { - config: &pb.ModelDetails{Name: "model", Version: "1"}, + config: &pb.ModelDetails{Name: "model", Version: "1"}, replicas: map[int]ModelReplicaState{}, }, }, @@ -518,25 +504,25 @@ func TestUpdateModelState(t *testing.T) { "server": { name: "server", replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[string]bool {}}, - 1: {loadedModels: map[string]bool {}}, + 0: {loadedModels: map[string]bool{}}, + 1: {loadedModels: map[string]bool{}}, }, }, }, }, - modelKey: "model", - version: "1", - serverKey: "server", - replicaIdx: 0, - state: Unloaded, - loaded: false, + modelKey: "model", + version: "1", + serverKey: "server", + replicaIdx: 0, + state: Unloaded, + loaded: false, availableMemory: 20, - deleted: true, - err: nil, + deleted: true, + err: nil, }, } - for _,test := range tests { + for _, test := range tests { t.Run(test.name, func(t *testing.T) { ms := NewMemoryStore(logger, test.store) err := ms.UpdateModelState(test.modelKey, test.version, test.serverKey, test.replicaIdx, &test.availableMemory, test.state) @@ -556,4 +542,3 @@ func TestUpdateModelState(t *testing.T) { }) } } - diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index ab27d69b75..262fd5a9cc 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -7,8 +7,8 @@ import ( ) type LocalSchedulerStore struct { - servers map[string]*Server - models map[string]*Model + servers map[string]*Server + models map[string]*Model failedToScheduleModels map[string]bool } @@ -22,8 +22,8 @@ func NewLocalSchedulerStore() *LocalSchedulerStore { type Model struct { versionMap map[string]*ModelVersion - versions []*ModelVersion - deleted bool + versions []*ModelVersion + deleted bool } func NewModel() *Model { @@ -33,19 +33,19 @@ func NewModel() *Model { } type ModelVersion struct { - config *pb.ModelDetails - server string + config *pb.ModelDetails + server string replicas map[int]ModelReplicaState - deleted bool - state ModelState + deleted bool + state ModelState } func NewDefaultModelVersion(config *pb.ModelDetails) *ModelVersion { return &ModelVersion{ - config: config, + config: config, replicas: make(map[int]ModelReplicaState), - deleted: false, - state: ModelStateUnknown, + deleted: false, + state: ModelStateUnknown, } } @@ -55,39 +55,39 @@ func NewModelVersion(config *pb.ModelDetails, deleted bool, state ModelState) *ModelVersion { return &ModelVersion{ - config: config, - server: server, + config: config, + server: server, replicas: replicas, - deleted: deleted, - state: state, + deleted: deleted, + state: state, } } type Server struct { - name string + name string replicas map[int]*ServerReplica - shared bool + shared bool } func NewServer(name string, shared bool) *Server { return &Server{ - name: name, + name: name, replicas: make(map[int]*ServerReplica), - shared: shared, + shared: shared, } } type ServerReplica struct { - inferenceSvc string + inferenceSvc string inferenceHttpPort int32 inferenceGrpcPort int32 - replicaIdx int - server *Server - capabilities []string - memory uint64 - availableMemory uint64 - loadedModels map[string]bool - overCommit bool + replicaIdx int + server *Server + capabilities []string + memory uint64 + availableMemory uint64 + loadedModels map[string]bool + overCommit bool } func NewServerReplica(inferenceSvc string, @@ -101,31 +101,31 @@ func NewServerReplica(inferenceSvc string, loadedModels map[string]bool, overCommit bool) *ServerReplica { return &ServerReplica{ - inferenceSvc: inferenceSvc, + inferenceSvc: inferenceSvc, inferenceHttpPort: inferenceHttpPort, inferenceGrpcPort: inferenceGrpcPort, - replicaIdx: replicaIdx, - server: server, - capabilities: capabilities, - memory: memory, - availableMemory: availableMemory, - loadedModels: loadedModels, - overCommit: overCommit, + replicaIdx: replicaIdx, + server: server, + capabilities: capabilities, + memory: memory, + availableMemory: availableMemory, + loadedModels: loadedModels, + overCommit: overCommit, } } -func NewServerReplicaFromConfig (server *Server, replicaIdx int, loadedModels map[string]bool, config *pba.ReplicaConfig) *ServerReplica { +func NewServerReplicaFromConfig(server *Server, replicaIdx int, loadedModels map[string]bool, config *pba.ReplicaConfig) *ServerReplica { return &ServerReplica{ - inferenceSvc: config.GetInferenceSvc(), + inferenceSvc: config.GetInferenceSvc(), inferenceHttpPort: config.GetInferenceHttpPort(), inferenceGrpcPort: config.GetInferenceGrpcPort(), - replicaIdx: replicaIdx, - server: server, - capabilities: config.GetCapabilities(), - memory: config.GetMemoryBytes(), - availableMemory: config.GetAvailableMemoryBytes(), - loadedModels: loadedModels, - overCommit: config.GetOverCommit(), + replicaIdx: replicaIdx, + server: server, + capabilities: config.GetCapabilities(), + memory: config.GetMemoryBytes(), + availableMemory: config.GetAvailableMemoryBytes(), + loadedModels: loadedModels, + overCommit: config.GetOverCommit(), } } @@ -161,14 +161,14 @@ func (m *Model) HasLatest() bool { func (m *Model) Latest() *ModelVersion { if len(m.versions) > 0 { - return m.versions[len(m.versions) - 1] + return m.versions[len(m.versions)-1] } else { return nil } } func (m *Model) Inactive() bool { - for _,mv := range m.versions { + for _, mv := range m.versions { if !mv.Inactive() { return false } @@ -206,13 +206,12 @@ func (m *ModelVersion) Server() string { func (m *ModelVersion) ReplicaState() map[int32]string { replicaState := make(map[int32]string) - for k,v:= range m.replicas { + for k, v := range m.replicas { replicaState[int32(k)] = v.String() } return replicaState } - func (m *ModelVersion) GetModelReplicaState(replicaIdx int) ModelReplicaState { state, ok := m.replicas[replicaIdx] if !ok { @@ -240,7 +239,7 @@ func (m *ModelVersion) HasServer() bool { } func (m *ModelVersion) Inactive() bool { - for _,v := range m.replicas { + for _, v := range m.replicas { if !(v == Unloaded || v == UnloadFailed || v == ModelReplicaStateUnknown) { return false } @@ -249,7 +248,7 @@ func (m *ModelVersion) Inactive() bool { } func (m *ModelVersion) IsLoading(replicaIdx int) bool { - for r,v := range m.replicas { + for r, v := range m.replicas { if r == replicaIdx && (v == Loaded || v == LoadRequested || v == Loading) { return true } @@ -258,19 +257,19 @@ func (m *ModelVersion) IsLoading(replicaIdx int) bool { } func (m *ModelVersion) NoLiveReplica() bool { - for _,v := range m.replicas { + for _, v := range m.replicas { if !v.NoEndpoint() { return false } } - return true; + return true } func (m *ModelVersion) GetAssignment() []int { var assignment []int - for k,v := range m.replicas { + for k, v := range m.replicas { if v == Loaded { - assignment = append(assignment,k) + assignment = append(assignment, k) } } return assignment @@ -314,8 +313,6 @@ func (s *Server) GetReplicaInferenceHttpPort(idx int) int32 { return s.replicas[idx].inferenceHttpPort } - - func (s *ServerReplica) GetLoadedModels() []string { var models []string for model := range s.loadedModels { @@ -388,7 +385,3 @@ func (m ModelReplicaState) IsLoadingState() bool { return false } } - - - - diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/store.go index e924f8e4c3..ae6d372923 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/store.go @@ -2,20 +2,21 @@ package store import ( "errors" + pba "github.com/seldonio/seldon-core/scheduler/apis/mlops/agent" pb "github.com/seldonio/seldon-core/scheduler/apis/mlops/scheduler" ) type ServerSnapshot struct { - Name string + Name string Replicas map[int]*ServerReplica - Shared bool + Shared bool } type ModelSnapshot struct { - Name string + Name string Versions []*ModelVersion - Deleted bool + Deleted bool } func (m *ModelSnapshot) GetLatest() *ModelVersion { @@ -28,7 +29,7 @@ func (m *ModelSnapshot) GetLatest() *ModelVersion { } var ( - ModelVersionExistsErr = errors.New("model version already exists") + ModelVersionExistsErr = errors.New("model version already exists") ModelNotLatestVersionRejectErr = errors.New("Model version is not latest. Rejecting update.") )