diff --git a/.gitignore b/.gitignore index 9eedfe1..f8f7a55 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -gen \ No newline at end of file +gen +.idea \ No newline at end of file diff --git a/caller/caller.go b/caller/caller.go index bab4347..517a0c2 100644 --- a/caller/caller.go +++ b/caller/caller.go @@ -77,7 +77,16 @@ func initLogger() { // 设置日志输出格式为JSON Logger.SetFormatter(&LogFormatter{}) // 设置日志级别为Info - Logger.SetLevel(logrus.InfoLevel) + switch ConfigValue.LogConfig.Level { + case "info": + Logger.SetLevel(logrus.InfoLevel) + case "debug": + Logger.SetLevel(logrus.DebugLevel) + case "trace": + Logger.SetLevel(logrus.TraceLevel) + default: + Logger.SetLevel(logrus.InfoLevel) + } // 创建一个 lumberjack.Logger,用于日志轮转配置 logFile := &lumberjack.Logger{ diff --git a/config/config.yaml b/config/config.yaml index f924c2b..14f2833 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,3 +1,6 @@ +log: + level: "info" + # slurm 数据库配置 mysql: host: 127.0.0.1 diff --git a/services/account/account.go b/services/account/account.go index 0300ad5..8dad11c 100644 --- a/services/account/account.go +++ b/services/account/account.go @@ -37,6 +37,7 @@ func (s *ServerAccount) ListAccounts(ctx context.Context, in *pb.ListAccountsReq } st := status.New(codes.Internal, "The username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ListAccounts failed: %v", st.Err()) return nil, st.Err() } // 获取集群名 @@ -52,6 +53,7 @@ func (s *ServerAccount) ListAccounts(ctx context.Context, in *pb.ListAccountsReq message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ListAccounts failed: %v", st.Err()) return nil, st.Err() } // 查询用户相关联的所有账户信息 @@ -63,6 +65,7 @@ func (s *ServerAccount) ListAccounts(ctx context.Context, in *pb.ListAccountsReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ListAccounts 
failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -74,6 +77,7 @@ func (s *ServerAccount) ListAccounts(ctx context.Context, in *pb.ListAccountsReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ListAccounts failed: %v", st.Err()) return nil, st.Err() } acctList = append(acctList, assocAcct) @@ -85,8 +89,10 @@ func (s *ServerAccount) ListAccounts(ctx context.Context, in *pb.ListAccountsReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ListAccounts failed: %v", st.Err()) return nil, st.Err() } + caller.Logger.Tracef("ListAccounts Response: %v", &pb.ListAccountsResponse{Accounts: acctList}) return &pb.ListAccountsResponse{Accounts: acctList}, nil } @@ -106,6 +112,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } // 获取系统中默认的Qos信息 @@ -121,6 +128,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } // 获取系统中Qos @@ -132,6 +140,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -143,6 +152,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } qosList = append(qosList, 
qosName) @@ -155,6 +165,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } baseQos := strings.Join(qosList, ",") @@ -166,6 +177,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } for _, p := range partitions { @@ -178,6 +190,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } retcode02 := utils.ExecuteShellCommand(modifyUserCmd) @@ -187,9 +200,11 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } } + caller.Logger.Infof("CreateAccount sucess! 
account is: %v, owerUserId is: %v", in.AccountName, in.OwnerUserId) return &pb.CreateAccountResponse{}, nil } errInfo := &errdetails.ErrorInfo{ @@ -198,6 +213,7 @@ func (s *ServerAccount) CreateAccount(ctx context.Context, in *pb.CreateAccountR message := fmt.Sprintf("The %s is already exists.", in.AccountName) st := status.New(codes.AlreadyExists, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CreateAccount failed: %v", st.Err()) return nil, st.Err() } @@ -219,6 +235,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, "The account contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } @@ -233,6 +250,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } // 获取系统中计算分区信息 @@ -243,6 +261,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } // 获取计算分区AllowAccounts的值 @@ -256,6 +275,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } if output == "ALL" { @@ -267,6 +287,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, 
st.Err() } defer rows.Close() @@ -278,6 +299,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } acctList = append(acctList, assocAcctName) @@ -289,6 +311,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } allowAcct := strings.Join(acctList, ",") @@ -301,6 +324,7 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } } @@ -324,9 +348,11 @@ func (s *ServerAccount) BlockAccount(ctx context.Context, in *pb.BlockAccountReq } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockAccount failed: %v", st.Err()) return nil, st.Err() } } + caller.Logger.Infof("BlockAccount sucess! 
account is: %v", in.AccountName) return &pb.BlockAccountResponse{}, nil } @@ -346,6 +372,7 @@ func (s *ServerAccount) UnblockAccount(ctx context.Context, in *pb.UnblockAccoun } st := status.New(codes.Internal, "The account contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockAccount failed: %v", st.Err()) return nil, st.Err() } // 检查账户名是否在slurm中 @@ -358,6 +385,7 @@ func (s *ServerAccount) UnblockAccount(ctx context.Context, in *pb.UnblockAccoun message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockAccount failed: %v", st.Err()) return nil, st.Err() } // 获取系统中计算分区信息 @@ -368,6 +396,7 @@ func (s *ServerAccount) UnblockAccount(ctx context.Context, in *pb.UnblockAccoun } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockAccount failed: %v", st.Err()) return nil, st.Err() } getAllowAcctCmd := fmt.Sprintf("scontrol show partition %s | grep AllowAccounts | awk '{print $2}' | awk -F '=' '{print $2}'", partitions[0]) @@ -378,9 +407,11 @@ func (s *ServerAccount) UnblockAccount(ctx context.Context, in *pb.UnblockAccoun } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockAccount failed: %v", st.Err()) return nil, st.Err() } if output == "ALL" { + caller.Logger.Infof("Accout %v is Unblocked!", in.AccountName) return &pb.UnblockAccountResponse{}, nil } AllowAcctList := strings.Split(output, ",") @@ -397,9 +428,11 @@ func (s *ServerAccount) UnblockAccount(ctx context.Context, in *pb.UnblockAccoun } st := status.New(codes.Internal, "Exec command failed.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockAccount failed: %v", st.Err()) return nil, st.Err() } } + caller.Logger.Infof("Accout %v Unblocked sucess!", in.AccountName) return 
&pb.UnblockAccountResponse{}, nil } return &pb.UnblockAccountResponse{}, nil @@ -427,6 +460,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -438,6 +472,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } acctList = append(acctList, acctName) @@ -449,6 +484,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } @@ -460,6 +496,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } getAllowAcctCmd := fmt.Sprintf("scontrol show partition %s | grep AllowAccounts | awk '{print $2}' | awk -F '=' '{print $2}'", partitions[0]) @@ -470,6 +507,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } // 获取和每个账户关联的用户的信息 @@ -483,6 +521,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, 
st.Err() } defer rows.Close() @@ -511,6 +550,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAllAccountsWithUsers failed: %v", st.Err()) return nil, st.Err() } if output == "ALL" { @@ -537,6 +577,7 @@ func (s *ServerAccount) GetAllAccountsWithUsers(ctx context.Context, in *pb.GetA } } } + caller.Logger.Tracef("GetAllAccountsWithUsers: %v", acctInfo) return &pb.GetAllAccountsWithUsersResponse{Accounts: acctInfo}, nil } @@ -554,6 +595,7 @@ func (s *ServerAccount) QueryAccountBlockStatus(ctx context.Context, in *pb.Quer } st := status.New(codes.Internal, "The account contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } // 检查账户名是否在slurm中 @@ -566,6 +608,7 @@ func (s *ServerAccount) QueryAccountBlockStatus(ctx context.Context, in *pb.Quer message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } // 获取系统中计算分区信息 @@ -576,6 +619,7 @@ func (s *ServerAccount) QueryAccountBlockStatus(ctx context.Context, in *pb.Quer } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } // 获取系统中分区AllowAccounts信息 @@ -587,6 +631,7 @@ func (s *ServerAccount) QueryAccountBlockStatus(ctx context.Context, in *pb.Quer } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } if output == "ALL" { @@ -595,8 +640,10 @@ func (s *ServerAccount) QueryAccountBlockStatus(ctx context.Context, in *pb.Quer 
acctList := strings.Split(output, ",") index := arrays.ContainsString(acctList, in.AccountName) if index == -1 { + caller.Logger.Infof("Account %v is Blocked", in.AccountName) return &pb.QueryAccountBlockStatusResponse{Blocked: true}, nil } + caller.Logger.Infof("Account %v is Unblocked", in.AccountName) return &pb.QueryAccountBlockStatusResponse{Blocked: false}, nil } @@ -615,6 +662,7 @@ func (s *ServerAccount) DeleteAccount(ctx context.Context, in *pb.DeleteAccountR message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteAccount failed: %v", st.Err()) return nil, st.Err() } // 作业的判断 @@ -626,6 +674,7 @@ func (s *ServerAccount) DeleteAccount(ctx context.Context, in *pb.DeleteAccountR } st := status.New(codes.NotFound, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteAccount failed: %v", st.Err()) return nil, st.Err() } if len(runningJobInfo) == 0 { @@ -640,6 +689,7 @@ func (s *ServerAccount) DeleteAccount(ctx context.Context, in *pb.DeleteAccountR } st := status.New(codes.NotFound, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteAccount failed: %v", st.Err()) return nil, st.Err() } return &pb.DeleteAccountResponse{}, nil @@ -650,6 +700,7 @@ func (s *ServerAccount) DeleteAccount(ctx context.Context, in *pb.DeleteAccountR } st := status.New(codes.NotFound, "Exist running jobs.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteAccount failed: %v", st.Err()) return nil, st.Err() } } diff --git a/services/config/config.go b/services/config/config.go index 90f631c..ccc0820 100644 --- a/services/config/config.go +++ b/services/config/config.go @@ -40,6 +40,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig 
failed: %v", st.Err()) return nil, st.Err() } // 查系统中的所有qos @@ -51,6 +52,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -62,6 +64,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } qosList = append(qosList, qosName) @@ -73,6 +76,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } // fmt.Println(caller.ConfigValue.PartitionDesc, 112222) @@ -108,6 +112,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalMemsTmp, err = utils.RunCommand(totalMemsCmd) @@ -117,6 +122,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalNodes, err = utils.RunCommand(totalNodesCmd) @@ -126,6 +132,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } @@ -141,6 +148,7 @@ func (s 
*ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalMemsTmp, err = utils.RunCommand(totalMemsCmd) @@ -150,6 +158,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalNodes, err = utils.RunCommand(totalNodesCmd) @@ -159,6 +168,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } } @@ -188,6 +198,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld dwon.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalCpuInt, _ = strconv.Atoi(totalCpus) @@ -199,6 +210,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } totalNodeNumInt, _ = strconv.Atoi(totalNodes) @@ -212,6 +224,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } nodeArray := strings.Split(nodeOutput, ",") @@ -225,6 
+238,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } nodeName := strings.Join(strings.Split(nodeNameOutput, " "), "") @@ -236,6 +250,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } @@ -250,6 +265,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } @@ -262,6 +278,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } @@ -274,6 +291,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } nodeArray := strings.Split(nodeOutput, ",") @@ -288,6 +306,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } nodeName := strings.Join(strings.Split(nodeNameOutput, " "), "") @@ -299,6 +318,7 @@ func (s *ServerConfig) 
GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } if gpusOutput == "Gres=(null)" { @@ -317,6 +337,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } if gpusOutput == "Gres=(null)" { @@ -334,6 +355,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } qosArray := strings.Split(qosOutput, "=") @@ -348,6 +370,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterConfig failed: %v", st.Err()) return nil, st.Err() } @@ -377,6 +400,7 @@ func (s *ServerConfig) GetClusterConfig(ctx context.Context, in *pb.GetClusterCo Comment: &comment, }) } + caller.Logger.Tracef("GetClusterConfig: %v", &pb.GetClusterConfigResponse{Partitions: parts, SchedulerName: "slurm"}) return &pb.GetClusterConfigResponse{Partitions: parts, SchedulerName: "slurm"}, nil } @@ -401,6 +425,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "The username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } // 获取集群名 @@ -416,6 +441,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva 
message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } // 判断用户是否存在 @@ -428,6 +454,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } // 检查账户和用户之间是否存在关联关系 @@ -440,6 +467,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva message := fmt.Sprintf("%s and %s assocation is not exists!", in.UserId, in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } // 查系统中的所有qos @@ -451,6 +479,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -462,6 +491,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } qosList = append(qosList, qosName) @@ -473,6 +503,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } // 关联关系存在的情况下去找用户 @@ -483,6 +514,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec 
command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } for _, partition := range partitions { @@ -500,6 +532,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } index := arrays.Contains(strings.Split(accouts, ","), in.AccountName) @@ -512,6 +545,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } totalCpuInt, _ = strconv.Atoi(totalCpus) @@ -523,6 +557,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } totalNodeNumInt, _ = strconv.Atoi(totalNodes) @@ -535,6 +570,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } if output != "" { @@ -547,6 +583,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } @@ -571,6 +608,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } 
st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeArray := strings.Split(nodeOutput, ",") @@ -584,6 +622,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeName := strings.Join(strings.Split(nodeNameOutput, " "), "") @@ -595,6 +634,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeMem, _ := strconv.Atoi(memOutput) @@ -608,6 +648,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeMem, _ := strconv.Atoi(memOutput) @@ -623,6 +664,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeArray := strings.Split(nodeOutput, ",") @@ -637,6 +679,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } nodeName := strings.Join(strings.Split(nodeNameOutput, 
" "), "") @@ -648,6 +691,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } if gpusOutput == "Gres=(null)" { @@ -666,6 +710,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } if gpusOutput == "Gres=(null)" { @@ -683,6 +728,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } qosArray := strings.Split(qosOutput, "=") @@ -697,6 +743,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetAvailablePartitions failed: %v", st.Err()) return nil, st.Err() } @@ -729,6 +776,7 @@ func (s *ServerConfig) GetAvailablePartitions(ctx context.Context, in *pb.GetAva continue } } + caller.Logger.Tracef("GetAvailablePartitions: %v", &pb.GetAvailablePartitionsResponse{Partitions: parts}) return &pb.GetAvailablePartitionsResponse{Partitions: parts}, nil } @@ -842,6 +890,7 @@ func (s *ServerConfig) GetClusterNodesInfo(ctx context.Context, in *pb.GetCluste } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterNodesInfo failed: %v", st.Err()) return nil, st.Err() } // 按行分割输出 @@ -851,7 +900,7 @@ func (s *ServerConfig) GetClusterNodesInfo(ctx 
context.Context, in *pb.GetCluste nodeInfo := extractNodeInfo(line) nodesInfo = append(nodesInfo, nodeInfo) } - caller.Logger.Infof("GetClusterNodesInfoResponse: %v", nodesInfo) + caller.Logger.Tracef("GetClusterNodesInfoResponse: %v", nodesInfo) return &pb.GetClusterNodesInfoResponse{Nodes: nodesInfo}, nil } @@ -876,11 +925,12 @@ func (s *ServerConfig) GetClusterNodesInfo(ctx context.Context, in *pb.GetCluste select { case err := <-errChan: if err != nil { + caller.Logger.Errorf("GetClusterNodesInfo failed: %v", err) return nil, err } default: } - caller.Logger.Infof("GetClusterNodesInfoResponse: %v", nodesInfo) + caller.Logger.Tracef("GetClusterNodesInfoResponse: %v", nodesInfo) return &pb.GetClusterNodesInfoResponse{Nodes: nodesInfo}, nil } @@ -898,6 +948,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } for _, v := range partitions { @@ -927,6 +978,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } @@ -982,6 +1034,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } pdJobNum, _ = strconv.Atoi(pdresult) @@ -993,6 +1046,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, 
st.Err() } runningJobNum, _ = strconv.Atoi(runningresult) @@ -1051,6 +1105,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } pdJobNum, _ = strconv.Atoi(pdresult) @@ -1062,6 +1117,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } runningJobNum, _ = strconv.Atoi(runningResult) @@ -1076,6 +1132,7 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetClusterInfo failed: %v", st.Err()) return nil, st.Err() } runningGpus, _ = strconv.Atoi(useGpuCardResult) @@ -1131,5 +1188,6 @@ func (s *ServerConfig) GetClusterInfo(ctx context.Context, in *pb.GetClusterInfo } } } + caller.Logger.Tracef("GetClusterInfo: %v", &pb.GetClusterInfoResponse{ClusterName: clusterName, Partitions: parts}) return &pb.GetClusterInfoResponse{ClusterName: clusterName, Partitions: parts}, nil } diff --git a/services/job/job.go b/services/job/job.go index 33f23f6..56645f1 100644 --- a/services/job/job.go +++ b/services/job/job.go @@ -38,6 +38,7 @@ func (s *ServerJob) CancelJob(ctx context.Context, in *pb.CancelJobRequest) (*pb } st := status.New(codes.Internal, "The username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CancelJob failed: %v", st.Err()) return nil, st.Err() } // 判断用户是否存在 @@ -50,6 +51,7 @@ func (s *ServerJob) CancelJob(ctx context.Context, in *pb.CancelJobRequest) (*pb message := fmt.Sprintf("%s does not exists.", in.UserId) st := 
status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CancelJob failed: %v", st.Err()) return nil, st.Err() } // 从squeue来获取对应的作业信息 @@ -61,6 +63,7 @@ func (s *ServerJob) CancelJob(ctx context.Context, in *pb.CancelJobRequest) (*pb } st := status.New(codes.NotFound, "The job does not exist.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CancelJob failed: %v", st.Err()) return nil, st.Err() } // 取消作业 @@ -71,6 +74,7 @@ func (s *ServerJob) CancelJob(ctx context.Context, in *pb.CancelJobRequest) (*pb } st := status.New(codes.Unknown, response) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("CancelJob failed: %v", st.Err()) return nil, st.Err() } return &pb.CancelJobResponse{}, nil @@ -91,6 +95,7 @@ func (s *ServerJob) QueryJobTimeLimit(ctx context.Context, in *pb.QueryJobTimeLi } st := status.New(codes.NotFound, "The job does not exist.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryJobTimeLimit failed: %v", st.Err()) return nil, st.Err() } return &pb.QueryJobTimeLimitResponse{TimeLimitMinutes: timeLimit}, nil @@ -108,6 +113,7 @@ func (s *ServerJob) ChangeJobTimeLimit(ctx context.Context, in *pb.ChangeJobTime } st := status.New(codes.NotFound, "The job does not exist.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ChangeJobTimeLimit failed: %v", st.Err()) return nil, st.Err() } if in.DeltaMinutes >= 0 { @@ -119,6 +125,7 @@ func (s *ServerJob) ChangeJobTimeLimit(ctx context.Context, in *pb.ChangeJobTime } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ChangeJobTimeLimit failed: %v", st.Err()) return nil, st.Err() } } else { @@ -131,6 +138,7 @@ func (s *ServerJob) ChangeJobTimeLimit(ctx context.Context, in *pb.ChangeJobTime } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("ChangeJobTimeLimit failed: %v", st.Err()) return nil, 
st.Err() } } @@ -187,6 +195,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.NotFound, "The job does not exist.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } // 查询cputresId、memTresId、nodeTresId值 @@ -218,6 +227,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } @@ -230,6 +240,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -241,6 +252,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } gpuIdList = append(gpuIdList, gpuId) @@ -252,6 +264,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } @@ -265,6 +278,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* } st := status.New(codes.Internal, "Exec command failed or slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("Failed get job by id, error is: %v", st.Err()) return nil, st.Err() } reason = output @@ -351,6 +365,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* ElapsedSeconds: &elapsedSeconds, GpusAlloc: 
&gpusAlloc, } + caller.Logger.Infof("GetJobByIdResponse: %v", jobInfo) return &pb.GetJobByIdResponse{Job: jobInfo}, nil } else { jobInfo := &pb.JobInfo{} @@ -406,6 +421,7 @@ func (s *ServerJob) GetJobById(ctx context.Context, in *pb.GetJobByIdRequest) (* jobInfo.EndTime = endTimeTimestamp } } + caller.Logger.Infof("GetJobByIdResponse: %v", jobInfo) return &pb.GetJobByIdResponse{Job: jobInfo}, nil } } @@ -480,6 +496,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs failed: %v", st.Err()) return nil, st.Err() } if len(pendingUserResult) != 0 { @@ -512,6 +529,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } getFullCmdLine := getJobInfoCmdLine + " " + "--format='%b %a %A %C %D %j %l %m %M %P %q %S %T %u %V %Z %N' | tr '\n' ';'" + caller.Logger.Tracef("GetJobs get jobs command: %v", getFullCmdLine) runningjobInfo, err := utils.RunCommand(getFullCmdLine) if err != nil || utils.CheckSlurmStatus(runningjobInfo) { errInfo := &errdetails.ErrorInfo{ @@ -519,6 +537,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } // runningJobInfoList := strings.Split(runningjobInfo, ",") @@ -648,6 +667,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get sortJobinfo := utils.SortJobInfo(sortKey, sortOrder, jobInfo) return &pb.GetJobsResponse{Jobs: sortJobinfo}, nil } + caller.Logger.Tracef("GetJobs GetJobsResponse is: %v", &pb.GetJobsResponse{Jobs: jobInfo}) return &pb.GetJobsResponse{Jobs: jobInfo}, nil } @@ -660,6 +680,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = 
st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs failed: %v", st.Err()) return nil, st.Err() } @@ -679,6 +700,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } defer rowList.Close() @@ -690,6 +712,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } gpuIdList = append(gpuIdList, gpuId) @@ -701,6 +724,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } @@ -740,6 +764,13 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } baseSQL := fmt.Sprintf("SELECT account, id_user, cpus_req, job_name, id_job, id_qos, mem_req, nodelist, nodes_alloc, `partition`, state, timelimit, time_submit, time_start, time_end, time_suspended, gres_used, work_dir, tres_alloc, tres_req FROM %s_job_table WHERE ", clusterName) + databaseEncode := caller.ConfigValue.MySQLConfig.DatabaseEncode + caller.Logger.Tracef("Database encode is: %s", databaseEncode) + // 正常情况下,数据库编码格式是latin1,环境部署时config.yaml中databaseEncode也会配成latin1。 但是为了防止config.yaml中databaseEncode配成utf8,查询时需要做这样的转换。 + if strings.Contains(databaseEncode, "utf8") { + baseSQL = fmt.Sprintf("SELECT account, id_user, cpus_req, CONVERT(CAST(job_name AS BINARY) USING utf8) AS job_name, id_job, id_qos, mem_req, nodelist, nodes_alloc, `partition`, state, timelimit, time_submit, time_start, time_end, time_suspended, gres_used, CONVERT(CAST(work_dir AS BINARY) USING utf8) AS work_dir, tres_alloc, tres_req FROM %s_job_table WHERE ", 
clusterName) + } + conditions := []string{} if uidListString != "" { @@ -778,7 +809,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get totalParams = append(totalParams, *in.Filter.JobId) } if in.Filter.JobName != nil { - conditions = append(conditions, "job_name = ?") + conditions = append(conditions, "CONVERT(CAST(job_name AS BINARY) USING utf8) = ?") params = append(params, *in.Filter.JobName) totalParams = append(totalParams, *in.Filter.JobName) } @@ -804,6 +835,13 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } else { // 没有搜索条件 baseSQL := fmt.Sprintf("SELECT account, id_user, cpus_req, job_name, id_job, id_qos, mem_req, nodelist, nodes_alloc, `partition`, state, timelimit, time_submit, time_start, time_end, time_suspended, gres_used, work_dir, tres_alloc, tres_req FROM %s_job_table ", clusterName) + databaseEncode := caller.ConfigValue.MySQLConfig.DatabaseEncode + caller.Logger.Tracef("Database encode is: %s", databaseEncode) + // 正常情况下,数据库编码格式是latin1,环境部署时config.yaml中databaseEncode也会配成latin1。 但是为了防止config.yaml中databaseEncode配成utf8,查询时需要做这样的转换。 + if strings.Contains(databaseEncode, "utf8") { + baseSQL = fmt.Sprintf("SELECT account, id_user, cpus_req, CONVERT(CAST(job_name AS BINARY) USING utf8) AS job_name, id_job, id_qos, mem_req, nodelist, nodes_alloc, `partition`, state, timelimit, time_submit, time_start, time_end, time_suspended, gres_used, CONVERT(CAST(work_dir AS BINARY) USING utf8) AS work_dir, tres_alloc, tres_req FROM %s_job_table WHERE ", clusterName) + } + if in.PageInfo != nil { // 分页的情况 没有搜索的情况 page := in.PageInfo.Page @@ -823,6 +861,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get jobSqlTotalConfig = fmt.Sprintf("SELECT count(*) FROM %s_job_table", clusterName) // 总的作业条数 } } + caller.Logger.Tracef("GetJobs sql: %v, params: %v", jobSqlConfig, params) rows, err := caller.DB.Query(jobSqlConfig, params...) 
if err != nil { errInfo := &errdetails.ErrorInfo{ @@ -830,6 +869,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -842,6 +882,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v", st.Err()) return nil, st.Err() } if len(pendingResult) != 0 { @@ -850,8 +891,12 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get for rows.Next() { err := rows.Scan(&account, &idUser, &cpusReq, &jobName, &jobId, &idQos, &memReq, &nodeList, &nodesAlloc, &partition, &state, &timeLimitMinutes, &submitTime, &startTime, &endTime, &timeSuspended, &gresUsed, &workingDirectory, &tresAlloc, &tresReq) if err != nil { + caller.Logger.Errorf("GetJobs Failed: %v ", err) continue } + caller.Logger.Tracef("GetJobs account: %v, idUser: %v, cpusReq: %v, jobName: %v, jobId: %v, idQos: %v, memReq: %v, nodeList: %v, nodesAlloc: %v, partition: %v, state: %v, "+ + "timeLimitMinutes: %v, submitTime: %v, startTime: %v, endTime: %v, timeSuspended: %v, gresUsed: %v, workingDirectory: %v, tresAlloc: %v, tresReq: %v", account, idUser, + cpusReq, jobName, jobId, idQos, memReq, nodeList, nodesAlloc, partition, state, timeLimitMinutes, submitTime, startTime, endTime, timeSuspended, gresUsed, workingDirectory, tresAlloc, tresReq) var ( elapsedSeconds int64 reason string @@ -902,6 +947,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs Failed: %v ", st.Err()) return nil, st.Err() } @@ -943,6 +989,7 @@ func (s *ServerJob) GetJobs(ctx context.Context, in 
*pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, "slurmctld down.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs failed: %v", st.Err()) return nil, st.Err() } @@ -1077,14 +1124,17 @@ func (s *ServerJob) GetJobs(ctx context.Context, in *pb.GetJobsRequest) (*pb.Get } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("GetJobs failed: %v", st.Err()) return nil, st.Err() } // 获取总的页数逻辑 if jobSqlTotalConfig != "" { caller.DB.QueryRow(jobSqlTotalConfig, totalParams...).Scan(&count) totalCount = uint32(count) + caller.Logger.Tracef("GetJobs GetJobsResponse is: %v", &pb.GetJobsResponse{Jobs: jobInfo, TotalCount: &totalCount}) return &pb.GetJobsResponse{Jobs: jobInfo, TotalCount: &totalCount}, nil } + caller.Logger.Tracef("GetJobs GetJobsResponse is: %v", &pb.GetJobsResponse{Jobs: jobInfo}) return &pb.GetJobsResponse{Jobs: jobInfo}, nil } @@ -1103,6 +1153,7 @@ func (s *ServerJob) SubmitJob(ctx context.Context, in *pb.SubmitJobRequest) (*pb } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitJob failed: %v", st.Err()) return nil, st.Err() } // 检查账户是否在slurm中 @@ -1115,6 +1166,7 @@ func (s *ServerJob) SubmitJob(ctx context.Context, in *pb.SubmitJobRequest) (*pb message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitJob failed: %v", st.Err()) return nil, st.Err() } @@ -1181,11 +1233,13 @@ func (s *ServerJob) SubmitJob(ctx context.Context, in *pb.SubmitJobRequest) (*pb } st := status.New(codes.Unknown, submitResponse) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitJob failed: %v", st.Err()) return nil, st.Err() } responseList := strings.Split(strings.TrimSpace(string(submitResponse)), " ") jobIdString := responseList[len(responseList)-1] jobId, _ := strconv.Atoi(jobIdString) + 
caller.Logger.Infof("SubmitJobResponse: %v", &pb.SubmitJobResponse{JobId: uint32(jobId), GeneratedScript: scriptString}) return &pb.SubmitJobResponse{JobId: uint32(jobId), GeneratedScript: scriptString}, nil } @@ -1202,6 +1256,7 @@ func (s *ServerJob) SubmitScriptAsJob(ctx context.Context, in *pb.SubmitScriptAs } st := status.New(codes.Internal, "The username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitScriptAsJob failed: %v", st.Err()) return nil, st.Err() } // 检查账户是否在slurm中 @@ -1214,6 +1269,7 @@ func (s *ServerJob) SubmitScriptAsJob(ctx context.Context, in *pb.SubmitScriptAs message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitScriptAsJob failed: %v", st.Err()) return nil, st.Err() } @@ -1230,6 +1286,7 @@ func (s *ServerJob) SubmitScriptAsJob(ctx context.Context, in *pb.SubmitScriptAs } st := status.New(codes.Unknown, "ScriptFileFullPath not setting") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitScriptAsJob failed: %v", st.Err()) return nil, st.Err() } chdirString := fmt.Sprintf("#SBATCH --chdir=%s\n", *in.ScriptFileFullPath) @@ -1247,12 +1304,14 @@ func (s *ServerJob) SubmitScriptAsJob(ctx context.Context, in *pb.SubmitScriptAs } st := status.New(codes.Unknown, submitResponse) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("SubmitScriptAsJob failed: %v", st.Err()) return nil, st.Err() } else { // 这里还要获取jobid responseList := strings.Split(strings.TrimSpace(string(submitResponse)), " ") jobIdString := responseList[len(responseList)-1] jobId, _ := strconv.Atoi(jobIdString) + caller.Logger.Infof("SubmitJobResponse: %v", &pb.SubmitScriptAsJobResponse{JobId: uint32(jobId)}) return &pb.SubmitScriptAsJobResponse{JobId: uint32(jobId)}, nil } } diff --git a/services/user/user.go b/services/user/user.go index 373c6d9..a6eb7e4 100644 --- a/services/user/user.go +++ b/services/user/user.go @@ 
-37,6 +37,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } @@ -50,6 +51,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } @@ -62,6 +64,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } defer rows.Close() @@ -73,6 +76,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } qosList = append(qosList, qosName) @@ -84,6 +88,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } @@ -96,6 +101,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.Internal, "Exec command failed or don't set partitions.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } @@ -114,6 +120,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.AlreadyExists, "Command exec fail.") st, _ = st.WithDetails(errInfo) + 
caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } retcode02 := utils.ExecuteShellCommand(modifyUserCmd) @@ -123,6 +130,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.AlreadyExists, "Command exec fail.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } } @@ -143,6 +151,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.AlreadyExists, "Command exec fail. ") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } retCode2 := utils.ExecuteShellCommand(modifyUserCmd) @@ -152,9 +161,11 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.AlreadyExists, "Command exec fail.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } } + caller.Logger.Infof("AddUserToAccount sucess! 
User is: %v, Account is: %v", in.UserId, in.AccountName) return &pb.AddUserToAccountResponse{}, nil } // 关联已经存在的情况 @@ -163,6 +174,7 @@ func (s *ServerUser) AddUserToAccount(ctx context.Context, in *pb.AddUserToAccou } st := status.New(codes.AlreadyExists, "The user already exists in account.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("AddUserToAccount failed: %v", st.Err()) return nil, st.Err() } @@ -186,6 +198,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -199,6 +212,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } // 检查用户名是否在slurm中 @@ -211,6 +225,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -224,6 +239,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("%s and %s assocation is not exists!", in.UserId, in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -236,6 +252,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", 
st.Err()) return nil, st.Err() } defer rows.Close() @@ -247,6 +264,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } acctList = append(acctList, acct) @@ -258,6 +276,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -270,6 +289,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } // 检查用户是否有未结束的作业 @@ -281,6 +301,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } defer jobRows.Close() @@ -292,6 +313,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } jobList = append(jobList, jobName) @@ -303,6 +325,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -315,6 +338,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("The %s have running 
jobs!", in.UserId) st := status.New(codes.Internal, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } @@ -322,6 +346,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse deletedUserCmd := fmt.Sprintf("sacctmgr -i delete user name=%s", in.UserId) res := utils.ExecuteShellCommand(deletedUserCmd) if res == 0 { + caller.Logger.Infof("RemoveUserFromAccount sucess! User is: %v, Account is: %v", in.UserId, in.AccountName) return &pb.RemoveUserFromAccountResponse{}, nil } errInfo := &errdetails.ErrorInfo{ @@ -329,6 +354,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, "Shell command execute falied!") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } // 更改默认账号 @@ -339,6 +365,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse message := fmt.Sprintf("The %s have running jobs!", in.UserId) st := status.New(codes.Internal, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } updateDefaultAcctCmd := fmt.Sprintf("sacctmgr -i update user set DefaultAccount=%s where user=%s", acctList[0], in.UserId) @@ -349,6 +376,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, "Shell command execute falied!") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } deleteUerFromAcctCmd := fmt.Sprintf("sacctmgr -i delete user name=%s account=%s", in.UserId, in.AccountName) @@ -359,6 +387,7 @@ func (s *ServerUser) RemoveUserFromAccount(ctx context.Context, in *pb.RemoveUse } st := status.New(codes.Internal, "Shell command execute falied!") st, _ = st.WithDetails(errInfo) + 
caller.Logger.Errorf("RemoveUserFromAccount failed: %v", st.Err()) return nil, st.Err() } return &pb.RemoveUserFromAccountResponse{}, nil @@ -379,6 +408,7 @@ func (s *ServerUser) BlockUserInAccount(ctx context.Context, in *pb.BlockUserInA } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockUserInAccount failed: %v", st.Err()) return nil, st.Err() } @@ -392,6 +422,7 @@ func (s *ServerUser) BlockUserInAccount(ctx context.Context, in *pb.BlockUserInA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockUserInAccount failed: %v", st.Err()) return nil, st.Err() } // 检查用户是否在slurm中 @@ -403,6 +434,7 @@ func (s *ServerUser) BlockUserInAccount(ctx context.Context, in *pb.BlockUserInA } st := status.New(codes.Internal, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockUserInAccount failed: %v", st.Err()) return nil, st.Err() } // 检查账户与用户是否存在关联关系 @@ -416,12 +448,14 @@ func (s *ServerUser) BlockUserInAccount(ctx context.Context, in *pb.BlockUserInA message := fmt.Sprintf("%s and %s assocation is not exists!", in.UserId, in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockUserInAccount failed: %v", st.Err()) return nil, st.Err() } // 关联存在的情况下直接封锁账户 blockUserCmd := fmt.Sprintf("sacctmgr -i -Q modify user where name=%s account=%s set MaxSubmitJobs=0 MaxJobs=0 GrpJobs=0 GrpSubmit=0 GrpSubmitJobs=0 MaxSubmitJobs=0", in.UserId, in.AccountName) res := utils.ExecuteShellCommand(blockUserCmd) if res == 0 { + caller.Logger.Infof("BlockUserInAccount sucess! 
User is: %v, Account is: %v", in.UserId, in.AccountName) return &pb.BlockUserInAccountResponse{}, nil } errInfo := &errdetails.ErrorInfo{ @@ -429,6 +463,7 @@ func (s *ServerUser) BlockUserInAccount(ctx context.Context, in *pb.BlockUserInA } st := status.New(codes.Internal, "Shell command execute falied!") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("BlockUserInAccount failed: %v", st.Err()) return nil, st.Err() } @@ -448,6 +483,7 @@ func (s *ServerUser) UnblockUserInAccount(ctx context.Context, in *pb.UnblockUse } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockUserInAccount failed: %v", st.Err()) return nil, st.Err() } @@ -462,6 +498,7 @@ func (s *ServerUser) UnblockUserInAccount(ctx context.Context, in *pb.UnblockUse mesaage := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, mesaage) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockUserInAccount failed: %v", st.Err()) return nil, st.Err() } // 检查用户是否在slurm中 @@ -474,6 +511,7 @@ func (s *ServerUser) UnblockUserInAccount(ctx context.Context, in *pb.UnblockUse message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockUserInAccount failed: %v", st.Err()) return nil, st.Err() } @@ -487,12 +525,14 @@ func (s *ServerUser) UnblockUserInAccount(ctx context.Context, in *pb.UnblockUse message := fmt.Sprintf("%s and %s assocation is not exists!", in.UserId, in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockUserInAccount failed: %v", st.Err()) return nil, st.Err() } // 最大提交作业数为NULL表示没被封锁 maxSubmitJobsSqlConfig := fmt.Sprintf("SELECT DISTINCT max_submit_jobs FROM %s_assoc_table WHERE user = ? AND acct = ? 
AND deleted = 0", clusterName) err = caller.DB.QueryRow(maxSubmitJobsSqlConfig, in.UserId, in.AccountName).Scan(&maxSubmitJobs) if err != nil { + caller.Logger.Infof("UnblockUserInAccount sucess! User id: %v, Account is: %v", in.UserId, in.AccountName) return &pb.UnblockUserInAccountResponse{}, nil } // 用户从账户中解封的操作 @@ -506,6 +546,7 @@ func (s *ServerUser) UnblockUserInAccount(ctx context.Context, in *pb.UnblockUse } st := status.New(codes.Internal, "Shell command execute falied!") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("UnblockUserInAccount failed: %v", st.Err()) return nil, st.Err() } @@ -527,6 +568,7 @@ func (s *ServerUser) QueryUserInAccountBlockStatus(ctx context.Context, in *pb.Q } st := status.New(codes.Internal, "The account or username contains illegal characters.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryUserInAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } @@ -541,6 +583,7 @@ func (s *ServerUser) QueryUserInAccountBlockStatus(ctx context.Context, in *pb.Q message := fmt.Sprintf("%s does not exists.", in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryUserInAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } // 判断用户是否在slurm中 @@ -553,6 +596,7 @@ func (s *ServerUser) QueryUserInAccountBlockStatus(ctx context.Context, in *pb.Q message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryUserInAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } @@ -566,14 +610,17 @@ func (s *ServerUser) QueryUserInAccountBlockStatus(ctx context.Context, in *pb.Q message := fmt.Sprintf("%s and %s assocation is not exists!", in.UserId, in.AccountName) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("QueryUserInAccountBlockStatus failed: %v", st.Err()) return nil, st.Err() } // 
查询max_submit_jobs的值,通过max_submit_jobs来判断用户是否被封锁 maxSubmitJobSqlConfig := fmt.Sprintf("SELECT DISTINCT max_submit_jobs FROM %s_assoc_table WHERE user = ? AND acct = ? AND deleted = 0", clusterName) err = caller.DB.QueryRow(maxSubmitJobSqlConfig, in.UserId, in.AccountName).Scan(&maxSubmitJobs) if err != nil { + caller.Logger.Infof("User %v In Account %v is Unblocked Status", in.UserId, in.AccountName) return &pb.QueryUserInAccountBlockStatusResponse{Blocked: false}, nil } + caller.Logger.Infof("User %v In Account %v is Blocked Status", in.UserId, in.AccountName) return &pb.QueryUserInAccountBlockStatusResponse{Blocked: true}, nil } @@ -591,6 +638,7 @@ func (s *ServerUser) DeleteUser(ctx context.Context, in *pb.DeleteUserRequest) ( message := fmt.Sprintf("%s does not exists.", in.UserId) st := status.New(codes.NotFound, message) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteUser failed: %v", st.Err()) return nil, st.Err() } @@ -604,6 +652,7 @@ func (s *ServerUser) DeleteUser(ctx context.Context, in *pb.DeleteUserRequest) ( } st := status.New(codes.NotFound, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteUser failed: %v", st.Err()) return nil, st.Err() } @@ -616,8 +665,10 @@ func (s *ServerUser) DeleteUser(ctx context.Context, in *pb.DeleteUserRequest) ( } st := status.New(codes.NotFound, err.Error()) st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteUser failed: %v", st.Err()) return nil, st.Err() } + caller.Logger.Infof("Delete User: %v sucess!", in.UserId) // 执行成功直接返回 return &pb.DeleteUserResponse{}, nil } else { @@ -627,6 +678,7 @@ func (s *ServerUser) DeleteUser(ctx context.Context, in *pb.DeleteUserRequest) ( } st := status.New(codes.NotFound, "Exist running jobs.") st, _ = st.WithDetails(errInfo) + caller.Logger.Errorf("DeleteUser failed: %v", st.Err()) return nil, st.Err() } } diff --git a/services/version/version.go b/services/version/version.go index 9b93625..6faa56d 100644 --- 
a/services/version/version.go +++ b/services/version/version.go @@ -2,6 +2,7 @@ package version import ( "context" + "scow-slurm-adapter/caller" pb "scow-slurm-adapter/gen/go" ) @@ -10,5 +11,9 @@ type ServerVersion struct { } + // GetVersion reports the adapter version (1.6.0) and records it at trace level. func (s *ServerVersion) GetVersion(ctx context.Context, in *pb.GetVersionRequest) (*pb.GetVersionResponse, error) { + // Build the response once so the logged and returned versions cannot drift apart. + resp := &pb.GetVersionResponse{Major: 1, Minor: 6, Patch: 0} + caller.Logger.Tracef("Adapter Version is: %v", resp) - return &pb.GetVersionResponse{Major: 1, Minor: 6, Patch: 0}, nil + return resp, nil } diff --git a/utils/utils.go b/utils/utils.go index a9f470f..ca09ffa 100644 --- a/utils/utils.go +++ b/utils/utils.go @@ -20,6 +20,11 @@ import ( "gopkg.in/yaml.v3" ) + // LogConfig holds the settings read from the "log" section of the YAML config; initLogger recognizes the levels "info", "debug" and "trace" and falls back to info otherwise. +type LogConfig struct { + Level string `yaml:"level"` +} + type MySQLConfig struct { Host string `yaml:"host"` Port int `yaml:"port"` @@ -49,6 +54,7 @@ type PartitionDesc struct { } type Config struct { + LogConfig LogConfig `yaml:"log"` MySQLConfig MySQLConfig `yaml:"mysql"` Service Service `yaml:"service"` Slurm Slurm `yaml:"slurm"`