cms-sw · cmsbuild · Jan 30, 2024 · Nov 21, 2023 · Dec 5, 2023 · Dec 11, 2023
diff --git a/HeterogeneousCore/SonicTriton/README.md b/HeterogeneousCore/SonicTriton/README.md
@@ -124,14 +124,18 @@ In a SONIC Triton producer, the basic flow should follow this pattern:
 
 ## Services
 
+### `cmsTriton`
+
 A script [`cmsTriton`](./scripts/cmsTriton) is provided to launch and manage local servers.
-The script has two operations (`start` and `stop`) and the following options:
+The script has three operations (`start`, `stop`, `check`) and the following options:
 * `-c`: don't cleanup temporary dir (for debugging)
+* `-C [dir]`: directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)
 * `-D`: dry run: print container commands rather than executing them
 * `-d`: use Docker instead of Apptainer
 * `-f`: force reuse of (possibly) existing container instance
 * `-g`: use GPU instead of CPU
 * `-i [name]`: server image name (default: fastml/triton-torchgeo:22.07-py3-geometric)
+* `-I [num]`: number of model instances (default: 0 -> means no local editing of config files)
 * `-M [dir]`: model repository (can be given more than once)
 * `-m [dir]`: specific model directory (can be given more than once)
 * `-n [name]`: name of container instance, also used for hidden temporary dir (default: triton_server_instance)
@@ -148,13 +152,32 @@ Additional details and caveats:
 * The `start` and `stop` operations for a given container instance should always be executed in the same directory
 if a relative path is used for the hidden temporary directory (including the default from the container instance name),
 in order to ensure that everything is properly cleaned up.
+* The `check` operation just checks if the server can run on the current system, based on driver compatibility.
 * A model repository is a folder that contains multiple model directories, while a model directory contains the files for a specific model.
 (In the example below, `$CMSSW_BASE/src/HeterogeneousCore/SonicTriton/data/models` is a model repository,
 while `$CMSSW_BASE/src/HeterogeneousCore/SonicTriton/data/models/resnet50_netdef` is a model directory.)
 If a model repository is provided, all of the models it contains will be provided to the server.
 * Older versions of Apptainer (Singularity) have a short timeout that may cause launching the server to fail the first time the command is executed.
 The `-r` (retry) flag exists to work around this issue.
 
+### `cmsTritonConfigTool`
+
+The `config.pbtxt` files used for model configuration are written in the protobuf text format.
+To ease modification of these files, a dedicated Python tool [`cmsTritonConfigTool`](./scripts/cmsTritonConfigTool) is provided.
+The tool has several modes of operation (each with its own options, which can be viewed using `--help`):
+* `schema`: displays all field names and types for the Triton ModelConfig message class.
+* `view`: displays the field values from a provided `config.pbtxt` file.
+* `edit`: allows changing any field value in a `config.pbtxt` file. Non-primitive types are specified using JSON format.
+* `checksum`: checks and updates checksums for model files (to enforce versioning).
+* `versioncheck`: checks and updates checksums for all `config.pbtxt` files in `$CMSSW_SEARCH_PATH`.
+* `threadcontrol`: adds job- and ML framework-specific thread control settings.
+
+The `edit` mode is intended for generic modifications, and only supports overwriting existing values
+(not modifying, removing, deleting, etc.).
+Additional dedicated modes, like `checksum` and `threadcontrol`, can easily be added for more complicated tasks.
+
+### `TritonService`
+
 A central `TritonService` is provided to keep track of all available servers and which models they can serve.
 The servers will automatically be assigned to clients at startup.
 If some models are not served by any server, the `TritonService` can launch a fallback server using the `cmsTriton` script described above.

diff --git a/HeterogeneousCore/SonicTriton/interface/triton_utils.h b/HeterogeneousCore/SonicTriton/interface/triton_utils.h
@@ -83,6 +83,7 @@ extern template std::string triton_utils::printColl(const edm::Span<std::vector<
                                                     const std::string& delim);
 extern template std::string triton_utils::printColl(const std::vector<uint8_t>& coll, const std::string& delim);
 extern template std::string triton_utils::printColl(const std::vector<float>& coll, const std::string& delim);
+extern template std::string triton_utils::printColl(const std::vector<std::string>& coll, const std::string& delim);
 extern template std::string triton_utils::printColl(const std::unordered_set<std::string>& coll,
                                                     const std::string& delim);
 

diff --git a/HeterogeneousCore/SonicTriton/scripts/cmsTriton b/HeterogeneousCore/SonicTriton/scripts/cmsTriton
@@ -34,7 +34,7 @@ get_sandbox(){
 
 usage() {
 	ECHO="echo -e"
-	$ECHO "cmsTriton [options] [start|stop]"
+	$ECHO "cmsTriton [options] [start|stop|check]"
 	$ECHO
 	$ECHO "Options:"
 	$ECHO "-c          \t don't cleanup temporary dir (for debugging)"
@@ -338,57 +338,6 @@ wait_server(){
 	echo "server is ready!"
 }
 
-edit_model(){
-	MODELNAME=$1
-	NUMINSTANCES=$2
-
-	cp -r $MODELNAME $TMPDIR/$LOCALMODELREPO/
-	COPY_EXIT=$?
-	if [ "$COPY_EXIT" -ne 0 ]; then
-		echo "Could not copy $MODELNAME into $TMPDIR/$LOCALMODELREPO/"
-		exit "$COPY_EXIT"
-	fi
-	IFS='/' read -ra ADDR <<< "$MODELNAME"
-	CONFIG=$TMPDIR/$LOCALMODELREPO/${ADDR[-1]}/config.pbtxt
-
-	PLATFORM=$(grep -m 1 "^platform:" "$CONFIG")
-
-	if [[ $PLATFORM == *"ensemble"* ]]; then
-		#recurse over submodels of ensemble model
-		MODELLOC=$(echo ""${ADDR[@]:0:${#ADDR[@]}-1} | sed "s/ /\//g")
-		SUBNAME=$(grep "model_name:" "$CONFIG" | sed 's/model_name://; s/"//g')
-		for SUBMODEL in ${SUBNAME}; do
-			SUBMODEL=${MODELLOC}/${SUBMODEL}
-			edit_model $SUBMODEL "$INSTANCES"
-		done
-	else
-		#This is not an ensemble model, so we should edit the config file
-		cat <<EOF >> $CONFIG
-instance_group [
-  {
-    count: $NUMINSTANCES
-    kind: KIND_CPU
-  }
-]
-
-EOF
-		if [[ $PLATFORM == *"onnx"* ]]; then
-			cat <<EOF >> $CONFIG
-parameters { key: "intra_op_thread_count" value: { string_value: "1" } }
-parameters { key: "inter_op_thread_count" value: { string_value: "1" } }
-EOF
-		elif [[ $PLATFORM == *"tensorflow"* ]]; then
-			cat <<EOF >> $CONFIG
-parameters { key: "TF_NUM_INTRA_THREADS" value: { string_value: "1" } }
-parameters { key: "TF_NUM_INTER_THREADS" value: { string_value: "1" } }
-parameters { key: "TF_USE_PER_SESSION_THREADS" value: { string_value: "1" } }
-EOF
-		else
-			echo "Warning: thread (instance) control not implemented for $PLATFORM"
-		fi
-	fi
-}
-
 list_models(){
 	# make list of model repositories
 	LOCALMODELREPO="local_model_repo"
@@ -411,7 +360,12 @@ list_models(){
 			MODEL="$(dirname "$MODEL")"
 		fi
 		if [ "$INSTANCES" -gt 0 ]; then
-			edit_model $MODEL "$INSTANCES"
+			$DRYRUN cmsTritonConfigTool threadcontrol -c ${MODEL}/config.pbtxt --copy $TMPDIR/$LOCALMODELREPO --nThreads $INSTANCES
+			TOOL_EXIT=$?
+			if [ "$TOOL_EXIT" -ne 0 ]; then
+				echo "Could not apply threadcontrol to $MODEL"
+				exit "$TOOL_EXIT"
+			fi
 		else
 			REPOS+=("$(dirname "$MODEL")")
 		fi