From bf9595ee7c77d913db0506a29ccf455f8eb6e98b Mon Sep 17 00:00:00 2001
From: Eli Bendersky
Date: Thu, 15 Aug 2024 16:18:21 -0600
Subject: [PATCH] vertexai: import go-sentencepiece to internal (#10689)

The import script `import-go-sentencepiece.sh` is the only new piece of
code here. Tests are excluded, since they require downloading the proto
model and setting `MODELPATH` to point at it. We'll add some smoke
testing for the new package wrapping this for the SDK.
---
 vertexai/internal/LICENSE_HEADER | 14 +
 vertexai/internal/import-go-sentencepiece.sh | 53 +
 vertexai/internal/sentencepiece/.gitignore | 25 +
 vertexai/internal/sentencepiece/LICENSE | 201 +++
 vertexai/internal/sentencepiece/README.md | 56 +
 vertexai/internal/sentencepiece/encoder.go | 332 ++++
 .../sentencepiece/internal/cmd/dumper/main.go | 87 +
 .../sentencepiece/internal/model/gen.sh | 11 +
 .../internal/model/sentencepiece_model.pb.go | 1556 +++++++++++++++++
 .../internal/model/sentencepiece_model.proto | 332 ++++
 .../internal/prefixmatcher/prefixmatcher.go | 82 +
 .../internal/priorityqueue/priorityqueue.go | 108 ++
 vertexai/internal/sentencepiece/normalize.go | 34 +
 vertexai/internal/sentencepiece/token.go | 29 +
 14 files changed, 2920 insertions(+)
 create mode 100644 vertexai/internal/LICENSE_HEADER
 create mode 100755 vertexai/internal/import-go-sentencepiece.sh
 create mode 100644 vertexai/internal/sentencepiece/.gitignore
 create mode 100644 vertexai/internal/sentencepiece/LICENSE
 create mode 100644 vertexai/internal/sentencepiece/README.md
 create mode 100644 vertexai/internal/sentencepiece/encoder.go
 create mode 100644 vertexai/internal/sentencepiece/internal/cmd/dumper/main.go
 create mode 100755 vertexai/internal/sentencepiece/internal/model/gen.sh
 create mode 100644 vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go
 create mode 100644 vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
 create mode 100644 vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
 create mode 100644 vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
 create mode 100644 vertexai/internal/sentencepiece/normalize.go
 create mode 100644 vertexai/internal/sentencepiece/token.go

diff --git a/vertexai/internal/LICENSE_HEADER b/vertexai/internal/LICENSE_HEADER
new file mode 100644
index 000000000000..b3b714d5e766
--- /dev/null
+++ b/vertexai/internal/LICENSE_HEADER
@@ -0,0 +1,14 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
diff --git a/vertexai/internal/import-go-sentencepiece.sh b/vertexai/internal/import-go-sentencepiece.sh
new file mode 100755
index 000000000000..50ba93f6541c
--- /dev/null
+++ b/vertexai/internal/import-go-sentencepiece.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Imports github.com/eliben/go-sentencepiece for local vendoring in our module, +# with the author's permission. + +# Fail on any error +set -eo pipefail + +# Display commands being run +set -x + +# Create a temporary directory +TEMP_DIR=$(mktemp -d) + +# Clone the repository with --depth 1 to get only the latest files +git clone --depth 1 https://github.com/eliben/go-sentencepiece.git "$TEMP_DIR/go-sentencepiece" + +# Copy the repository contents to here, excluding the .git directory +rm -rf sentencepiece +mkdir -p sentencepiece +rsync -av \ + --exclude='.git' \ + --exclude='go.mod' \ + --exclude='go.sum' \ + --exclude='test' \ + --exclude='*_test.go' \ + "$TEMP_DIR/go-sentencepiece/" sentencepiece + +# Replace import paths. +find "sentencepiece" -type f -name '*.go' \ + -exec sed -i 's|github.com/eliben/go-sentencepiece|cloud.google.com/go/vertexai/internal/sentencepiece|g' {} + + +# Prepend the LICENSE_HEADER to each .go file +GO_FILES=$(find sentencepiece -type f -name '*.go') +LICENSE_HEADER=$(realpath "LICENSE_HEADER") + +for gofile in $GO_FILES; do + cat "$LICENSE_HEADER" "$gofile" > "$gofile.tmp" && mv "$gofile.tmp" "$gofile" +done diff --git a/vertexai/internal/sentencepiece/.gitignore b/vertexai/internal/sentencepiece/.gitignore new file mode 100644 index 000000000000..6f72f8926186 --- /dev/null +++ b/vertexai/internal/sentencepiece/.gitignore @@ -0,0 +1,25 @@ +# If you prefer the allow list template instead of the deny list, see community template: +# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work +go.work.sum + +# env file +.env diff --git a/vertexai/internal/sentencepiece/LICENSE b/vertexai/internal/sentencepiece/LICENSE new file mode 100644 index 000000000000..261eeb9e9f8b --- /dev/null +++ b/vertexai/internal/sentencepiece/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vertexai/internal/sentencepiece/README.md b/vertexai/internal/sentencepiece/README.md new file mode 100644 index 000000000000..724221784780 --- /dev/null +++ b/vertexai/internal/sentencepiece/README.md @@ -0,0 +1,56 @@ +# go-sentencepiece + +[![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-sentencepiece.svg)](https://pkg.go.dev/github.com/eliben/go-sentencepiece) + +This is a pure Go implementation of encoding text with +the [SentencePiece tokenizer](https://github.com/google/sentencepiece). + +"Encoding" is the operation used to split text into tokens, using +a trained tokenizer model. + +SentencePiece is a general family of tokenizers that is configured +by a protobuf configuration file. This repository currently focuses +on implementing just the functionality required to reproduce the +tokenization of [Gemma models](https://ai.google.dev/gemma) (the same +tokenizer is used for Google's proprietary Gemini family of models). +Specifically, it only implements BPE tokenization since this is what +Gemma uses. + +## Current status + +This package should be ready to use for encoding text into tokens +using the Gemma tokenizer; it's been reasonably optimized and extensively +tested vs. the [SentencePiece Python bindings](https://pypi.org/project/sentencepiece/) +(see `system_test.go` in this repository). + +If you find any problems or discrepancies, please open an issue. + +## Tokenizer configuration + +The configuration file for the tokenizer is a protobuf (structured +data, serialized in the [protocol buffer format](https://protobuf.dev/)) +that describes a trained tokenizer model; it includes +the complete learned vocabulary used for tokenization, as well as +other configuration information. + +It is not part of this repository. Please fetch it from the +[official Gemma implementation repository](https://github.com/google/gemma_pytorch/tree/main/tokenizer). +`NewEncoder*` constructors will expect to read this file. + +## Developing + +A protobuf is used to configure the tokenizer. The structure of the +protobuf is described by the `internal/model/sentencepiece_model.proto` file, +which is vendored from https://github.com/google/sentencepiece + +To re-generate the `*.pb.go` file from it: + +``` +$ cd internal/model +$ ./gen.sh +``` + +The configuration protobuf itself is obtained as described in the +[Tokenizer configuration](#tokenizer-configuration) section. All +tests require the `MODELPATH` env var to point to a local +copy of the tokenizer configuration file. diff --git a/vertexai/internal/sentencepiece/encoder.go b/vertexai/internal/sentencepiece/encoder.go new file mode 100644 index 000000000000..89c8b7785761 --- /dev/null +++ b/vertexai/internal/sentencepiece/encoder.go @@ -0,0 +1,332 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
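For orientation, here is a minimal sketch of how the vendored encoder can be
used, following the README above and the `Encoder` API in this file. The model
path is a placeholder for a Gemma `tokenizer.model` fetched as described in the
README's "Tokenizer configuration" section; since the package sits under
`internal/`, only code within the `vertexai` module can actually import it:

```go
package main

import (
	"fmt"
	"log"

	"cloud.google.com/go/vertexai/internal/sentencepiece"
)

func main() {
	// Placeholder path to a locally fetched Gemma tokenizer model.
	enc, err := sentencepiece.NewEncoderFromPath("/path/to/tokenizer.model")
	if err != nil {
		log.Fatal(err)
	}

	// Encode tokenizes the input text into (ID, Text) pairs.
	for _, tok := range enc.Encode("hello world") {
		fmt.Printf("%d\t%q\n", tok.ID, tok.Text)
	}
}
```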
+ +package sentencepiece + +import ( + "fmt" + "io" + "os" + "strconv" + "strings" + "unicode/utf8" + + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/model" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/prefixmatcher" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/priorityqueue" + "google.golang.org/protobuf/proto" +) + +const debugEncode = false + +// Encoder represents a SentencePiece encoder (tokenizer). +// An Encoder converts input text into a sequence of tokens LLMs use. +// The mapping between token IDs and the text they represent is read from the +// model proto (provided to the constructor); it's the same between all calls +// to the Encode method. +type Encoder struct { + model *model.ModelProto + + pieces map[string]int + reserved map[string]int + + // unknownID is the token identifier of the UNKNOWN piece + unknownID int + + // userDefinedMatcher is a prefix matcher for symbols that are of + // "user-defined" type in the model proto. + userDefinedMatcher *prefixmatcher.PrefixMatcher + + // byteTokens is a cache of byte values and the tokens they represent + byteTokens map[byte]Token +} + +// NewEncoderFromPath creates a new Encoder from a file path to the protobuf +// data. +func NewEncoderFromPath(protoFile string) (*Encoder, error) { + f, err := os.Open(protoFile) + if err != nil { + return nil, fmt.Errorf("unable to read %q: %v", protoFile, err) + } + defer f.Close() + return NewEncoder(f) +} + +// NewEncoder creates a new Encoder from a reader with the protobuf data. +func NewEncoder(protoReader io.Reader) (*Encoder, error) { + b, err := io.ReadAll(protoReader) + if err != nil { + return nil, fmt.Errorf("unable to read protobuf data: %v", err) + } + + var mp model.ModelProto + err = proto.Unmarshal(b, &mp) + if err != nil { + return nil, fmt.Errorf("unable to unmarshal protobuf: %v", err) + } + + tspec := mp.GetTrainerSpec() + if tspec.GetModelType() != model.TrainerSpec_BPE { + return nil, fmt.Errorf("model type %s not supported", tspec.GetModelType()) + } + + userDefined := make(map[string]bool) + pieces := make(map[string]int) + reserved := make(map[string]int) + byteTokens := make(map[byte]Token) + unkID := -1 + + for i, piece := range mp.GetPieces() { + isNormalPiece := (piece.GetType() == model.ModelProto_SentencePiece_NORMAL || + piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED || + piece.GetType() == model.ModelProto_SentencePiece_UNUSED) + + if isNormalPiece { + pieces[piece.GetPiece()] = i + } else { + reserved[piece.GetPiece()] = i + } + + if piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED { + userDefined[piece.GetPiece()] = true + } else if piece.GetType() == model.ModelProto_SentencePiece_UNKNOWN { + if unkID > 0 { + return nil, fmt.Errorf("unk redefined") + } + unkID = i + } else if piece.GetType() == model.ModelProto_SentencePiece_BYTE { + if !tspec.GetByteFallback() { + return nil, fmt.Errorf("byte piece %q is found although `byte_fallback=false`", piece.GetPiece()) + } + bv := convertHexValue(piece.GetPiece()) + if bv >= 0 && bv < 256 { + byteTokens[byte(bv)] = Token{ID: i, Text: piece.GetPiece()} + } + } + } + + if unkID < 0 { + return nil, fmt.Errorf("unk symbol is not defined") + } + + // In case byte_fallback is specified, make sure that all 256 possible byte + // values were found. 
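+	// (Each byte piece was registered above from its "<0xNN>" form via
+	// convertHexValue.)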
+ if tspec.GetByteFallback() { + for i := 0; i < 256; i++ { + if _, found := byteTokens[byte(i)]; !found { + return nil, fmt.Errorf("byte value 0x%02X not found", i) + } + } + } + + return &Encoder{ + model: &mp, + userDefinedMatcher: prefixmatcher.NewFromSet(userDefined), + byteTokens: byteTokens, + unknownID: unkID, + pieces: pieces, + reserved: reserved, + }, nil +} + +// Encode tokenizes the input text and returns a list of Tokens. +func (enc *Encoder) Encode(text string) []Token { + text = normalize(text) + + // We begin by having each symbol a single Unicode character (or a + // user-defined string), and will iteratively merge them into larger and + // larger symbols until we have the final list of tokens. + // Since this list of symbols changes a lot, we represent it as a + // doubly-linked list in the symList slice. Each element in this slice has + // prev/next links to the next "live" symbol in the list; noMerge means this + // is a user-defined symbol we're not allowed to merge with neighbors. + // After the algorithm is finished, many elements in symList will be "dead" + // (unreachable by next/prev links from the first element). + // This representation is inspired by the implementation of bpe::Model + // in the SentencePiece C++ library. + + type symListElem struct { + prev, next int + noMerge bool + symbol string + } + symList := make([]symListElem, 0, len(text)) + + for { + // Match the next symbol in text + slen, found := enc.symbolMatch(text) + + // Append a list element for this symbol; note that this element will be + // at index len(symList), so prev/next are set up accordingly. + sym := symListElem{ + noMerge: found, + symbol: text[:slen], + prev: len(symList) - 1, + next: len(symList) + 1, + } + symList = append(symList, sym) + + // Advance the text slice to the next symbol; if no more text, we're done. + text = text[slen:] + if len(text) == 0 { + break + } + } + + if len(symList) == 0 { + return nil + } + symList[len(symList)-1].next = -1 + + debugShowSymList := func(prefix string) { + if debugEncode { + fmt.Println(prefix) + for i, elem := range symList { + fmt.Printf("[%3d]: [prev: %3v, next: %3d, noMerge: %v] %q\n", i, elem.prev, elem.next, elem.noMerge, elem.symbol) + } + } + } + debugShowSymList("initial") + + // To avoid repeating work, we manage a priority queue of "merge candidates". + // Each candidate has pointers to the symList list for the left and right + // symbol in the pair, as well as the combined symbol's score. + // The priority of merging is determined by this score, with position as + // the tie-breaker (earlier pairs are preferred). + type mergeCandidate struct { + left, right int + length int + score float32 + } + + mergeQueue := priorityqueue.New(func(a, b mergeCandidate) int { + if a.score > b.score || (a.score == b.score && a.left < b.left) { + return 1 + } + return -1 + }) + + // suggestNewMergePair is called to potentially add a new mergeCandidate to + // mergeQueue. The candidate is added if it's valid, both its parts are + // allowed to merge, and it appears in the vocabulary. 
+ suggestNewMergePair := func(left, right int) { + if left == -1 || right == -1 || symList[left].noMerge || symList[right].noMerge { + return + } + + mergedSymbol := symList[left].symbol + symList[right].symbol + if id, found := enc.pieces[mergedSymbol]; found { + mergeQueue.Insert(mergeCandidate{ + left: left, + right: right, + length: len(mergedSymbol), + score: enc.model.GetPieces()[id].GetScore(), + }) + } + } + + // Seed the merge queue with all pairs of symbols from symList + for i := 1; i < len(symList); i++ { + suggestNewMergePair(i-1, i) + } + + // Main loop + for mergeQueue.Len() > 0 { + candidate := mergeQueue.PopMax() + leftSymbol := symList[candidate.left] + rightSymbol := symList[candidate.right] + + // Make sure this candidate is not out of date. If one of its parts was + // already merged with another symbol, just skip this candidate. + if len(leftSymbol.symbol) == 0 || + len(rightSymbol.symbol) == 0 || + len(leftSymbol.symbol)+len(rightSymbol.symbol) != candidate.length { + continue + } + + // Do the merge: + // 1. Merge the concatenation of leftSymbol and rightSymbol into leftSymbol + symList[candidate.left].symbol = leftSymbol.symbol + rightSymbol.symbol + + // 2. Update prev/next pointers + symList[candidate.left].next = rightSymbol.next + if rightSymbol.next >= 0 { + symList[rightSymbol.next].prev = candidate.left + } + + // 3. Mark the right element in the pair as outdated (it's been merged + // into the left one). + symList[candidate.right].symbol = "" + + // 4. Add merge suggestions for the newly merged symbol with its neighbors + suggestNewMergePair(leftSymbol.prev, candidate.left) + suggestNewMergePair(candidate.left, rightSymbol.next) + } + + // Collect the final list of tokens from the remaining elements of symList. + tokens := make([]Token, 0, len(symList)) + for i := 0; i >= 0; i = symList[i].next { + symbol := symList[i].symbol + id := enc.symbolToID(symbol) + + if id == enc.unknownID && enc.model.GetTrainerSpec().GetByteFallback() { + // Decompose this symbol into bytes, and report each byte as a separate + // token. + for i := 0; i < len(symbol); i++ { + tokens = append(tokens, enc.byteTokens[symbol[i]]) + } + } else { + tokens = append(tokens, Token{ID: id, Text: symbol}) + } + } + + return tokens +} + +// symbolMatch finds the length of the first symbol in text. A symbol is either +// a user-defined symbol from the proto or a single rune. The second return +// value is true iff a user-defined symbol was matched. +func (enc *Encoder) symbolMatch(text string) (int, bool) { + prefixLen := enc.userDefinedMatcher.FindPrefixLen(text) + if prefixLen > 0 { + return prefixLen, true + } + // Not found a user-defined prefix; get the length of next rune. + _, rlen := utf8.DecodeRuneInString(text) + return rlen, false +} + +// symbolToID finds the right ID for the given textual symbol, or returns +// enc.unknownID if the symbol is unknown. +func (enc *Encoder) symbolToID(symbol string) int { + if id, found := enc.reserved[symbol]; found { + return id + } + if id, found := enc.pieces[symbol]; found { + return id + } + return enc.unknownID +} + +// convertHexValue converts strings of the form "<0xXY>" to the (unsigned) +// integer value of the hexadecimal number XY. -1 is returned for bad input. 
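+// For example, convertHexValue("<0x41>") returns 65.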
+func convertHexValue(bv string) int { + bv = strings.TrimPrefix(bv, "<0x") + bv = strings.TrimSuffix(bv, ">") + n, err := strconv.ParseInt(bv, 16, 32) + if err != nil { + return -1 + } + return int(n) +} diff --git a/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go b/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go new file mode 100644 index 000000000000..74bafa991fdf --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go @@ -0,0 +1,87 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +// Command dumper is a debugging utility for internal use. It helps explore +// the model proto and compare results with other tools. + +import ( + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "unicode" + + "cloud.google.com/go/vertexai/internal/sentencepiece" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/model" + "google.golang.org/protobuf/encoding/prototext" + "google.golang.org/protobuf/proto" +) + +func main() { + fDumpAll := flag.Bool("dumpall", false, "dump entire model proto") + fFindUni := flag.Bool("finduni", false, "find unicode runes not in pieces") + fEncodeFile := flag.String("encodefile", "", "file name to open and encode") + flag.Parse() + + modelPath := os.Getenv("MODELPATH") + if modelPath == "" { + log.Fatal("Need MODELPATH env var to run") + } + + b, err := ioutil.ReadFile(modelPath) + if err != nil { + log.Fatal(err) + } + + var model model.ModelProto + err = proto.Unmarshal(b, &model) + if err != nil { + log.Fatal(err) + } + + if *fDumpAll { + fmt.Println(prototext.Format(&model)) + } else if *fFindUni { + pieces := make(map[string]int) + for i, piece := range model.GetPieces() { + pieces[piece.GetPiece()] = i + } + + for r := rune(0); r <= unicode.MaxRune; r++ { + if unicode.IsPrint(r) { + if _, found := pieces[string(r)]; !found { + fmt.Printf("not in pieces: %U %q\n", r, string(r)) + } + } + } + } else if *fEncodeFile != "" { + enc, err := sentencepiece.NewEncoderFromPath(modelPath) + if err != nil { + log.Fatal(err) + } + + b, err := ioutil.ReadFile(*fEncodeFile) + if err != nil { + log.Fatal(err) + } + + tokens := enc.Encode(string(b)) + for _, t := range tokens { + fmt.Println(t.ID) + } + } +} diff --git a/vertexai/internal/sentencepiece/internal/model/gen.sh b/vertexai/internal/sentencepiece/internal/model/gen.sh new file mode 100755 index 000000000000..7669a851a88a --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/model/gen.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -o pipefail +set -eux + +protoc \ + --go_out=. \ + --go_opt="Msentencepiece_model.proto=;model" sentencepiece_model.proto + +goimports -w . 
+ diff --git a/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go new file mode 100644 index 000000000000..127f6f23da48 --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go @@ -0,0 +1,1556 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.25.0-devel +// protoc v3.14.0 +// source: sentencepiece_model.proto + +package model + +import ( + reflect "reflect" + sync "sync" + + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoiface "google.golang.org/protobuf/runtime/protoiface" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// Model type. only have UNIGRAM now. +type TrainerSpec_ModelType int32 + +const ( + TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm + TrainerSpec_BPE TrainerSpec_ModelType = 2 // Byte Pair Encoding + TrainerSpec_WORD TrainerSpec_ModelType = 3 // Delimitered by whitespace. + TrainerSpec_CHAR TrainerSpec_ModelType = 4 // tokenizes into character sequence +) + +// Enum value maps for TrainerSpec_ModelType. +var ( + TrainerSpec_ModelType_name = map[int32]string{ + 1: "UNIGRAM", + 2: "BPE", + 3: "WORD", + 4: "CHAR", + } + TrainerSpec_ModelType_value = map[string]int32{ + "UNIGRAM": 1, + "BPE": 2, + "WORD": 3, + "CHAR": 4, + } +) + +func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType { + p := new(TrainerSpec_ModelType) + *p = x + return p +} + +func (x TrainerSpec_ModelType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[0].Descriptor() +} + +func (TrainerSpec_ModelType) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[0] +} + +func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. 
+func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = TrainerSpec_ModelType(num) + return nil +} + +// Deprecated: Use TrainerSpec_ModelType.Descriptor instead. +func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0, 0} +} + +type ModelProto_SentencePiece_Type int32 + +const ( + ModelProto_SentencePiece_NORMAL ModelProto_SentencePiece_Type = 1 // normal symbol + ModelProto_SentencePiece_UNKNOWN ModelProto_SentencePiece_Type = 2 // unknown symbol. only for now. + ModelProto_SentencePiece_CONTROL ModelProto_SentencePiece_Type = 3 // control symbols. , , <2ja> etc. + ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols. + // Typical usage of USER_DEFINED symbol + // is placeholder. + ModelProto_SentencePiece_BYTE ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true. + ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used. +) + +// Enum value maps for ModelProto_SentencePiece_Type. +var ( + ModelProto_SentencePiece_Type_name = map[int32]string{ + 1: "NORMAL", + 2: "UNKNOWN", + 3: "CONTROL", + 4: "USER_DEFINED", + 6: "BYTE", + 5: "UNUSED", + } + ModelProto_SentencePiece_Type_value = map[string]int32{ + "NORMAL": 1, + "UNKNOWN": 2, + "CONTROL": 3, + "USER_DEFINED": 4, + "BYTE": 6, + "UNUSED": 5, + } +) + +func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type { + p := new(ModelProto_SentencePiece_Type) + *p = x + return p +} + +func (x ModelProto_SentencePiece_Type) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[1].Descriptor() +} + +func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[1] +} + +func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = ModelProto_SentencePiece_Type(num) + return nil +} + +// Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead. +func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0, 0} +} + +// TrainerSpec encodes a various parameters for SentencePiece training. +// Next id: 55 +type TrainerSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + /////////////////////////////////////////////////////////////////// + // General parameters + // + // Input corpus files. + // Trainer accepts the following two formats: + // A) Monolingual: plain text, one sentence per line. + // B) Bilingual: TSV, source sentence target sentence + // When bilingual data is passed, shared vocabulary model is built. + // Note that the input file must be raw corpus, not a preprocessed corpus. + // Trainer only loads the first `input_sentence_size` sentences specified + // with this parameter. 
+ Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"` + // Input corpus format: + // "text": one-sentence-per-line text format (default) + // "tsv": sentence freq + InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"` + // Output model file prefix. + // .model and .vocab are generated. + ModelPrefix *string `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"` + ModelType *TrainerSpec_ModelType `protobuf:"varint,3,opt,name=model_type,json=modelType,enum=sentencepiece.TrainerSpec_ModelType,def=1" json:"model_type,omitempty"` + // Vocabulary size. 8k is the default size. + VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"` + // List of the languages this model can accept. + // Since the model is language-agnostic, this field is used as a reference. + AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"` + // Size of self-test samples, which are encoded in the model file. + SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"` + // Whether to use DP version of sentencepiece. Use it with TSV input format + // (requires precomputed word tab counts to work). + EnableDifferentialPrivacy *bool `protobuf:"varint,50,opt,name=enable_differential_privacy,json=enableDifferentialPrivacy,def=0" json:"enable_differential_privacy,omitempty"` + // Set these parameters if you need DP version of sentencepiece. + // std of noise to add. + DifferentialPrivacyNoiseLevel *float32 `protobuf:"fixed32,51,opt,name=differential_privacy_noise_level,json=differentialPrivacyNoiseLevel,def=0" json:"differential_privacy_noise_level,omitempty"` + // Clipping threshold to apply after adding noise. All the words with + // frequency less than this value are dropped. + DifferentialPrivacyClippingThreshold *uint64 `protobuf:"varint,52,opt,name=differential_privacy_clipping_threshold,json=differentialPrivacyClippingThreshold,def=0" json:"differential_privacy_clipping_threshold,omitempty"` + /////////////////////////////////////////////////////////////////// + // Training parameters. + // + // Uses characters which cover the corpus with the ratio of `chars_coverage`. + // This parameter determines the set of basic Alphabet of sentence piece. + // 1.0 - `chars_coverage` characters are treated as UNK. + // See also required_chars field. + CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"` + // Maximum size of sentences the trainer loads from `input` parameter. + // Trainer simply loads the `input` files in sequence. + // It is better to shuffle the input corpus randomly. + InputSentenceSize *uint64 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"` + ShuffleInputSentence *bool `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"` + // Maximum size of sentences to make seed sentence pieces. + // Extended suffix array is constructed to extract frequent + // sub-strings from the corpus. This uses 20N working space, + // where N is the size of corpus. + // + // Deprecated: Do not use. 
+ MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"` + // Maximum size of sentences to train sentence pieces. + // + // Deprecated: Do not use. + TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"` + // The size of seed sentencepieces. + // `seed_sentencepiece_size` must be larger than `vocab_size`. + SeedSentencepieceSize *int32 `protobuf:"varint,14,opt,name=seed_sentencepiece_size,json=seedSentencepieceSize,def=1000000" json:"seed_sentencepiece_size,omitempty"` + // In every EM sub-iterations, keeps top + // `shrinking_factor` * `current sentencepieces size` with respect to + // the loss of the sentence piece. This value should be smaller than 1.0. + ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"` + // The maximum sentence length in byte. The sentences with the length + // larger than `max_sentence_length` is simply ignored. + // Longer input tends to bring the following risks: + // * Overflow during EM training (unigram language model only) + // * Performance drop because of O(n log n) cost in BPE. + MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"` + // Number of threads in the training. + NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"` + // Number of EM sub iterations. + NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"` + /////////////////////////////////////////////////////////////////// + // SentencePiece parameters which control the shapes of sentence piece. + // + // Maximum length of sentencepiece. + MaxSentencepieceLength *int32 `protobuf:"varint,20,opt,name=max_sentencepiece_length,json=maxSentencepieceLength,def=16" json:"max_sentencepiece_length,omitempty"` + // Uses Unicode script to split sentence pieces. + // When `split_by_unicode_script` is true, we do not allow sentence piece to + // include multiple Unicode scripts, e.g. "F1" is not a valid piece. + // Exception: CJ characters (Hiragana/Katakana/Han) are all handled + // as one script type, since Japanese word can consist of multiple scripts. + // This exception is always applied regardless of the accept-language + // parameter. + SplitByUnicodeScript *bool `protobuf:"varint,21,opt,name=split_by_unicode_script,json=splitByUnicodeScript,def=1" json:"split_by_unicode_script,omitempty"` + // When `split_by_number` is true, put a boundary between number and + // non-number transition. If we want to treat "F1" is one token, set this flag + // to be false. + SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"` + // Use a white space to split sentence pieces. + // When `split_by_whitespace` is false, we may have the piece containing + // a white space in the middle. e.g., "in_the". + SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"` + // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => + // hello_. 
When `treat_whitespace_as_suffix` is true, + // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end + // of sentence. + TreatWhitespaceAsSuffix *bool `protobuf:"varint,24,opt,name=treat_whitespace_as_suffix,json=treatWhitespaceAsSuffix,def=0" json:"treat_whitespace_as_suffix,omitempty"` + // Allows pieces that only contain whitespaces instead of appearing only as + // prefix or suffix of other pieces. + AllowWhitespaceOnlyPieces *bool `protobuf:"varint,26,opt,name=allow_whitespace_only_pieces,json=allowWhitespaceOnlyPieces,def=0" json:"allow_whitespace_only_pieces,omitempty"` + // Split all digits (0-9) into separate pieces. + SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"` + // Defines the pre-tokenization delimiter. + // When specified, no pieces crossing this delimiter is not included + // in the vocab. Then the delimiter string is virtually ignored + // during the training. This field can allows constraints on the vocabulary + // selection. Note that this field is available on unigram mode. + PretokenizationDelimiter *string `protobuf:"bytes,53,opt,name=pretokenization_delimiter,json=pretokenizationDelimiter,def=" json:"pretokenization_delimiter,omitempty"` + /////////////////////////////////////////////////////////////////// + // Vocabulary management + // + // Defines control symbols used as an indicator to + // change the behavior of the decoder. and are pre-defined. + // We can use this field to encode various meta information, + // including language indicator in multilingual model. + // These symbols are not visible to users, but visible to + // the decoder. Note that when the input sentence contains control symbols, + // they are not treated as one token, but segmented into normal pieces. + // Control symbols must be inserted independently from the segmentation. + ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"` + // Defines user defined symbols. + // These symbols are added with extremely high score + // so they are always treated as one unique symbol in any context. + // Typical usage of user_defined_symbols is placeholder for named entities. + UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"` + // Defines required characters. Each UTF8 character in this string is included + // in the character set regardless of character_coverage value. Unlike + // user_defined_symbols, these characters have scores based on the frequency + // on input sentences, and the model can form subwords using characters + // in this field. + RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"` + // Decomposes unknown pieces into UTF-8 bytes. + ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"` + // When creating the vocabulary file, defines whether or not to additionally + // output the score for each piece. + VocabularyOutputPieceScore *bool `protobuf:"varint,32,opt,name=vocabulary_output_piece_score,json=vocabularyOutputPieceScore,def=1" json:"vocabulary_output_piece_score,omitempty"` + // `vocab_size` is treated as hard limit. Crash if + // the model can not produce the vocab of size `vocab_size`, + // When `hard_vocab_limit` is false, vocab_size is treated + // as soft limit. 
Note that when model_type=char, + // always assumes hard_vocab_limit = false. + HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"` + // use all symbols for vocab extraction. This flag is valid + // if model type is either CHAR or WORD + UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"` + /////////////////////////////////////////////////////////////////// + // Reserved special meta tokens. + // * -1 is not used. + // * unk_id must not be -1. + // Id must starts with 0 and be contigous. + UnkId *int32 `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"` // + BosId *int32 `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"` // + EosId *int32 `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"` // + PadId *int32 `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // (padding) + UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=" json:"unk_piece,omitempty"` + BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=" json:"bos_piece,omitempty"` + EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=" json:"eos_piece,omitempty"` + PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=" json:"pad_piece,omitempty"` + // Encodes into U+2047 (DOUBLE QUESTION MARK), + // since this character can be useful both for user and + // developer. We can easily figure out that is emitted. + UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"` + // Increase bit depth to allow unigram model training on large + // (>10M sentences) corpora. A Side-effect of enabling this flag + // is increased memory usage. + TrainExtremelyLargeCorpus *bool `protobuf:"varint,49,opt,name=train_extremely_large_corpus,json=trainExtremelyLargeCorpus,def=0" json:"train_extremely_large_corpus,omitempty"` + // Path to a seed sentencepieces file, with one tab-separated + // seed sentencepiece frequency per line. + SeedSentencepiecesFile *string `protobuf:"bytes,54,opt,name=seed_sentencepieces_file,json=seedSentencepiecesFile,def=" json:"seed_sentencepieces_file,omitempty"` +} + +// Default values for TrainerSpec fields. 
+const ( + Default_TrainerSpec_ModelType = TrainerSpec_UNIGRAM + Default_TrainerSpec_VocabSize = int32(8000) + Default_TrainerSpec_SelfTestSampleSize = int32(0) + Default_TrainerSpec_EnableDifferentialPrivacy = bool(false) + Default_TrainerSpec_DifferentialPrivacyNoiseLevel = float32(0) + Default_TrainerSpec_DifferentialPrivacyClippingThreshold = uint64(0) + Default_TrainerSpec_CharacterCoverage = float32(0.9994999766349792) + Default_TrainerSpec_InputSentenceSize = uint64(0) + Default_TrainerSpec_ShuffleInputSentence = bool(true) + Default_TrainerSpec_SeedSentencepieceSize = int32(1000000) + Default_TrainerSpec_ShrinkingFactor = float32(0.75) + Default_TrainerSpec_MaxSentenceLength = int32(4192) + Default_TrainerSpec_NumThreads = int32(16) + Default_TrainerSpec_NumSubIterations = int32(2) + Default_TrainerSpec_MaxSentencepieceLength = int32(16) + Default_TrainerSpec_SplitByUnicodeScript = bool(true) + Default_TrainerSpec_SplitByNumber = bool(true) + Default_TrainerSpec_SplitByWhitespace = bool(true) + Default_TrainerSpec_TreatWhitespaceAsSuffix = bool(false) + Default_TrainerSpec_AllowWhitespaceOnlyPieces = bool(false) + Default_TrainerSpec_SplitDigits = bool(false) + Default_TrainerSpec_PretokenizationDelimiter = string("") + Default_TrainerSpec_ByteFallback = bool(false) + Default_TrainerSpec_VocabularyOutputPieceScore = bool(true) + Default_TrainerSpec_HardVocabLimit = bool(true) + Default_TrainerSpec_UseAllVocab = bool(false) + Default_TrainerSpec_UnkId = int32(0) + Default_TrainerSpec_BosId = int32(1) + Default_TrainerSpec_EosId = int32(2) + Default_TrainerSpec_PadId = int32(-1) + Default_TrainerSpec_UnkPiece = string("") + Default_TrainerSpec_BosPiece = string("") + Default_TrainerSpec_EosPiece = string("") + Default_TrainerSpec_PadPiece = string("") + Default_TrainerSpec_UnkSurface = string(" ⁇ ") + Default_TrainerSpec_TrainExtremelyLargeCorpus = bool(false) + Default_TrainerSpec_SeedSentencepiecesFile = string("") +) + +func (x *TrainerSpec) Reset() { + *x = TrainerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TrainerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TrainerSpec) ProtoMessage() {} + +func (x *TrainerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead. +func (*TrainerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0} +} + +var extRange_TrainerSpec = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use TrainerSpec.ProtoReflect.Descriptor.ExtensionRanges instead. 
+func (*TrainerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_TrainerSpec +} + +func (x *TrainerSpec) GetInput() []string { + if x != nil { + return x.Input + } + return nil +} + +func (x *TrainerSpec) GetInputFormat() string { + if x != nil && x.InputFormat != nil { + return *x.InputFormat + } + return "" +} + +func (x *TrainerSpec) GetModelPrefix() string { + if x != nil && x.ModelPrefix != nil { + return *x.ModelPrefix + } + return "" +} + +func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType { + if x != nil && x.ModelType != nil { + return *x.ModelType + } + return Default_TrainerSpec_ModelType +} + +func (x *TrainerSpec) GetVocabSize() int32 { + if x != nil && x.VocabSize != nil { + return *x.VocabSize + } + return Default_TrainerSpec_VocabSize +} + +func (x *TrainerSpec) GetAcceptLanguage() []string { + if x != nil { + return x.AcceptLanguage + } + return nil +} + +func (x *TrainerSpec) GetSelfTestSampleSize() int32 { + if x != nil && x.SelfTestSampleSize != nil { + return *x.SelfTestSampleSize + } + return Default_TrainerSpec_SelfTestSampleSize +} + +func (x *TrainerSpec) GetEnableDifferentialPrivacy() bool { + if x != nil && x.EnableDifferentialPrivacy != nil { + return *x.EnableDifferentialPrivacy + } + return Default_TrainerSpec_EnableDifferentialPrivacy +} + +func (x *TrainerSpec) GetDifferentialPrivacyNoiseLevel() float32 { + if x != nil && x.DifferentialPrivacyNoiseLevel != nil { + return *x.DifferentialPrivacyNoiseLevel + } + return Default_TrainerSpec_DifferentialPrivacyNoiseLevel +} + +func (x *TrainerSpec) GetDifferentialPrivacyClippingThreshold() uint64 { + if x != nil && x.DifferentialPrivacyClippingThreshold != nil { + return *x.DifferentialPrivacyClippingThreshold + } + return Default_TrainerSpec_DifferentialPrivacyClippingThreshold +} + +func (x *TrainerSpec) GetCharacterCoverage() float32 { + if x != nil && x.CharacterCoverage != nil { + return *x.CharacterCoverage + } + return Default_TrainerSpec_CharacterCoverage +} + +func (x *TrainerSpec) GetInputSentenceSize() uint64 { + if x != nil && x.InputSentenceSize != nil { + return *x.InputSentenceSize + } + return Default_TrainerSpec_InputSentenceSize +} + +func (x *TrainerSpec) GetShuffleInputSentence() bool { + if x != nil && x.ShuffleInputSentence != nil { + return *x.ShuffleInputSentence + } + return Default_TrainerSpec_ShuffleInputSentence +} + +// Deprecated: Do not use. +func (x *TrainerSpec) GetMiningSentenceSize() int32 { + if x != nil && x.MiningSentenceSize != nil { + return *x.MiningSentenceSize + } + return 0 +} + +// Deprecated: Do not use. 
+func (x *TrainerSpec) GetTrainingSentenceSize() int32 { + if x != nil && x.TrainingSentenceSize != nil { + return *x.TrainingSentenceSize + } + return 0 +} + +func (x *TrainerSpec) GetSeedSentencepieceSize() int32 { + if x != nil && x.SeedSentencepieceSize != nil { + return *x.SeedSentencepieceSize + } + return Default_TrainerSpec_SeedSentencepieceSize +} + +func (x *TrainerSpec) GetShrinkingFactor() float32 { + if x != nil && x.ShrinkingFactor != nil { + return *x.ShrinkingFactor + } + return Default_TrainerSpec_ShrinkingFactor +} + +func (x *TrainerSpec) GetMaxSentenceLength() int32 { + if x != nil && x.MaxSentenceLength != nil { + return *x.MaxSentenceLength + } + return Default_TrainerSpec_MaxSentenceLength +} + +func (x *TrainerSpec) GetNumThreads() int32 { + if x != nil && x.NumThreads != nil { + return *x.NumThreads + } + return Default_TrainerSpec_NumThreads +} + +func (x *TrainerSpec) GetNumSubIterations() int32 { + if x != nil && x.NumSubIterations != nil { + return *x.NumSubIterations + } + return Default_TrainerSpec_NumSubIterations +} + +func (x *TrainerSpec) GetMaxSentencepieceLength() int32 { + if x != nil && x.MaxSentencepieceLength != nil { + return *x.MaxSentencepieceLength + } + return Default_TrainerSpec_MaxSentencepieceLength +} + +func (x *TrainerSpec) GetSplitByUnicodeScript() bool { + if x != nil && x.SplitByUnicodeScript != nil { + return *x.SplitByUnicodeScript + } + return Default_TrainerSpec_SplitByUnicodeScript +} + +func (x *TrainerSpec) GetSplitByNumber() bool { + if x != nil && x.SplitByNumber != nil { + return *x.SplitByNumber + } + return Default_TrainerSpec_SplitByNumber +} + +func (x *TrainerSpec) GetSplitByWhitespace() bool { + if x != nil && x.SplitByWhitespace != nil { + return *x.SplitByWhitespace + } + return Default_TrainerSpec_SplitByWhitespace +} + +func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool { + if x != nil && x.TreatWhitespaceAsSuffix != nil { + return *x.TreatWhitespaceAsSuffix + } + return Default_TrainerSpec_TreatWhitespaceAsSuffix +} + +func (x *TrainerSpec) GetAllowWhitespaceOnlyPieces() bool { + if x != nil && x.AllowWhitespaceOnlyPieces != nil { + return *x.AllowWhitespaceOnlyPieces + } + return Default_TrainerSpec_AllowWhitespaceOnlyPieces +} + +func (x *TrainerSpec) GetSplitDigits() bool { + if x != nil && x.SplitDigits != nil { + return *x.SplitDigits + } + return Default_TrainerSpec_SplitDigits +} + +func (x *TrainerSpec) GetPretokenizationDelimiter() string { + if x != nil && x.PretokenizationDelimiter != nil { + return *x.PretokenizationDelimiter + } + return Default_TrainerSpec_PretokenizationDelimiter +} + +func (x *TrainerSpec) GetControlSymbols() []string { + if x != nil { + return x.ControlSymbols + } + return nil +} + +func (x *TrainerSpec) GetUserDefinedSymbols() []string { + if x != nil { + return x.UserDefinedSymbols + } + return nil +} + +func (x *TrainerSpec) GetRequiredChars() string { + if x != nil && x.RequiredChars != nil { + return *x.RequiredChars + } + return "" +} + +func (x *TrainerSpec) GetByteFallback() bool { + if x != nil && x.ByteFallback != nil { + return *x.ByteFallback + } + return Default_TrainerSpec_ByteFallback +} + +func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool { + if x != nil && x.VocabularyOutputPieceScore != nil { + return *x.VocabularyOutputPieceScore + } + return Default_TrainerSpec_VocabularyOutputPieceScore +} + +func (x *TrainerSpec) GetHardVocabLimit() bool { + if x != nil && x.HardVocabLimit != nil { + return *x.HardVocabLimit + } + return 
Default_TrainerSpec_HardVocabLimit
+}
+
+func (x *TrainerSpec) GetUseAllVocab() bool {
+	if x != nil && x.UseAllVocab != nil {
+		return *x.UseAllVocab
+	}
+	return Default_TrainerSpec_UseAllVocab
+}
+
+func (x *TrainerSpec) GetUnkId() int32 {
+	if x != nil && x.UnkId != nil {
+		return *x.UnkId
+	}
+	return Default_TrainerSpec_UnkId
+}
+
+func (x *TrainerSpec) GetBosId() int32 {
+	if x != nil && x.BosId != nil {
+		return *x.BosId
+	}
+	return Default_TrainerSpec_BosId
+}
+
+func (x *TrainerSpec) GetEosId() int32 {
+	if x != nil && x.EosId != nil {
+		return *x.EosId
+	}
+	return Default_TrainerSpec_EosId
+}
+
+func (x *TrainerSpec) GetPadId() int32 {
+	if x != nil && x.PadId != nil {
+		return *x.PadId
+	}
+	return Default_TrainerSpec_PadId
+}
+
+func (x *TrainerSpec) GetUnkPiece() string {
+	if x != nil && x.UnkPiece != nil {
+		return *x.UnkPiece
+	}
+	return Default_TrainerSpec_UnkPiece
+}
+
+func (x *TrainerSpec) GetBosPiece() string {
+	if x != nil && x.BosPiece != nil {
+		return *x.BosPiece
+	}
+	return Default_TrainerSpec_BosPiece
+}
+
+func (x *TrainerSpec) GetEosPiece() string {
+	if x != nil && x.EosPiece != nil {
+		return *x.EosPiece
+	}
+	return Default_TrainerSpec_EosPiece
+}
+
+func (x *TrainerSpec) GetPadPiece() string {
+	if x != nil && x.PadPiece != nil {
+		return *x.PadPiece
+	}
+	return Default_TrainerSpec_PadPiece
+}
+
+func (x *TrainerSpec) GetUnkSurface() string {
+	if x != nil && x.UnkSurface != nil {
+		return *x.UnkSurface
+	}
+	return Default_TrainerSpec_UnkSurface
+}
+
+func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool {
+	if x != nil && x.TrainExtremelyLargeCorpus != nil {
+		return *x.TrainExtremelyLargeCorpus
+	}
+	return Default_TrainerSpec_TrainExtremelyLargeCorpus
+}
+
+func (x *TrainerSpec) GetSeedSentencepiecesFile() string {
+	if x != nil && x.SeedSentencepiecesFile != nil {
+		return *x.SeedSentencepiecesFile
+	}
+	return Default_TrainerSpec_SeedSentencepiecesFile
+}
+
+// NormalizerSpec encodes various parameters for string normalization.
+type NormalizerSpec struct {
+	state           protoimpl.MessageState
+	sizeCache       protoimpl.SizeCache
+	unknownFields   protoimpl.UnknownFields
+	extensionFields protoimpl.ExtensionFields
+
+	// Name of the normalization rule.
+	Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
+	// Pre-compiled normalization rule created by the
+	// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+	// Usually this field is set by the Builder::GetNormalizerSpec() method.
+	PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"`
+	// Adds dummy whitespace at the beginning of text in order to
+	// treat "world" in "world" and "hello world" in the same way.
+	AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"`
+	// Removes leading, trailing, and duplicate internal whitespace.
+	RemoveExtraWhitespaces *bool `protobuf:"varint,4,opt,name=remove_extra_whitespaces,json=removeExtraWhitespaces,def=1" json:"remove_extra_whitespaces,omitempty"`
+	// Replaces whitespace with the meta symbol.
+	// This field must be true to train a sentence piece model.
+	EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"`
+	// Custom normalization rule file in TSV format.
+ // https://github.com/google/sentencepiece/blob/master/doc/normalization.md + // This field is only used in SentencePieceTrainer::Train() method, which + // compiles the rule into the binary rule stored in `precompiled_charsmap`. + NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"` +} + +// Default values for NormalizerSpec fields. +const ( + Default_NormalizerSpec_AddDummyPrefix = bool(true) + Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true) + Default_NormalizerSpec_EscapeWhitespaces = bool(true) +) + +func (x *NormalizerSpec) Reset() { + *x = NormalizerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *NormalizerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NormalizerSpec) ProtoMessage() {} + +func (x *NormalizerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead. +func (*NormalizerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{1} +} + +var extRange_NormalizerSpec = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*NormalizerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_NormalizerSpec +} + +func (x *NormalizerSpec) GetName() string { + if x != nil && x.Name != nil { + return *x.Name + } + return "" +} + +func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte { + if x != nil { + return x.PrecompiledCharsmap + } + return nil +} + +func (x *NormalizerSpec) GetAddDummyPrefix() bool { + if x != nil && x.AddDummyPrefix != nil { + return *x.AddDummyPrefix + } + return Default_NormalizerSpec_AddDummyPrefix +} + +func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool { + if x != nil && x.RemoveExtraWhitespaces != nil { + return *x.RemoveExtraWhitespaces + } + return Default_NormalizerSpec_RemoveExtraWhitespaces +} + +func (x *NormalizerSpec) GetEscapeWhitespaces() bool { + if x != nil && x.EscapeWhitespaces != nil { + return *x.EscapeWhitespaces + } + return Default_NormalizerSpec_EscapeWhitespaces +} + +func (x *NormalizerSpec) GetNormalizationRuleTsv() string { + if x != nil && x.NormalizationRuleTsv != nil { + return *x.NormalizationRuleTsv + } + return "" +} + +// Proto to store samples for self-testing. 
+type SelfTestData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"` +} + +func (x *SelfTestData) Reset() { + *x = SelfTestData{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData) ProtoMessage() {} + +func (x *SelfTestData) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead. +func (*SelfTestData) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2} +} + +var extRange_SelfTestData = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use SelfTestData.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*SelfTestData) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_SelfTestData +} + +func (x *SelfTestData) GetSamples() []*SelfTestData_Sample { + if x != nil { + return x.Samples + } + return nil +} + +// ModelProto stores model parameters. +// SentencePieceProcessor is supposed to be self-contained. +// All settings/parameters which may change the behavior must be encoded +// in ModelProto. +type ModelProto struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + // Sentence pieces with scores. + Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"` + // Spec used to generate this model file. + TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"` + // Spec for text normalization. + NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"` + // Stores sample input and its expected segmentation to verify the model. + SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"` + // Spec for text de-normalization. + DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"` +} + +func (x *ModelProto) Reset() { + *x = ModelProto{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto) ProtoMessage() {} + +func (x *ModelProto) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto.ProtoReflect.Descriptor instead. 
+func (*ModelProto) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3} +} + +var extRange_ModelProto = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use ModelProto.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*ModelProto) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_ModelProto +} + +func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece { + if x != nil { + return x.Pieces + } + return nil +} + +func (x *ModelProto) GetTrainerSpec() *TrainerSpec { + if x != nil { + return x.TrainerSpec + } + return nil +} + +func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec { + if x != nil { + return x.NormalizerSpec + } + return nil +} + +func (x *ModelProto) GetSelfTestData() *SelfTestData { + if x != nil { + return x.SelfTestData + } + return nil +} + +func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec { + if x != nil { + return x.DenormalizerSpec + } + return nil +} + +type SelfTestData_Sample struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Input *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"` + Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"` +} + +func (x *SelfTestData_Sample) Reset() { + *x = SelfTestData_Sample{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData_Sample) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData_Sample) ProtoMessage() {} + +func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead. +func (*SelfTestData_Sample) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2, 0} +} + +func (x *SelfTestData_Sample) GetInput() string { + if x != nil && x.Input != nil { + return *x.Input + } + return "" +} + +func (x *SelfTestData_Sample) GetExpected() string { + if x != nil && x.Expected != nil { + return *x.Expected + } + return "" +} + +type ModelProto_SentencePiece struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Piece *string `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty. + Score *float32 `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"` + Type *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"` +} + +// Default values for ModelProto_SentencePiece fields. 
+const ( + Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL +) + +func (x *ModelProto_SentencePiece) Reset() { + *x = ModelProto_SentencePiece{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto_SentencePiece) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto_SentencePiece) ProtoMessage() {} + +func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead. +func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0} +} + +var extRange_ModelProto_SentencePiece = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*ModelProto_SentencePiece) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_ModelProto_SentencePiece +} + +func (x *ModelProto_SentencePiece) GetPiece() string { + if x != nil && x.Piece != nil { + return *x.Piece + } + return "" +} + +func (x *ModelProto_SentencePiece) GetScore() float32 { + if x != nil && x.Score != nil { + return *x.Score + } + return 0 +} + +func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type { + if x != nil && x.Type != nil { + return *x.Type + } + return Default_ModelProto_SentencePiece_Type +} + +var File_sentencepiece_model_proto protoreflect.FileDescriptor + +var file_sentencepiece_model_proto_rawDesc = []byte{ + 0x0a, 0x19, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, + 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x22, 0xc6, 0x12, 0x0a, 0x0b, 0x54, + 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x12, 0x21, 0x0a, 0x0c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, + 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x46, 0x6f, 0x72, + 0x6d, 0x61, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x70, 0x72, 0x65, + 0x66, 0x69, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x4c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, + 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x24, 0x2e, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, + 0x3a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 0x4d, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x0a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x73, 0x69, + 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x38, 0x30, 0x30, 0x30, 0x52, 0x09, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x53, 
0x69, 0x7a, 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x61, 0x63, 0x63, + 0x65, 0x70, 0x74, 0x5f, 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x18, 0x05, 0x20, 0x03, + 0x28, 0x09, 0x52, 0x0e, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x4c, 0x61, 0x6e, 0x67, 0x75, 0x61, + 0x67, 0x65, 0x12, 0x34, 0x0a, 0x15, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, + 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, + 0x05, 0x3a, 0x01, 0x30, 0x52, 0x12, 0x73, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x53, 0x61, + 0x6d, 0x70, 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x45, 0x0a, 0x1b, 0x65, 0x6e, 0x61, 0x62, + 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x18, 0x32, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, + 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x44, 0x69, 0x66, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x12, + 0x4a, 0x0a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 0x6e, 0x6f, 0x69, 0x73, 0x65, 0x5f, 0x6c, 0x65, + 0x76, 0x65, 0x6c, 0x18, 0x33, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x01, 0x30, 0x52, 0x1d, 0x64, 0x69, + 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, + 0x79, 0x4e, 0x6f, 0x69, 0x73, 0x65, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x58, 0x0a, 0x27, 0x64, + 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, + 0x61, 0x63, 0x79, 0x5f, 0x63, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x5f, 0x74, 0x68, 0x72, + 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x18, 0x34, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, + 0x24, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, + 0x76, 0x61, 0x63, 0x79, 0x43, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, + 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x12, 0x35, 0x0a, 0x12, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, + 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x18, 0x0a, 0x20, 0x01, 0x28, + 0x02, 0x3a, 0x06, 0x30, 0x2e, 0x39, 0x39, 0x39, 0x35, 0x52, 0x11, 0x63, 0x68, 0x61, 0x72, 0x61, + 0x63, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x12, 0x31, 0x0a, 0x13, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, 0x11, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, + 0x3a, 0x0a, 0x16, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x18, 0x13, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x49, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x12, 0x34, 0x0a, 0x14, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x12, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, + 0x65, 0x12, 0x38, 0x0a, 0x16, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 
0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, + 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x14, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, + 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x3f, 0x0a, 0x17, 0x73, + 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x07, 0x31, 0x30, + 0x30, 0x30, 0x30, 0x30, 0x30, 0x52, 0x15, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x2f, 0x0a, 0x10, + 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x5f, 0x66, 0x61, 0x63, 0x74, 0x6f, 0x72, + 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x04, 0x30, 0x2e, 0x37, 0x35, 0x52, 0x0f, 0x73, 0x68, + 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x46, 0x61, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x34, 0x0a, + 0x13, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x18, 0x12, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x34, 0x31, 0x39, 0x32, + 0x52, 0x11, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x4c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x12, 0x23, 0x0a, 0x0b, 0x6e, 0x75, 0x6d, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x18, 0x10, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, 0x0a, 0x6e, 0x75, + 0x6d, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12, 0x2f, 0x0a, 0x12, 0x6e, 0x75, 0x6d, 0x5f, + 0x73, 0x75, 0x62, 0x5f, 0x69, 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x11, + 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x10, 0x6e, 0x75, 0x6d, 0x53, 0x75, 0x62, 0x49, + 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x3c, 0x0a, 0x18, 0x6d, 0x61, 0x78, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x14, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, + 0x16, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 0x3b, 0x0a, 0x17, 0x73, 0x70, 0x6c, 0x69, 0x74, + 0x5f, 0x62, 0x79, 0x5f, 0x75, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, + 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x55, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x53, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x12, 0x2c, 0x0a, 0x0f, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, + 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x17, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, + 0x72, 0x75, 0x65, 0x52, 0x0d, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x4e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x77, + 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x57, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x42, 0x0a, 0x1a, 0x74, 0x72, 0x65, 0x61, + 0x74, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x61, 0x73, 0x5f, + 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x18, 0x18, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, + 0x6c, 0x73, 0x65, 0x52, 0x17, 0x74, 0x72, 0x65, 0x61, 0x74, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, + 0x70, 0x61, 0x63, 0x65, 0x41, 0x73, 0x53, 0x75, 0x66, 0x66, 0x69, 0x78, 0x12, 0x46, 
0x0a, 0x1c, + 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, + 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x18, 0x1a, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x61, 0x6c, 0x6c, 0x6f, 0x77, + 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x4f, 0x6e, 0x6c, 0x79, 0x50, 0x69, + 0x65, 0x63, 0x65, 0x73, 0x12, 0x28, 0x0a, 0x0c, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x64, 0x69, + 0x67, 0x69, 0x74, 0x73, 0x18, 0x19, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, + 0x65, 0x52, 0x0b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x69, 0x67, 0x69, 0x74, 0x73, 0x12, 0x3d, + 0x0a, 0x19, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x64, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x18, 0x35, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x00, 0x52, 0x18, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x44, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x12, 0x27, 0x0a, + 0x0f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, + 0x18, 0x1e, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x53, + 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x30, 0x0a, 0x14, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x64, + 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1f, + 0x20, 0x03, 0x28, 0x09, 0x52, 0x12, 0x75, 0x73, 0x65, 0x72, 0x44, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x64, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x72, 0x65, 0x71, 0x75, + 0x69, 0x72, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x0d, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x12, + 0x2a, 0x0a, 0x0d, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x66, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, + 0x18, 0x23, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0c, 0x62, + 0x79, 0x74, 0x65, 0x46, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x12, 0x47, 0x0a, 0x1d, 0x76, + 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 0x79, 0x5f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x18, 0x20, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x1a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, + 0x6c, 0x61, 0x72, 0x79, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x50, 0x69, 0x65, 0x63, 0x65, 0x53, + 0x63, 0x6f, 0x72, 0x65, 0x12, 0x2e, 0x0a, 0x10, 0x68, 0x61, 0x72, 0x64, 0x5f, 0x76, 0x6f, 0x63, + 0x61, 0x62, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x21, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, + 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x68, 0x61, 0x72, 0x64, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4c, + 0x69, 0x6d, 0x69, 0x74, 0x12, 0x29, 0x0a, 0x0d, 0x75, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x5f, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x18, 0x22, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, + 0x73, 0x65, 0x52, 0x0b, 0x75, 0x73, 0x65, 0x41, 0x6c, 0x6c, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x12, + 0x18, 0x0a, 0x06, 0x75, 0x6e, 0x6b, 0x5f, 0x69, 0x64, 0x18, 0x28, 0x20, 0x01, 0x28, 0x05, 0x3a, + 0x01, 0x30, 0x52, 0x05, 0x75, 0x6e, 0x6b, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x62, 0x6f, 0x73, + 0x5f, 0x69, 0x64, 0x18, 0x29, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x31, 0x52, 0x05, 0x62, 0x6f, + 0x73, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x65, 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x2a, 0x20, + 0x01, 0x28, 
0x05, 0x3a, 0x01, 0x32, 0x52, 0x05, 0x65, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x19, 0x0a, + 0x06, 0x70, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 0x2b, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x2d, + 0x31, 0x52, 0x05, 0x70, 0x61, 0x64, 0x49, 0x64, 0x12, 0x22, 0x0a, 0x09, 0x75, 0x6e, 0x6b, 0x5f, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2d, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x75, 0x6e, + 0x6b, 0x3e, 0x52, 0x08, 0x75, 0x6e, 0x6b, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x20, 0x0a, 0x09, + 0x62, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x09, 0x3a, + 0x03, 0x3c, 0x73, 0x3e, 0x52, 0x08, 0x62, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x21, + 0x0a, 0x09, 0x65, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2f, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x04, 0x3c, 0x2f, 0x73, 0x3e, 0x52, 0x08, 0x65, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x22, 0x0a, 0x09, 0x70, 0x61, 0x64, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x30, + 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x70, 0x61, 0x64, 0x3e, 0x52, 0x08, 0x70, 0x61, 0x64, + 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x26, 0x0a, 0x0b, 0x75, 0x6e, 0x6b, 0x5f, 0x73, 0x75, 0x72, + 0x66, 0x61, 0x63, 0x65, 0x18, 0x2c, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x20, 0xe2, 0x81, 0x87, + 0x20, 0x52, 0x0a, 0x75, 0x6e, 0x6b, 0x53, 0x75, 0x72, 0x66, 0x61, 0x63, 0x65, 0x12, 0x46, 0x0a, + 0x1c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, + 0x5f, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x5f, 0x63, 0x6f, 0x72, 0x70, 0x75, 0x73, 0x18, 0x31, 0x20, + 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x74, 0x72, 0x61, 0x69, + 0x6e, 0x45, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x43, + 0x6f, 0x72, 0x70, 0x75, 0x73, 0x12, 0x3a, 0x0a, 0x18, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x5f, 0x66, 0x69, 0x6c, + 0x65, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x00, 0x52, 0x16, 0x73, 0x65, 0x65, 0x64, 0x53, + 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x46, 0x69, 0x6c, + 0x65, 0x22, 0x35, 0x0a, 0x09, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, + 0x0a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 0x4d, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x42, + 0x50, 0x45, 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 0x57, 0x4f, 0x52, 0x44, 0x10, 0x03, 0x12, 0x08, + 0x0a, 0x04, 0x43, 0x48, 0x41, 0x52, 0x10, 0x04, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0xbd, 0x02, 0x0a, 0x0e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x31, 0x0a, 0x14, 0x70, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x6d, + 0x61, 0x70, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x13, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, + 0x70, 0x69, 0x6c, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x6d, 0x61, 0x70, 0x12, 0x2e, 0x0a, + 0x10, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x75, 0x6d, 0x6d, 0x79, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, + 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x61, + 0x64, 0x64, 0x44, 0x75, 0x6d, 0x6d, 0x79, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x3e, 0x0a, + 0x18, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, 0x77, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 
0x63, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x16, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x78, 0x74, + 0x72, 0x61, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x33, 0x0a, + 0x12, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, + 0x11, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x5f, 0x74, 0x73, 0x76, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x14, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x54, 0x73, 0x76, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0x93, 0x01, 0x0a, 0x0c, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, + 0x44, 0x61, 0x74, 0x61, 0x12, 0x3c, 0x0a, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, + 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, + 0x74, 0x61, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x73, 0x1a, 0x3a, 0x0a, 0x06, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x14, 0x0a, 0x05, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2a, 0x09, + 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xd7, 0x04, 0x0a, 0x0a, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x3f, 0x0a, 0x06, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, + 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x52, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x12, 0x3d, 0x0a, 0x0c, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x1a, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, + 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x0b, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x46, 0x0a, 0x0f, 0x6e, 0x6f, 0x72, 0x6d, + 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x52, 0x0e, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x12, 0x41, 0x0a, 0x0e, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x64, 0x61, + 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, + 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0c, 0x73, 0x65, 0x6c, 
0x66, 0x54, 0x65, 0x73, 0x74, 0x44, + 0x61, 0x74, 0x61, 0x12, 0x4a, 0x0a, 0x11, 0x64, 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, + 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, + 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, + 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x10, 0x64, + 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x1a, + 0xe6, 0x01, 0x0a, 0x0d, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x48, 0x0a, + 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x2c, 0x2e, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, + 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, + 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x79, 0x70, 0x65, 0x3a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, + 0x4c, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 0x54, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x55, + 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x4f, 0x4e, 0x54, + 0x52, 0x4f, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 0x0c, 0x55, 0x53, 0x45, 0x52, 0x5f, 0x44, 0x45, + 0x46, 0x49, 0x4e, 0x45, 0x44, 0x10, 0x04, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x59, 0x54, 0x45, 0x10, + 0x06, 0x12, 0x0a, 0x0a, 0x06, 0x55, 0x4e, 0x55, 0x53, 0x45, 0x44, 0x10, 0x05, 0x2a, 0x09, 0x08, + 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x42, 0x02, 0x48, 0x03, +} + +var ( + file_sentencepiece_model_proto_rawDescOnce sync.Once + file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc +) + +func file_sentencepiece_model_proto_rawDescGZIP() []byte { + file_sentencepiece_model_proto_rawDescOnce.Do(func() { + file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData) + }) + return file_sentencepiece_model_proto_rawDescData +} + +var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_sentencepiece_model_proto_goTypes = []interface{}{ + (TrainerSpec_ModelType)(0), // 0: sentencepiece.TrainerSpec.ModelType + (ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type + (*TrainerSpec)(nil), // 2: sentencepiece.TrainerSpec + (*NormalizerSpec)(nil), // 3: sentencepiece.NormalizerSpec + (*SelfTestData)(nil), // 4: sentencepiece.SelfTestData + (*ModelProto)(nil), // 5: sentencepiece.ModelProto + (*SelfTestData_Sample)(nil), // 6: sentencepiece.SelfTestData.Sample + (*ModelProto_SentencePiece)(nil), // 7: sentencepiece.ModelProto.SentencePiece +} +var file_sentencepiece_model_proto_depIdxs = []int32{ + 0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType + 6, // 1: sentencepiece.SelfTestData.samples:type_name -> sentencepiece.SelfTestData.Sample + 7, // 2: sentencepiece.ModelProto.pieces:type_name -> 
sentencepiece.ModelProto.SentencePiece + 2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec + 3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec + 4, // 5: sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData + 3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec + 1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type + 8, // [8:8] is the sub-list for method output_type + 8, // [8:8] is the sub-list for method input_type + 8, // [8:8] is the sub-list for extension type_name + 8, // [8:8] is the sub-list for extension extendee + 0, // [0:8] is the sub-list for field type_name +} + +func init() { file_sentencepiece_model_proto_init() } +func file_sentencepiece_model_proto_init() { + if File_sentencepiece_model_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*TrainerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*NormalizerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData_Sample); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto_SentencePiece); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_sentencepiece_model_proto_rawDesc, + NumEnums: 2, + NumMessages: 6, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_sentencepiece_model_proto_goTypes, + DependencyIndexes: file_sentencepiece_model_proto_depIdxs, + EnumInfos: file_sentencepiece_model_proto_enumTypes, + MessageInfos: file_sentencepiece_model_proto_msgTypes, + }.Build() + File_sentencepiece_model_proto = out.File + file_sentencepiece_model_proto_rawDesc = nil + file_sentencepiece_model_proto_goTypes = nil + file_sentencepiece_model_proto_depIdxs = nil +} diff --git a/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto 
b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
new file mode 100644
index 000000000000..a48f7cc845d8
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
@@ -0,0 +1,332 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+// TODO(taku): Needs to use LITE RUNTIME in OSS release.
+option optimize_for = LITE_RUNTIME;
+
+package sentencepiece;
+
+// TrainerSpec encodes various parameters for SentencePiece training.
+// Next id: 55
+message TrainerSpec {
+  ///////////////////////////////////////////////////////////////////
+  // General parameters
+  //
+  // Input corpus files.
+  // Trainer accepts the following two formats:
+  // A) Monolingual: plain text, one sentence per line.
+  // B) Bilingual: TSV, source sentence <tab> target sentence
+  // When bilingual data is passed, a shared vocabulary model is built.
+  // Note that the input file must be raw corpus, not a preprocessed corpus.
+  // Trainer only loads the first `input_sentence_size` sentences specified
+  // with this parameter.
+  repeated string input = 1;
+
+  // Input corpus format:
+  // "text": one-sentence-per-line text format (default)
+  // "tsv":  sentence <tab> freq
+  optional string input_format = 7;
+
+  // Output model file prefix.
+  // <model_prefix>.model and <model_prefix>.vocab are generated.
+  optional string model_prefix = 2;
+
+  // Model type. UNIGRAM is the default.
+  enum ModelType {
+    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
+    BPE = 2;      // Byte Pair Encoding
+    WORD = 3;     // Delimited by whitespace.
+    CHAR = 4;     // Tokenizes into character sequence.
+  }
+  optional ModelType model_type = 3 [default = UNIGRAM];
+
+  // Vocabulary size. 8k is the default size.
+  optional int32 vocab_size = 4 [default = 8000];
+
+  // List of the languages this model can accept.
+  // Since the model is language-agnostic, this field is used as a reference.
+  repeated string accept_language = 5;
+
+  // Size of self-test samples, which are encoded in the model file.
+  optional int32 self_test_sample_size = 6 [default = 0];
+
+  // Whether to use the DP version of sentencepiece. Use it with TSV input
+  // format (requires precomputed word tab counts to work).
+  optional bool enable_differential_privacy = 50 [default = false];
+  // Set these parameters if you need the DP version of sentencepiece.
+  // Std of the noise to add.
+  optional float differential_privacy_noise_level = 51 [default = 0.0];
+  // Clipping threshold to apply after adding noise. All the words with
+  // frequency less than this value are dropped.
+  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
+
+  ///////////////////////////////////////////////////////////////////
+  // Training parameters.
+  //
+  // Uses characters which cover the corpus with the ratio of
+  // `character_coverage`. This parameter determines the basic alphabet of
+  // sentence pieces. 1.0 - `character_coverage` characters are treated as UNK.
+  // See also required_chars field.
+  optional float character_coverage = 10 [default = 0.9995];
+
+  // Maximum size of sentences the trainer loads from the `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
+  // It is better to shuffle the input corpus randomly.
+  optional uint64 input_sentence_size = 11 [default = 0];
+  optional bool shuffle_input_sentence = 19 [default = true];
+
+  // Maximum size of sentences to make seed sentence pieces.
+  // An extended suffix array is constructed to extract frequent
+  // sub-strings from the corpus. This uses 20N working space,
+  // where N is the size of the corpus.
+  optional int32 mining_sentence_size = 12 [deprecated = true];
+
+  // Maximum size of sentences to train sentence pieces.
+  optional int32 training_sentence_size = 13 [deprecated = true];
+
+  // The size of seed sentencepieces.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
+  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
+
+  // In every EM sub-iteration, keeps the top
+  // `shrinking_factor` * `current sentencepiece size` pieces with respect to
+  // the loss of each sentence piece. This value should be smaller than 1.0.
+  optional float shrinking_factor = 15 [default = 0.75];
+
+  // The maximum sentence length in bytes. Sentences longer than
+  // `max_sentence_length` are simply ignored.
+  // Longer input tends to bring the following risks:
+  //  * Overflow during EM training (unigram language model only)
+  //  * Performance drop because of the O(n log n) cost in BPE.
+  optional int32 max_sentence_length = 18 [default = 4192];
+
+  // Number of threads in the training.
+  optional int32 num_threads = 16 [default = 16];
+
+  // Number of EM sub-iterations.
+  optional int32 num_sub_iterations = 17 [default = 2];
+
+  ///////////////////////////////////////////////////////////////////
+  // SentencePiece parameters which control the shapes of sentence pieces.
+  //
+  // Maximum length of a sentencepiece.
+  optional int32 max_sentencepiece_length = 20 [default = 16];
+
+  // Uses Unicode script to split sentence pieces.
+  // When `split_by_unicode_script` is true, we do not allow a sentence piece
+  // to include multiple Unicode scripts, e.g. "F1" is not a valid piece.
+  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
+  // as one script type, since a Japanese word can consist of multiple scripts.
+  // This exception is always applied regardless of the accept-language
+  // parameter.
+  optional bool split_by_unicode_script = 21 [default = true];
+
+  // When `split_by_number` is true, puts a boundary at each transition
+  // between number and non-number characters. To treat "F1" as one token,
+  // set this flag to false.
+  optional bool split_by_number = 23 [default = true];
+
+  // Uses whitespace to split sentence pieces.
+  // When `split_by_whitespace` is false, a piece may contain whitespace
+  // in the middle, e.g., "in_the".
+  optional bool split_by_whitespace = 22 [default = true];
+
+  // Adds the whitespace symbol (_) as a suffix instead of a prefix, e.g.,
+  // _hello => hello_. When `treat_whitespace_as_suffix` is true,
+  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
+  // of the sentence.
+  optional bool treat_whitespace_as_suffix = 24 [default = false];
+
+  // Allows pieces that only contain whitespace instead of appearing only as
+  // a prefix or suffix of other pieces.
+  optional bool allow_whitespace_only_pieces = 26 [default = false];
+
+  // Split all digits (0-9) into separate pieces.
+  optional bool split_digits = 25 [default = false];
+
+  // Defines the pre-tokenization delimiter.
+  // When specified, no piece crossing this delimiter is included in the
+  // vocab; the delimiter string is then virtually ignored during training.
+  // This field allows constraints on the vocabulary selection. Note that
+  // this field is only available in unigram mode.
+  optional string pretokenization_delimiter = 53 [default = ""];
+
+  ///////////////////////////////////////////////////////////////////
+  // Vocabulary management
+  //
+  // Defines control symbols used as an indicator to
+  // change the behavior of the decoder. <s> and </s> are pre-defined.
+  // We can use this field to encode various meta information,
+  // including a language indicator in multilingual models.
+  // These symbols are not visible to users, but visible to
+  // the decoder. Note that when the input sentence contains control symbols,
+  // they are not treated as one token, but segmented into normal pieces.
+  // Control symbols must be inserted independently from the segmentation.
+  repeated string control_symbols = 30;
+
+  // Defines user-defined symbols.
+  // These symbols are added with an extremely high score
+  // so they are always treated as one unique symbol in any context.
+  // Typical usage of user_defined_symbols is as placeholders for named
+  // entities.
+  repeated string user_defined_symbols = 31;
+
+  // Defines required characters. Each UTF8 character in this string is
+  // included in the character set regardless of the character_coverage value.
+  // Unlike user_defined_symbols, these characters have scores based on their
+  // frequency in the input sentences, and the model can form subwords using
+  // characters in this field.
+  optional string required_chars = 36;
+
+  // Decomposes unknown pieces into UTF-8 bytes.
+  optional bool byte_fallback = 35 [default = false];
+
+  // When creating the vocabulary file, defines whether or not to additionally
+  // output the score for each piece.
+  optional bool vocabulary_output_piece_score = 32 [default = true];
+
+  // `vocab_size` is treated as a hard limit: training crashes if
+  // the model cannot produce a vocab of size `vocab_size`.
+  // When `hard_vocab_limit` is false, vocab_size is treated
+  // as a soft limit. Note that when model_type=char,
+  // hard_vocab_limit = false is always assumed.
+  optional bool hard_vocab_limit = 33 [default = true];
+
+  // Uses all symbols for vocab extraction. This flag is valid
+  // if the model type is either CHAR or WORD.
+  optional bool use_all_vocab = 34 [default = false];
+
+  ///////////////////////////////////////////////////////////////////
+  // Reserved special meta tokens.
+  // * -1 is not used.
+  // * unk_id must not be -1.
+  // Ids must start with 0 and be contiguous.
+  optional int32 unk_id = 40 [default = 0];   // <unk>
+  optional int32 bos_id = 41 [default = 1];   // <s>
+  optional int32 eos_id = 42 [default = 2];   // </s>
+  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
+  optional string unk_piece = 45 [default = "<unk>"];
+  optional string bos_piece = 46 [default = "<s>"];
+  optional string eos_piece = 47 [default = "</s>"];
+  optional string pad_piece = 48 [default = "<pad>"];
+
+  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
+  // since this character can be useful both for user and
+  // developer. We can easily figure out that <unk> is emitted.
+  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
+
+  // Increases bit depth to allow unigram model training on large
+  // (>10M sentences) corpora. A side-effect of enabling this flag
+  // is increased memory usage.
+  optional bool train_extremely_large_corpus = 49 [default = false];
+
+  // Path to a seed sentencepieces file, with one tab-separated
+  // seed sentencepiece <tab> frequency per line.
+  optional string seed_sentencepieces_file = 54 [default = ""];
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// NormalizerSpec encodes various parameters for string normalization.
+message NormalizerSpec {
+  // Name of the normalization rule.
+  optional string name = 1;
+
+  // Pre-compiled normalization rule created by the
+  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+  // Usually this field is set by the Builder::GetNormalizerSpec() method.
+  optional bytes precompiled_charsmap = 2;
+
+  // Adds dummy whitespace at the beginning of text in order to
+  // treat "world" in "world" and "hello world" in the same way.
+  optional bool add_dummy_prefix = 3 [default = true];
+
+  // Removes leading, trailing, and duplicate internal whitespace.
+  optional bool remove_extra_whitespaces = 4 [default = true];
+
+  // Replaces whitespace with the meta symbol.
+  // This field must be true to train a sentence piece model.
+  optional bool escape_whitespaces = 5 [default = true];
+
+  // Custom normalization rule file in TSV format.
+  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
+  // This field is only used in SentencePieceTrainer::Train() method, which
+  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
+  optional string normalization_rule_tsv = 6;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// Proto to store samples for self-testing.
+message SelfTestData {
+  message Sample {
+    optional string input = 1;
+    optional string expected = 2;
+  }
+  repeated Sample samples = 1;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// ModelProto stores model parameters.
+// SentencePieceProcessor is supposed to be self-contained.
+// All settings/parameters which may change the behavior must be encoded
+// in ModelProto.
+message ModelProto {
+  message SentencePiece {
+    enum Type {
+      NORMAL = 1;        // normal symbol
+      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
+      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
+      USER_DEFINED = 4;  // user defined symbols.
+                         // Typical usage of USER_DEFINED symbols
+                         // is as placeholders.
+      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
+      UNUSED = 5;        // this piece is not used.
+    }
+    optional string piece = 1;  // piece must not be empty.
+    optional float score = 2;
+    optional Type type = 3 [default = NORMAL];
+
+    // Customized extensions: this range of field numbers
+    // is open to third-party extensions.
+    extensions 200 to max;
+  }
+
+  // Sentence pieces with scores.
+  repeated SentencePiece pieces = 1;
+
+  // Spec used to generate this model file.
+  optional TrainerSpec trainer_spec = 2;
+
+  // Spec for text normalization.
+  optional NormalizerSpec normalizer_spec = 3;
+
+  // Stores sample input and its expected segmentation to verify the model.
+  optional SelfTestData self_test_data = 4;
+
+  // Spec for text de-normalization.
+  optional NormalizerSpec denormalizer_spec = 5;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
diff --git a/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go b/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
new file mode 100644
index 000000000000..540b1774df13
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
@@ -0,0 +1,82 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package prefixmatcher
+
+import (
+	"unicode/utf8"
+)
+
+// PrefixMatcher helps find longest prefixes. See [FindPrefixLen].
+type PrefixMatcher struct {
+	root *trieNode
+}
+
+type trieNode struct {
+	children map[rune]*trieNode
+	final    bool
+}
+
+// NewFromSet creates a new [PrefixMatcher] from a set of strings that
+// represent the vocabulary.
+func NewFromSet(vocab map[string]bool) *PrefixMatcher {
+	pm := &PrefixMatcher{root: newNode()}
+	for word := range vocab {
+		pm.add(word)
+	}
+	return pm
+}
+
+// FindPrefixLen finds the longest prefix of text that matches a vocabulary
+// word, and returns its length in bytes. If 0 is returned, no prefix was
+// found.
+func (pm *PrefixMatcher) FindPrefixLen(text string) int {
+	node := pm.root
+	maxLen := 0
+
+	for i, r := range text {
+		child := node.children[r]
+		if child == nil {
+			// r not found in this node, so we're done.
+			return maxLen
+		}
+		if child.final {
+			maxLen = i + utf8.RuneLen(r)
+		}
+		node = child
+	}
+
+	return maxLen
+}
+
+func (pm *PrefixMatcher) add(word string) {
+	node := pm.root
+
+	for _, r := range word {
+		child := node.children[r]
+		if child == nil {
+			child = newNode()
+			node.children[r] = child
+		}
+		node = child
+	}
+
+	node.final = true
+}
+
+func newNode() *trieNode {
+	return &trieNode{
+		children: make(map[rune]*trieNode),
+		final:    false,
+	}
+}
diff --git a/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go b/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
new file mode 100644
index 000000000000..ed8e14e27b90
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
@@ -0,0 +1,108 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package priorityqueue provides a generic priority queue with Insert
+// and PopMax operations.
+package priorityqueue
+
+// PriorityQueue is a generic priority queue with a configurable comparison
+// function.
+type PriorityQueue[T any] struct {
+	cmp func(a, b T) int
+
+	// items holds the queue's items as a binary heap.
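+	//
+	// For example (values made up for illustration), inserting 5, 3 and 4
+	// with a numeric cmp yields:
+	//
+	//	items: [_, 5, 3, 4]
+	//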
+	// items[0] is a dummy element that's not used. If the queue has N elements,
+	// they are stored at indices 1...N (N == len(items)-1).
+	// For an element at index i, its parent is at index i/2, and its children
+	// are at indices 2i and 2i+1. The root of the heap is at index 1.
+	items []T
+}
+
+// New creates a new PriorityQueue, configured with a function that
+// compares the priorities of two items a and b; it should return a number > 0
+// if the priority of a is higher, 0 if the priorities are equal, and a
+// number < 0 otherwise.
+func New[T any](cmp func(a, b T) int) *PriorityQueue[T] {
+	return &PriorityQueue[T]{cmp: cmp, items: make([]T, 1)}
+}
+
+// Len returns the length (number of items) of the priority queue.
+func (pq *PriorityQueue[T]) Len() int {
+	return len(pq.items) - 1
+}
+
+// Insert inserts a new element into the priority queue.
+func (pq *PriorityQueue[T]) Insert(elem T) {
+	pq.items = append(pq.items, elem)
+	pq.siftup(len(pq.items) - 1)
+}
+
+// PopMax returns the element with the maximal priority in the queue, and
+// removes it from the queue. Warning: to maintain a clean API, PopMax panics
+// if the queue is empty. Make sure to check Len() first.
+func (pq *PriorityQueue[T]) PopMax() T {
+	if len(pq.items) < 2 {
+		panic("popping from empty priority queue")
+	}
+	maxItem := pq.items[1]
+	pq.items[1] = pq.items[len(pq.items)-1]
+	pq.items = pq.items[:len(pq.items)-1]
+	pq.siftdown()
+	return maxItem
+}
+
+func (pq *PriorityQueue[T]) siftup(n int) {
+	i := n
+	for {
+		if i == 1 {
+			// Reached root, we're done.
+			return
+		}
+		// p is the index of i's parent.
+		// If the parent's priority is at least as high as i's, we're done.
+		p := i / 2
+		if pq.cmp(pq.items[p], pq.items[i]) >= 0 {
+			return
+		}
+		pq.items[i], pq.items[p] = pq.items[p], pq.items[i]
+		i = p
+	}
+}
+
+func (pq *PriorityQueue[T]) siftdown() {
+	i := 1
+	for {
+		c := 2 * i
+		if c >= len(pq.items) {
+			return
+		}
+		// c is not out of bounds, so it's the index of the left child of i.
+
+		// Figure out the child index with the maximal priority.
+		maxChild := c
+		if c+1 < len(pq.items) {
+			// c+1 is not out of bounds, so it's the index of the right child of i.
+			if pq.cmp(pq.items[c+1], pq.items[c]) > 0 {
+				maxChild = c + 1
+			}
+		}
+		if pq.cmp(pq.items[i], pq.items[maxChild]) >= 0 {
+			// i's priority is at least as high as its children's, so we're done.
+			return
+		}
+
+		pq.items[i], pq.items[maxChild] = pq.items[maxChild], pq.items[i]
+		i = maxChild
+	}
+}
diff --git a/vertexai/internal/sentencepiece/normalize.go b/vertexai/internal/sentencepiece/normalize.go
new file mode 100644
index 000000000000..6fb4f8674675
--- /dev/null
+++ b/vertexai/internal/sentencepiece/normalize.go
@@ -0,0 +1,34 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sentencepiece
+
+import "strings"
+
+// normalize performs Unicode normalization.
+//
+// SentencePiece has a feature to perform configurable Unicode normalization on
+// the input text and has some options for adding dummy whitespace prefixes or
+// trimming whitespace. However, the model we're working with has a very simple
+// normalizer that does none of this. These options can be added in the future
+// if needed.
+func normalize(text string) string {
+	return replaceSeparator(text)
+}
+
+// replaceSeparator replaces spaces with the whitespace separator used by
+// the model.
+func replaceSeparator(text string) string {
+	return strings.ReplaceAll(text, " ", "▁")
+}
diff --git a/vertexai/internal/sentencepiece/token.go b/vertexai/internal/sentencepiece/token.go
new file mode 100644
index 000000000000..1af86755ea16
--- /dev/null
+++ b/vertexai/internal/sentencepiece/token.go
@@ -0,0 +1,29 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sentencepiece
+
+import "fmt"
+
+// Token represents a single token from the input text. ID is a unique token
+// identifier that the model uses in its internal representation. Text is
+// the piece of text this token represents.
+type Token struct {
+	ID   int
+	Text string
+}
+
+func (t Token) String() string {
+	return fmt.Sprintf("Token{ID: %v, Text: %q}", t.ID, t.Text)
+}
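
For a quick sanity check of the two vendored internal helpers, here is a
minimal usage sketch (not part of the patch). The vocabulary and numbers are
made up for illustration, and since these packages live under internal/, the
imports below only resolve from inside this module:

	package main

	import (
		"fmt"

		"cloud.google.com/go/vertexai/internal/sentencepiece/internal/prefixmatcher"
		"cloud.google.com/go/vertexai/internal/sentencepiece/internal/priorityqueue"
	)

	func main() {
		// Longest-prefix matching against a toy vocabulary.
		pm := prefixmatcher.NewFromSet(map[string]bool{
			"he": true, "hell": true, "hello": true,
		})
		fmt.Println(pm.FindPrefixLen("hello world")) // 5, i.e. "hello"
		fmt.Println(pm.FindPrefixLen("xyz"))         // 0, no vocab prefix

		// Max-priority queue over ints with a plain numeric comparison.
		pq := priorityqueue.New(func(a, b int) int { return a - b })
		pq.Insert(3)
		pq.Insert(7)
		pq.Insert(5)
		for pq.Len() > 0 {
			fmt.Println(pq.PopMax()) // prints 7, then 5, then 3
		}
	}

The encoder wires these helpers together with the proto model; the sketch
above only exercises them in isolation.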