From bf9595ee7c77d913db0506a29ccf455f8eb6e98b Mon Sep 17 00:00:00 2001
From: Eli Bendersky
Date: Thu, 15 Aug 2024 16:18:21 -0600
Subject: [PATCH] vertexai: import go-sentencepiece to internal (#10689)

The import script `import-go-sentencepiece.sh` is the only new piece of
code here. Tests are excluded, since they require downloading the proto
model and setting `MODELPATH` to point at it. We'll add some smoke
testing for the new package wrapping this for the SDK.
---
 vertexai/internal/LICENSE_HEADER | 14 +
 vertexai/internal/import-go-sentencepiece.sh | 53 +
 vertexai/internal/sentencepiece/.gitignore | 25 +
 vertexai/internal/sentencepiece/LICENSE | 201 +++
 vertexai/internal/sentencepiece/README.md | 56 +
 vertexai/internal/sentencepiece/encoder.go | 332 ++++
 .../sentencepiece/internal/cmd/dumper/main.go | 87 +
 .../sentencepiece/internal/model/gen.sh | 11 +
 .../internal/model/sentencepiece_model.pb.go | 1556 +++++++++++++++++
 .../internal/model/sentencepiece_model.proto | 332 ++++
 .../internal/prefixmatcher/prefixmatcher.go | 82 +
 .../internal/priorityqueue/priorityqueue.go | 108 ++
 vertexai/internal/sentencepiece/normalize.go | 34 +
 vertexai/internal/sentencepiece/token.go | 29 +
 14 files changed, 2920 insertions(+)
 create mode 100644 vertexai/internal/LICENSE_HEADER
 create mode 100755 vertexai/internal/import-go-sentencepiece.sh
 create mode 100644 vertexai/internal/sentencepiece/.gitignore
 create mode 100644 vertexai/internal/sentencepiece/LICENSE
 create mode 100644 vertexai/internal/sentencepiece/README.md
 create mode 100644 vertexai/internal/sentencepiece/encoder.go
 create mode 100644 vertexai/internal/sentencepiece/internal/cmd/dumper/main.go
 create mode 100755 vertexai/internal/sentencepiece/internal/model/gen.sh
 create mode 100644 vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go
 create mode 100644 vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
 create mode 100644 vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
 create mode 100644 vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
 create mode 100644 vertexai/internal/sentencepiece/normalize.go
 create mode 100644 vertexai/internal/sentencepiece/token.go

diff --git a/vertexai/internal/LICENSE_HEADER b/vertexai/internal/LICENSE_HEADER
new file mode 100644
index 000000000000..b3b714d5e766
--- /dev/null
+++ b/vertexai/internal/LICENSE_HEADER
@@ -0,0 +1,14 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
diff --git a/vertexai/internal/import-go-sentencepiece.sh b/vertexai/internal/import-go-sentencepiece.sh
new file mode 100755
index 000000000000..50ba93f6541c
--- /dev/null
+++ b/vertexai/internal/import-go-sentencepiece.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Imports github.com/eliben/go-sentencepiece for local vendoring in our module, +# with the author's permission. + +# Fail on any error +set -eo pipefail + +# Display commands being run +set -x + +# Create a temporary directory +TEMP_DIR=$(mktemp -d) + +# Clone the repository with --depth 1 to get only the latest files +git clone --depth 1 https://github.com/eliben/go-sentencepiece.git "$TEMP_DIR/go-sentencepiece" + +# Copy the repository contents to here, excluding the .git directory +rm -rf sentencepiece +mkdir -p sentencepiece +rsync -av \ + --exclude='.git' \ + --exclude='go.mod' \ + --exclude='go.sum' \ + --exclude='test' \ + --exclude='*_test.go' \ + "$TEMP_DIR/go-sentencepiece/" sentencepiece + +# Replace import paths. +find "sentencepiece" -type f -name '*.go' \ + -exec sed -i 's|github.com/eliben/go-sentencepiece|cloud.google.com/go/vertexai/internal/sentencepiece|g' {} + + +# Prepend the LICENSE_HEADER to each .go file +GO_FILES=$(find sentencepiece -type f -name '*.go') +LICENSE_HEADER=$(realpath "LICENSE_HEADER") + +for gofile in $GO_FILES; do + cat "$LICENSE_HEADER" "$gofile" > "$gofile.tmp" && mv "$gofile.tmp" "$gofile" +done diff --git a/vertexai/internal/sentencepiece/.gitignore b/vertexai/internal/sentencepiece/.gitignore new file mode 100644 index 000000000000..6f72f8926186 --- /dev/null +++ b/vertexai/internal/sentencepiece/.gitignore @@ -0,0 +1,25 @@ +# If you prefer the allow list template instead of the deny list, see community template: +# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore +# +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, built with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +# Dependency directories (remove the comment below to include it) +# vendor/ + +# Go workspace file +go.work +go.work.sum + +# env file +.env diff --git a/vertexai/internal/sentencepiece/LICENSE b/vertexai/internal/sentencepiece/LICENSE new file mode 100644 index 000000000000..261eeb9e9f8b --- /dev/null +++ b/vertexai/internal/sentencepiece/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/vertexai/internal/sentencepiece/README.md b/vertexai/internal/sentencepiece/README.md new file mode 100644 index 000000000000..724221784780 --- /dev/null +++ b/vertexai/internal/sentencepiece/README.md @@ -0,0 +1,56 @@ +# go-sentencepiece + +[![Go Reference](https://pkg.go.dev/badge/github.com/eliben/go-sentencepiece.svg)](https://pkg.go.dev/github.com/eliben/go-sentencepiece) + +This is a pure Go implementation of encoding text with +the [SentencePiece tokenizer](https://github.com/google/sentencepiece). + +"Encoding" is the operation used to split text into tokens, using +a trained tokenizer model. + +SentencePiece is a general family of tokenizers that is configured +by a protobuf configuration file. This repository currently focuses +on implementing just the functionality required to reproduce the +tokenization of [Gemma models](https://ai.google.dev/gemma) (the same +tokenizer is used for Google's proprietary Gemini family of models). +Specifically, it only implements BPE tokenization since this is what +Gemma uses. + +## Current status + +This package should be ready to use for encoding text into tokens +using the Gemma tokenizer; it's been reasonably optimized and extensively +tested vs. the [SentencePiece Python bindings](https://pypi.org/project/sentencepiece/) +(see `system_test.go` in this repository). + +If you find any problems or discrepancies, please open an issue. + +## Tokenizer configuration + +The configuration file for the tokenizer is a protobuf (structured +data, serialized in the [protocol buffer format](https://protobuf.dev/)) +that describes a trained tokenizer model; it includes +the complete learned vocabulary used for tokenization, as well as +other configuration information. + +It is not part of this repository. Please fetch it from the +[official Gemma implementation repository](https://github.com/google/gemma_pytorch/tree/main/tokenizer). +`NewEncoder*` constructors will expect to read this file. + +## Developing + +A protobuf is used to configure the tokenizer. The structure of the +protobuf is described by the `internal/model/sentencepiece_model.proto` file, +which is vendored from https://github.com/google/sentencepiece + +To re-generate the `*.pb.go` file from it: + +``` +$ cd internal/model +$ ./gen.sh +``` + +The configuration protobuf itself is obtained as described in the +[Tokenizer configuration](#tokenizer-configuration) section. All +tests require the `MODELPATH` env var to point to a local +copy of the tokenizer configuration file. diff --git a/vertexai/internal/sentencepiece/encoder.go b/vertexai/internal/sentencepiece/encoder.go new file mode 100644 index 000000000000..89c8b7785761 --- /dev/null +++ b/vertexai/internal/sentencepiece/encoder.go @@ -0,0 +1,332 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
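For orientation, here is a minimal sketch of how the vendored encoder can be
used, following the README above and the `Encoder` API in this file. The model
path is a placeholder for a Gemma `tokenizer.model` fetched as described in the
README's "Tokenizer configuration" section; since the package sits under
`internal/`, only code within the `vertexai` module can actually import it:

```go
package main

import (
	"fmt"
	"log"

	"cloud.google.com/go/vertexai/internal/sentencepiece"
)

func main() {
	// Placeholder path to a locally fetched Gemma tokenizer model.
	enc, err := sentencepiece.NewEncoderFromPath("/path/to/tokenizer.model")
	if err != nil {
		log.Fatal(err)
	}

	// Encode tokenizes the input text into (ID, Text) pairs.
	for _, tok := range enc.Encode("hello world") {
		fmt.Printf("%d\t%q\n", tok.ID, tok.Text)
	}
}
```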
+ +package sentencepiece + +import ( + "fmt" + "io" + "os" + "strconv" + "strings" + "unicode/utf8" + + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/model" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/prefixmatcher" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/priorityqueue" + "google.golang.org/protobuf/proto" +) + +const debugEncode = false + +// Encoder represents a SentencePiece encoder (tokenizer). +// An Encoder converts input text into a sequence of tokens LLMs use. +// The mapping between token IDs and the text they represent is read from the +// model proto (provided to the constructor); it's the same between all calls +// to the Encode method. +type Encoder struct { + model *model.ModelProto + + pieces map[string]int + reserved map[string]int + + // unknownID is the token identifier of the UNKNOWN piece + unknownID int + + // userDefinedMatcher is a prefix matcher for symbols that are of + // "user-defined" type in the model proto. + userDefinedMatcher *prefixmatcher.PrefixMatcher + + // byteTokens is a cache of byte values and the tokens they represent + byteTokens map[byte]Token +} + +// NewEncoderFromPath creates a new Encoder from a file path to the protobuf +// data. +func NewEncoderFromPath(protoFile string) (*Encoder, error) { + f, err := os.Open(protoFile) + if err != nil { + return nil, fmt.Errorf("unable to read %q: %v", protoFile, err) + } + defer f.Close() + return NewEncoder(f) +} + +// NewEncoder creates a new Encoder from a reader with the protobuf data. +func NewEncoder(protoReader io.Reader) (*Encoder, error) { + b, err := io.ReadAll(protoReader) + if err != nil { + return nil, fmt.Errorf("unable to read protobuf data: %v", err) + } + + var mp model.ModelProto + err = proto.Unmarshal(b, &mp) + if err != nil { + return nil, fmt.Errorf("unable to unmarshal protobuf: %v", err) + } + + tspec := mp.GetTrainerSpec() + if tspec.GetModelType() != model.TrainerSpec_BPE { + return nil, fmt.Errorf("model type %s not supported", tspec.GetModelType()) + } + + userDefined := make(map[string]bool) + pieces := make(map[string]int) + reserved := make(map[string]int) + byteTokens := make(map[byte]Token) + unkID := -1 + + for i, piece := range mp.GetPieces() { + isNormalPiece := (piece.GetType() == model.ModelProto_SentencePiece_NORMAL || + piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED || + piece.GetType() == model.ModelProto_SentencePiece_UNUSED) + + if isNormalPiece { + pieces[piece.GetPiece()] = i + } else { + reserved[piece.GetPiece()] = i + } + + if piece.GetType() == model.ModelProto_SentencePiece_USER_DEFINED { + userDefined[piece.GetPiece()] = true + } else if piece.GetType() == model.ModelProto_SentencePiece_UNKNOWN { + if unkID > 0 { + return nil, fmt.Errorf("unk redefined") + } + unkID = i + } else if piece.GetType() == model.ModelProto_SentencePiece_BYTE { + if !tspec.GetByteFallback() { + return nil, fmt.Errorf("byte piece %q is found although `byte_fallback=false`", piece.GetPiece()) + } + bv := convertHexValue(piece.GetPiece()) + if bv >= 0 && bv < 256 { + byteTokens[byte(bv)] = Token{ID: i, Text: piece.GetPiece()} + } + } + } + + if unkID < 0 { + return nil, fmt.Errorf("unk symbol is not defined") + } + + // In case byte_fallback is specified, make sure that all 256 possible byte + // values were found. 
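+	// (Each byte piece was registered above from its "<0xNN>" form via
+	// convertHexValue.)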
+ if tspec.GetByteFallback() { + for i := 0; i < 256; i++ { + if _, found := byteTokens[byte(i)]; !found { + return nil, fmt.Errorf("byte value 0x%02X not found", i) + } + } + } + + return &Encoder{ + model: &mp, + userDefinedMatcher: prefixmatcher.NewFromSet(userDefined), + byteTokens: byteTokens, + unknownID: unkID, + pieces: pieces, + reserved: reserved, + }, nil +} + +// Encode tokenizes the input text and returns a list of Tokens. +func (enc *Encoder) Encode(text string) []Token { + text = normalize(text) + + // We begin by having each symbol a single Unicode character (or a + // user-defined string), and will iteratively merge them into larger and + // larger symbols until we have the final list of tokens. + // Since this list of symbols changes a lot, we represent it as a + // doubly-linked list in the symList slice. Each element in this slice has + // prev/next links to the next "live" symbol in the list; noMerge means this + // is a user-defined symbol we're not allowed to merge with neighbors. + // After the algorithm is finished, many elements in symList will be "dead" + // (unreachable by next/prev links from the first element). + // This representation is inspired by the implementation of bpe::Model + // in the SentencePiece C++ library. + + type symListElem struct { + prev, next int + noMerge bool + symbol string + } + symList := make([]symListElem, 0, len(text)) + + for { + // Match the next symbol in text + slen, found := enc.symbolMatch(text) + + // Append a list element for this symbol; note that this element will be + // at index len(symList), so prev/next are set up accordingly. + sym := symListElem{ + noMerge: found, + symbol: text[:slen], + prev: len(symList) - 1, + next: len(symList) + 1, + } + symList = append(symList, sym) + + // Advance the text slice to the next symbol; if no more text, we're done. + text = text[slen:] + if len(text) == 0 { + break + } + } + + if len(symList) == 0 { + return nil + } + symList[len(symList)-1].next = -1 + + debugShowSymList := func(prefix string) { + if debugEncode { + fmt.Println(prefix) + for i, elem := range symList { + fmt.Printf("[%3d]: [prev: %3v, next: %3d, noMerge: %v] %q\n", i, elem.prev, elem.next, elem.noMerge, elem.symbol) + } + } + } + debugShowSymList("initial") + + // To avoid repeating work, we manage a priority queue of "merge candidates". + // Each candidate has pointers to the symList list for the left and right + // symbol in the pair, as well as the combined symbol's score. + // The priority of merging is determined by this score, with position as + // the tie-breaker (earlier pairs are preferred). + type mergeCandidate struct { + left, right int + length int + score float32 + } + + mergeQueue := priorityqueue.New(func(a, b mergeCandidate) int { + if a.score > b.score || (a.score == b.score && a.left < b.left) { + return 1 + } + return -1 + }) + + // suggestNewMergePair is called to potentially add a new mergeCandidate to + // mergeQueue. The candidate is added if it's valid, both its parts are + // allowed to merge, and it appears in the vocabulary. 
+ suggestNewMergePair := func(left, right int) { + if left == -1 || right == -1 || symList[left].noMerge || symList[right].noMerge { + return + } + + mergedSymbol := symList[left].symbol + symList[right].symbol + if id, found := enc.pieces[mergedSymbol]; found { + mergeQueue.Insert(mergeCandidate{ + left: left, + right: right, + length: len(mergedSymbol), + score: enc.model.GetPieces()[id].GetScore(), + }) + } + } + + // Seed the merge queue with all pairs of symbols from symList + for i := 1; i < len(symList); i++ { + suggestNewMergePair(i-1, i) + } + + // Main loop + for mergeQueue.Len() > 0 { + candidate := mergeQueue.PopMax() + leftSymbol := symList[candidate.left] + rightSymbol := symList[candidate.right] + + // Make sure this candidate is not out of date. If one of its parts was + // already merged with another symbol, just skip this candidate. + if len(leftSymbol.symbol) == 0 || + len(rightSymbol.symbol) == 0 || + len(leftSymbol.symbol)+len(rightSymbol.symbol) != candidate.length { + continue + } + + // Do the merge: + // 1. Merge the concatenation of leftSymbol and rightSymbol into leftSymbol + symList[candidate.left].symbol = leftSymbol.symbol + rightSymbol.symbol + + // 2. Update prev/next pointers + symList[candidate.left].next = rightSymbol.next + if rightSymbol.next >= 0 { + symList[rightSymbol.next].prev = candidate.left + } + + // 3. Mark the right element in the pair as outdated (it's been merged + // into the left one). + symList[candidate.right].symbol = "" + + // 4. Add merge suggestions for the newly merged symbol with its neighbors + suggestNewMergePair(leftSymbol.prev, candidate.left) + suggestNewMergePair(candidate.left, rightSymbol.next) + } + + // Collect the final list of tokens from the remaining elements of symList. + tokens := make([]Token, 0, len(symList)) + for i := 0; i >= 0; i = symList[i].next { + symbol := symList[i].symbol + id := enc.symbolToID(symbol) + + if id == enc.unknownID && enc.model.GetTrainerSpec().GetByteFallback() { + // Decompose this symbol into bytes, and report each byte as a separate + // token. + for i := 0; i < len(symbol); i++ { + tokens = append(tokens, enc.byteTokens[symbol[i]]) + } + } else { + tokens = append(tokens, Token{ID: id, Text: symbol}) + } + } + + return tokens +} + +// symbolMatch finds the length of the first symbol in text. A symbol is either +// a user-defined symbol from the proto or a single rune. The second return +// value is true iff a user-defined symbol was matched. +func (enc *Encoder) symbolMatch(text string) (int, bool) { + prefixLen := enc.userDefinedMatcher.FindPrefixLen(text) + if prefixLen > 0 { + return prefixLen, true + } + // Not found a user-defined prefix; get the length of next rune. + _, rlen := utf8.DecodeRuneInString(text) + return rlen, false +} + +// symbolToID finds the right ID for the given textual symbol, or returns +// enc.unknownID if the symbol is unknown. +func (enc *Encoder) symbolToID(symbol string) int { + if id, found := enc.reserved[symbol]; found { + return id + } + if id, found := enc.pieces[symbol]; found { + return id + } + return enc.unknownID +} + +// convertHexValue converts strings of the form "<0xXY>" to the (unsigned) +// integer value of the hexadecimal number XY. -1 is returned for bad input. 
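+// For example, convertHexValue("<0x41>") returns 65.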
+func convertHexValue(bv string) int { + bv = strings.TrimPrefix(bv, "<0x") + bv = strings.TrimSuffix(bv, ">") + n, err := strconv.ParseInt(bv, 16, 32) + if err != nil { + return -1 + } + return int(n) +} diff --git a/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go b/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go new file mode 100644 index 000000000000..74bafa991fdf --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/cmd/dumper/main.go @@ -0,0 +1,87 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +// Command dumper is a debugging utility for internal use. It helps explore +// the model proto and compare results with other tools. + +import ( + "flag" + "fmt" + "io/ioutil" + "log" + "os" + "unicode" + + "cloud.google.com/go/vertexai/internal/sentencepiece" + "cloud.google.com/go/vertexai/internal/sentencepiece/internal/model" + "google.golang.org/protobuf/encoding/prototext" + "google.golang.org/protobuf/proto" +) + +func main() { + fDumpAll := flag.Bool("dumpall", false, "dump entire model proto") + fFindUni := flag.Bool("finduni", false, "find unicode runes not in pieces") + fEncodeFile := flag.String("encodefile", "", "file name to open and encode") + flag.Parse() + + modelPath := os.Getenv("MODELPATH") + if modelPath == "" { + log.Fatal("Need MODELPATH env var to run") + } + + b, err := ioutil.ReadFile(modelPath) + if err != nil { + log.Fatal(err) + } + + var model model.ModelProto + err = proto.Unmarshal(b, &model) + if err != nil { + log.Fatal(err) + } + + if *fDumpAll { + fmt.Println(prototext.Format(&model)) + } else if *fFindUni { + pieces := make(map[string]int) + for i, piece := range model.GetPieces() { + pieces[piece.GetPiece()] = i + } + + for r := rune(0); r <= unicode.MaxRune; r++ { + if unicode.IsPrint(r) { + if _, found := pieces[string(r)]; !found { + fmt.Printf("not in pieces: %U %q\n", r, string(r)) + } + } + } + } else if *fEncodeFile != "" { + enc, err := sentencepiece.NewEncoderFromPath(modelPath) + if err != nil { + log.Fatal(err) + } + + b, err := ioutil.ReadFile(*fEncodeFile) + if err != nil { + log.Fatal(err) + } + + tokens := enc.Encode(string(b)) + for _, t := range tokens { + fmt.Println(t.ID) + } + } +} diff --git a/vertexai/internal/sentencepiece/internal/model/gen.sh b/vertexai/internal/sentencepiece/internal/model/gen.sh new file mode 100755 index 000000000000..7669a851a88a --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/model/gen.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +set -o pipefail +set -eux + +protoc \ + --go_out=. \ + --go_opt="Msentencepiece_model.proto=;model" sentencepiece_model.proto + +goimports -w . 
+ diff --git a/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go new file mode 100644 index 000000000000..127f6f23da48 --- /dev/null +++ b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.pb.go @@ -0,0 +1,1556 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.25.0-devel +// protoc v3.14.0 +// source: sentencepiece_model.proto + +package model + +import ( + reflect "reflect" + sync "sync" + + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoiface "google.golang.org/protobuf/runtime/protoiface" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// Model type. only have UNIGRAM now. +type TrainerSpec_ModelType int32 + +const ( + TrainerSpec_UNIGRAM TrainerSpec_ModelType = 1 // Unigram language model with dynamic algorithm + TrainerSpec_BPE TrainerSpec_ModelType = 2 // Byte Pair Encoding + TrainerSpec_WORD TrainerSpec_ModelType = 3 // Delimitered by whitespace. + TrainerSpec_CHAR TrainerSpec_ModelType = 4 // tokenizes into character sequence +) + +// Enum value maps for TrainerSpec_ModelType. +var ( + TrainerSpec_ModelType_name = map[int32]string{ + 1: "UNIGRAM", + 2: "BPE", + 3: "WORD", + 4: "CHAR", + } + TrainerSpec_ModelType_value = map[string]int32{ + "UNIGRAM": 1, + "BPE": 2, + "WORD": 3, + "CHAR": 4, + } +) + +func (x TrainerSpec_ModelType) Enum() *TrainerSpec_ModelType { + p := new(TrainerSpec_ModelType) + *p = x + return p +} + +func (x TrainerSpec_ModelType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (TrainerSpec_ModelType) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[0].Descriptor() +} + +func (TrainerSpec_ModelType) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[0] +} + +func (x TrainerSpec_ModelType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. 
+func (x *TrainerSpec_ModelType) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = TrainerSpec_ModelType(num) + return nil +} + +// Deprecated: Use TrainerSpec_ModelType.Descriptor instead. +func (TrainerSpec_ModelType) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0, 0} +} + +type ModelProto_SentencePiece_Type int32 + +const ( + ModelProto_SentencePiece_NORMAL ModelProto_SentencePiece_Type = 1 // normal symbol + ModelProto_SentencePiece_UNKNOWN ModelProto_SentencePiece_Type = 2 // unknown symbol. only for now. + ModelProto_SentencePiece_CONTROL ModelProto_SentencePiece_Type = 3 // control symbols. , , <2ja> etc. + ModelProto_SentencePiece_USER_DEFINED ModelProto_SentencePiece_Type = 4 // user defined symbols. + // Typical usage of USER_DEFINED symbol + // is placeholder. + ModelProto_SentencePiece_BYTE ModelProto_SentencePiece_Type = 6 // byte symbols. Used when `byte_fallback` is true. + ModelProto_SentencePiece_UNUSED ModelProto_SentencePiece_Type = 5 // this piece is not used. +) + +// Enum value maps for ModelProto_SentencePiece_Type. +var ( + ModelProto_SentencePiece_Type_name = map[int32]string{ + 1: "NORMAL", + 2: "UNKNOWN", + 3: "CONTROL", + 4: "USER_DEFINED", + 6: "BYTE", + 5: "UNUSED", + } + ModelProto_SentencePiece_Type_value = map[string]int32{ + "NORMAL": 1, + "UNKNOWN": 2, + "CONTROL": 3, + "USER_DEFINED": 4, + "BYTE": 6, + "UNUSED": 5, + } +) + +func (x ModelProto_SentencePiece_Type) Enum() *ModelProto_SentencePiece_Type { + p := new(ModelProto_SentencePiece_Type) + *p = x + return p +} + +func (x ModelProto_SentencePiece_Type) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ModelProto_SentencePiece_Type) Descriptor() protoreflect.EnumDescriptor { + return file_sentencepiece_model_proto_enumTypes[1].Descriptor() +} + +func (ModelProto_SentencePiece_Type) Type() protoreflect.EnumType { + return &file_sentencepiece_model_proto_enumTypes[1] +} + +func (x ModelProto_SentencePiece_Type) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *ModelProto_SentencePiece_Type) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = ModelProto_SentencePiece_Type(num) + return nil +} + +// Deprecated: Use ModelProto_SentencePiece_Type.Descriptor instead. +func (ModelProto_SentencePiece_Type) EnumDescriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0, 0} +} + +// TrainerSpec encodes a various parameters for SentencePiece training. +// Next id: 55 +type TrainerSpec struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + /////////////////////////////////////////////////////////////////// + // General parameters + // + // Input corpus files. + // Trainer accepts the following two formats: + // A) Monolingual: plain text, one sentence per line. + // B) Bilingual: TSV, source sentence target sentence + // When bilingual data is passed, shared vocabulary model is built. + // Note that the input file must be raw corpus, not a preprocessed corpus. + // Trainer only loads the first `input_sentence_size` sentences specified + // with this parameter. 
+ Input []string `protobuf:"bytes,1,rep,name=input" json:"input,omitempty"` + // Input corpus format: + // "text": one-sentence-per-line text format (default) + // "tsv": sentence freq + InputFormat *string `protobuf:"bytes,7,opt,name=input_format,json=inputFormat" json:"input_format,omitempty"` + // Output model file prefix. + // .model and .vocab are generated. + ModelPrefix *string `protobuf:"bytes,2,opt,name=model_prefix,json=modelPrefix" json:"model_prefix,omitempty"` + ModelType *TrainerSpec_ModelType `protobuf:"varint,3,opt,name=model_type,json=modelType,enum=sentencepiece.TrainerSpec_ModelType,def=1" json:"model_type,omitempty"` + // Vocabulary size. 8k is the default size. + VocabSize *int32 `protobuf:"varint,4,opt,name=vocab_size,json=vocabSize,def=8000" json:"vocab_size,omitempty"` + // List of the languages this model can accept. + // Since the model is language-agnostic, this field is used as a reference. + AcceptLanguage []string `protobuf:"bytes,5,rep,name=accept_language,json=acceptLanguage" json:"accept_language,omitempty"` + // Size of self-test samples, which are encoded in the model file. + SelfTestSampleSize *int32 `protobuf:"varint,6,opt,name=self_test_sample_size,json=selfTestSampleSize,def=0" json:"self_test_sample_size,omitempty"` + // Whether to use DP version of sentencepiece. Use it with TSV input format + // (requires precomputed word tab counts to work). + EnableDifferentialPrivacy *bool `protobuf:"varint,50,opt,name=enable_differential_privacy,json=enableDifferentialPrivacy,def=0" json:"enable_differential_privacy,omitempty"` + // Set these parameters if you need DP version of sentencepiece. + // std of noise to add. + DifferentialPrivacyNoiseLevel *float32 `protobuf:"fixed32,51,opt,name=differential_privacy_noise_level,json=differentialPrivacyNoiseLevel,def=0" json:"differential_privacy_noise_level,omitempty"` + // Clipping threshold to apply after adding noise. All the words with + // frequency less than this value are dropped. + DifferentialPrivacyClippingThreshold *uint64 `protobuf:"varint,52,opt,name=differential_privacy_clipping_threshold,json=differentialPrivacyClippingThreshold,def=0" json:"differential_privacy_clipping_threshold,omitempty"` + /////////////////////////////////////////////////////////////////// + // Training parameters. + // + // Uses characters which cover the corpus with the ratio of `chars_coverage`. + // This parameter determines the set of basic Alphabet of sentence piece. + // 1.0 - `chars_coverage` characters are treated as UNK. + // See also required_chars field. + CharacterCoverage *float32 `protobuf:"fixed32,10,opt,name=character_coverage,json=characterCoverage,def=0.9995" json:"character_coverage,omitempty"` + // Maximum size of sentences the trainer loads from `input` parameter. + // Trainer simply loads the `input` files in sequence. + // It is better to shuffle the input corpus randomly. + InputSentenceSize *uint64 `protobuf:"varint,11,opt,name=input_sentence_size,json=inputSentenceSize,def=0" json:"input_sentence_size,omitempty"` + ShuffleInputSentence *bool `protobuf:"varint,19,opt,name=shuffle_input_sentence,json=shuffleInputSentence,def=1" json:"shuffle_input_sentence,omitempty"` + // Maximum size of sentences to make seed sentence pieces. + // Extended suffix array is constructed to extract frequent + // sub-strings from the corpus. This uses 20N working space, + // where N is the size of corpus. + // + // Deprecated: Do not use. 
+ MiningSentenceSize *int32 `protobuf:"varint,12,opt,name=mining_sentence_size,json=miningSentenceSize" json:"mining_sentence_size,omitempty"` + // Maximum size of sentences to train sentence pieces. + // + // Deprecated: Do not use. + TrainingSentenceSize *int32 `protobuf:"varint,13,opt,name=training_sentence_size,json=trainingSentenceSize" json:"training_sentence_size,omitempty"` + // The size of seed sentencepieces. + // `seed_sentencepiece_size` must be larger than `vocab_size`. + SeedSentencepieceSize *int32 `protobuf:"varint,14,opt,name=seed_sentencepiece_size,json=seedSentencepieceSize,def=1000000" json:"seed_sentencepiece_size,omitempty"` + // In every EM sub-iterations, keeps top + // `shrinking_factor` * `current sentencepieces size` with respect to + // the loss of the sentence piece. This value should be smaller than 1.0. + ShrinkingFactor *float32 `protobuf:"fixed32,15,opt,name=shrinking_factor,json=shrinkingFactor,def=0.75" json:"shrinking_factor,omitempty"` + // The maximum sentence length in byte. The sentences with the length + // larger than `max_sentence_length` is simply ignored. + // Longer input tends to bring the following risks: + // * Overflow during EM training (unigram language model only) + // * Performance drop because of O(n log n) cost in BPE. + MaxSentenceLength *int32 `protobuf:"varint,18,opt,name=max_sentence_length,json=maxSentenceLength,def=4192" json:"max_sentence_length,omitempty"` + // Number of threads in the training. + NumThreads *int32 `protobuf:"varint,16,opt,name=num_threads,json=numThreads,def=16" json:"num_threads,omitempty"` + // Number of EM sub iterations. + NumSubIterations *int32 `protobuf:"varint,17,opt,name=num_sub_iterations,json=numSubIterations,def=2" json:"num_sub_iterations,omitempty"` + /////////////////////////////////////////////////////////////////// + // SentencePiece parameters which control the shapes of sentence piece. + // + // Maximum length of sentencepiece. + MaxSentencepieceLength *int32 `protobuf:"varint,20,opt,name=max_sentencepiece_length,json=maxSentencepieceLength,def=16" json:"max_sentencepiece_length,omitempty"` + // Uses Unicode script to split sentence pieces. + // When `split_by_unicode_script` is true, we do not allow sentence piece to + // include multiple Unicode scripts, e.g. "F1" is not a valid piece. + // Exception: CJ characters (Hiragana/Katakana/Han) are all handled + // as one script type, since Japanese word can consist of multiple scripts. + // This exception is always applied regardless of the accept-language + // parameter. + SplitByUnicodeScript *bool `protobuf:"varint,21,opt,name=split_by_unicode_script,json=splitByUnicodeScript,def=1" json:"split_by_unicode_script,omitempty"` + // When `split_by_number` is true, put a boundary between number and + // non-number transition. If we want to treat "F1" is one token, set this flag + // to be false. + SplitByNumber *bool `protobuf:"varint,23,opt,name=split_by_number,json=splitByNumber,def=1" json:"split_by_number,omitempty"` + // Use a white space to split sentence pieces. + // When `split_by_whitespace` is false, we may have the piece containing + // a white space in the middle. e.g., "in_the". + SplitByWhitespace *bool `protobuf:"varint,22,opt,name=split_by_whitespace,json=splitByWhitespace,def=1" json:"split_by_whitespace,omitempty"` + // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello => + // hello_. 
When `treat_whitespace_as_suffix` is true, + // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end + // of sentence. + TreatWhitespaceAsSuffix *bool `protobuf:"varint,24,opt,name=treat_whitespace_as_suffix,json=treatWhitespaceAsSuffix,def=0" json:"treat_whitespace_as_suffix,omitempty"` + // Allows pieces that only contain whitespaces instead of appearing only as + // prefix or suffix of other pieces. + AllowWhitespaceOnlyPieces *bool `protobuf:"varint,26,opt,name=allow_whitespace_only_pieces,json=allowWhitespaceOnlyPieces,def=0" json:"allow_whitespace_only_pieces,omitempty"` + // Split all digits (0-9) into separate pieces. + SplitDigits *bool `protobuf:"varint,25,opt,name=split_digits,json=splitDigits,def=0" json:"split_digits,omitempty"` + // Defines the pre-tokenization delimiter. + // When specified, no pieces crossing this delimiter is not included + // in the vocab. Then the delimiter string is virtually ignored + // during the training. This field can allows constraints on the vocabulary + // selection. Note that this field is available on unigram mode. + PretokenizationDelimiter *string `protobuf:"bytes,53,opt,name=pretokenization_delimiter,json=pretokenizationDelimiter,def=" json:"pretokenization_delimiter,omitempty"` + /////////////////////////////////////////////////////////////////// + // Vocabulary management + // + // Defines control symbols used as an indicator to + // change the behavior of the decoder. and are pre-defined. + // We can use this field to encode various meta information, + // including language indicator in multilingual model. + // These symbols are not visible to users, but visible to + // the decoder. Note that when the input sentence contains control symbols, + // they are not treated as one token, but segmented into normal pieces. + // Control symbols must be inserted independently from the segmentation. + ControlSymbols []string `protobuf:"bytes,30,rep,name=control_symbols,json=controlSymbols" json:"control_symbols,omitempty"` + // Defines user defined symbols. + // These symbols are added with extremely high score + // so they are always treated as one unique symbol in any context. + // Typical usage of user_defined_symbols is placeholder for named entities. + UserDefinedSymbols []string `protobuf:"bytes,31,rep,name=user_defined_symbols,json=userDefinedSymbols" json:"user_defined_symbols,omitempty"` + // Defines required characters. Each UTF8 character in this string is included + // in the character set regardless of character_coverage value. Unlike + // user_defined_symbols, these characters have scores based on the frequency + // on input sentences, and the model can form subwords using characters + // in this field. + RequiredChars *string `protobuf:"bytes,36,opt,name=required_chars,json=requiredChars" json:"required_chars,omitempty"` + // Decomposes unknown pieces into UTF-8 bytes. + ByteFallback *bool `protobuf:"varint,35,opt,name=byte_fallback,json=byteFallback,def=0" json:"byte_fallback,omitempty"` + // When creating the vocabulary file, defines whether or not to additionally + // output the score for each piece. + VocabularyOutputPieceScore *bool `protobuf:"varint,32,opt,name=vocabulary_output_piece_score,json=vocabularyOutputPieceScore,def=1" json:"vocabulary_output_piece_score,omitempty"` + // `vocab_size` is treated as hard limit. Crash if + // the model can not produce the vocab of size `vocab_size`, + // When `hard_vocab_limit` is false, vocab_size is treated + // as soft limit. 
Note that when model_type=char, + // always assumes hard_vocab_limit = false. + HardVocabLimit *bool `protobuf:"varint,33,opt,name=hard_vocab_limit,json=hardVocabLimit,def=1" json:"hard_vocab_limit,omitempty"` + // use all symbols for vocab extraction. This flag is valid + // if model type is either CHAR or WORD + UseAllVocab *bool `protobuf:"varint,34,opt,name=use_all_vocab,json=useAllVocab,def=0" json:"use_all_vocab,omitempty"` + /////////////////////////////////////////////////////////////////// + // Reserved special meta tokens. + // * -1 is not used. + // * unk_id must not be -1. + // Id must starts with 0 and be contigous. + UnkId *int32 `protobuf:"varint,40,opt,name=unk_id,json=unkId,def=0" json:"unk_id,omitempty"` // + BosId *int32 `protobuf:"varint,41,opt,name=bos_id,json=bosId,def=1" json:"bos_id,omitempty"` // + EosId *int32 `protobuf:"varint,42,opt,name=eos_id,json=eosId,def=2" json:"eos_id,omitempty"` // + PadId *int32 `protobuf:"varint,43,opt,name=pad_id,json=padId,def=-1" json:"pad_id,omitempty"` // (padding) + UnkPiece *string `protobuf:"bytes,45,opt,name=unk_piece,json=unkPiece,def=" json:"unk_piece,omitempty"` + BosPiece *string `protobuf:"bytes,46,opt,name=bos_piece,json=bosPiece,def=" json:"bos_piece,omitempty"` + EosPiece *string `protobuf:"bytes,47,opt,name=eos_piece,json=eosPiece,def=" json:"eos_piece,omitempty"` + PadPiece *string `protobuf:"bytes,48,opt,name=pad_piece,json=padPiece,def=" json:"pad_piece,omitempty"` + // Encodes into U+2047 (DOUBLE QUESTION MARK), + // since this character can be useful both for user and + // developer. We can easily figure out that is emitted. + UnkSurface *string `protobuf:"bytes,44,opt,name=unk_surface,json=unkSurface,def= ⁇ " json:"unk_surface,omitempty"` + // Increase bit depth to allow unigram model training on large + // (>10M sentences) corpora. A Side-effect of enabling this flag + // is increased memory usage. + TrainExtremelyLargeCorpus *bool `protobuf:"varint,49,opt,name=train_extremely_large_corpus,json=trainExtremelyLargeCorpus,def=0" json:"train_extremely_large_corpus,omitempty"` + // Path to a seed sentencepieces file, with one tab-separated + // seed sentencepiece frequency per line. + SeedSentencepiecesFile *string `protobuf:"bytes,54,opt,name=seed_sentencepieces_file,json=seedSentencepiecesFile,def=" json:"seed_sentencepieces_file,omitempty"` +} + +// Default values for TrainerSpec fields. 
+const ( + Default_TrainerSpec_ModelType = TrainerSpec_UNIGRAM + Default_TrainerSpec_VocabSize = int32(8000) + Default_TrainerSpec_SelfTestSampleSize = int32(0) + Default_TrainerSpec_EnableDifferentialPrivacy = bool(false) + Default_TrainerSpec_DifferentialPrivacyNoiseLevel = float32(0) + Default_TrainerSpec_DifferentialPrivacyClippingThreshold = uint64(0) + Default_TrainerSpec_CharacterCoverage = float32(0.9994999766349792) + Default_TrainerSpec_InputSentenceSize = uint64(0) + Default_TrainerSpec_ShuffleInputSentence = bool(true) + Default_TrainerSpec_SeedSentencepieceSize = int32(1000000) + Default_TrainerSpec_ShrinkingFactor = float32(0.75) + Default_TrainerSpec_MaxSentenceLength = int32(4192) + Default_TrainerSpec_NumThreads = int32(16) + Default_TrainerSpec_NumSubIterations = int32(2) + Default_TrainerSpec_MaxSentencepieceLength = int32(16) + Default_TrainerSpec_SplitByUnicodeScript = bool(true) + Default_TrainerSpec_SplitByNumber = bool(true) + Default_TrainerSpec_SplitByWhitespace = bool(true) + Default_TrainerSpec_TreatWhitespaceAsSuffix = bool(false) + Default_TrainerSpec_AllowWhitespaceOnlyPieces = bool(false) + Default_TrainerSpec_SplitDigits = bool(false) + Default_TrainerSpec_PretokenizationDelimiter = string("") + Default_TrainerSpec_ByteFallback = bool(false) + Default_TrainerSpec_VocabularyOutputPieceScore = bool(true) + Default_TrainerSpec_HardVocabLimit = bool(true) + Default_TrainerSpec_UseAllVocab = bool(false) + Default_TrainerSpec_UnkId = int32(0) + Default_TrainerSpec_BosId = int32(1) + Default_TrainerSpec_EosId = int32(2) + Default_TrainerSpec_PadId = int32(-1) + Default_TrainerSpec_UnkPiece = string("") + Default_TrainerSpec_BosPiece = string("") + Default_TrainerSpec_EosPiece = string("") + Default_TrainerSpec_PadPiece = string("") + Default_TrainerSpec_UnkSurface = string(" ⁇ ") + Default_TrainerSpec_TrainExtremelyLargeCorpus = bool(false) + Default_TrainerSpec_SeedSentencepiecesFile = string("") +) + +func (x *TrainerSpec) Reset() { + *x = TrainerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *TrainerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*TrainerSpec) ProtoMessage() {} + +func (x *TrainerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use TrainerSpec.ProtoReflect.Descriptor instead. +func (*TrainerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{0} +} + +var extRange_TrainerSpec = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use TrainerSpec.ProtoReflect.Descriptor.ExtensionRanges instead. 
+func (*TrainerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_TrainerSpec +} + +func (x *TrainerSpec) GetInput() []string { + if x != nil { + return x.Input + } + return nil +} + +func (x *TrainerSpec) GetInputFormat() string { + if x != nil && x.InputFormat != nil { + return *x.InputFormat + } + return "" +} + +func (x *TrainerSpec) GetModelPrefix() string { + if x != nil && x.ModelPrefix != nil { + return *x.ModelPrefix + } + return "" +} + +func (x *TrainerSpec) GetModelType() TrainerSpec_ModelType { + if x != nil && x.ModelType != nil { + return *x.ModelType + } + return Default_TrainerSpec_ModelType +} + +func (x *TrainerSpec) GetVocabSize() int32 { + if x != nil && x.VocabSize != nil { + return *x.VocabSize + } + return Default_TrainerSpec_VocabSize +} + +func (x *TrainerSpec) GetAcceptLanguage() []string { + if x != nil { + return x.AcceptLanguage + } + return nil +} + +func (x *TrainerSpec) GetSelfTestSampleSize() int32 { + if x != nil && x.SelfTestSampleSize != nil { + return *x.SelfTestSampleSize + } + return Default_TrainerSpec_SelfTestSampleSize +} + +func (x *TrainerSpec) GetEnableDifferentialPrivacy() bool { + if x != nil && x.EnableDifferentialPrivacy != nil { + return *x.EnableDifferentialPrivacy + } + return Default_TrainerSpec_EnableDifferentialPrivacy +} + +func (x *TrainerSpec) GetDifferentialPrivacyNoiseLevel() float32 { + if x != nil && x.DifferentialPrivacyNoiseLevel != nil { + return *x.DifferentialPrivacyNoiseLevel + } + return Default_TrainerSpec_DifferentialPrivacyNoiseLevel +} + +func (x *TrainerSpec) GetDifferentialPrivacyClippingThreshold() uint64 { + if x != nil && x.DifferentialPrivacyClippingThreshold != nil { + return *x.DifferentialPrivacyClippingThreshold + } + return Default_TrainerSpec_DifferentialPrivacyClippingThreshold +} + +func (x *TrainerSpec) GetCharacterCoverage() float32 { + if x != nil && x.CharacterCoverage != nil { + return *x.CharacterCoverage + } + return Default_TrainerSpec_CharacterCoverage +} + +func (x *TrainerSpec) GetInputSentenceSize() uint64 { + if x != nil && x.InputSentenceSize != nil { + return *x.InputSentenceSize + } + return Default_TrainerSpec_InputSentenceSize +} + +func (x *TrainerSpec) GetShuffleInputSentence() bool { + if x != nil && x.ShuffleInputSentence != nil { + return *x.ShuffleInputSentence + } + return Default_TrainerSpec_ShuffleInputSentence +} + +// Deprecated: Do not use. +func (x *TrainerSpec) GetMiningSentenceSize() int32 { + if x != nil && x.MiningSentenceSize != nil { + return *x.MiningSentenceSize + } + return 0 +} + +// Deprecated: Do not use. 
+func (x *TrainerSpec) GetTrainingSentenceSize() int32 { + if x != nil && x.TrainingSentenceSize != nil { + return *x.TrainingSentenceSize + } + return 0 +} + +func (x *TrainerSpec) GetSeedSentencepieceSize() int32 { + if x != nil && x.SeedSentencepieceSize != nil { + return *x.SeedSentencepieceSize + } + return Default_TrainerSpec_SeedSentencepieceSize +} + +func (x *TrainerSpec) GetShrinkingFactor() float32 { + if x != nil && x.ShrinkingFactor != nil { + return *x.ShrinkingFactor + } + return Default_TrainerSpec_ShrinkingFactor +} + +func (x *TrainerSpec) GetMaxSentenceLength() int32 { + if x != nil && x.MaxSentenceLength != nil { + return *x.MaxSentenceLength + } + return Default_TrainerSpec_MaxSentenceLength +} + +func (x *TrainerSpec) GetNumThreads() int32 { + if x != nil && x.NumThreads != nil { + return *x.NumThreads + } + return Default_TrainerSpec_NumThreads +} + +func (x *TrainerSpec) GetNumSubIterations() int32 { + if x != nil && x.NumSubIterations != nil { + return *x.NumSubIterations + } + return Default_TrainerSpec_NumSubIterations +} + +func (x *TrainerSpec) GetMaxSentencepieceLength() int32 { + if x != nil && x.MaxSentencepieceLength != nil { + return *x.MaxSentencepieceLength + } + return Default_TrainerSpec_MaxSentencepieceLength +} + +func (x *TrainerSpec) GetSplitByUnicodeScript() bool { + if x != nil && x.SplitByUnicodeScript != nil { + return *x.SplitByUnicodeScript + } + return Default_TrainerSpec_SplitByUnicodeScript +} + +func (x *TrainerSpec) GetSplitByNumber() bool { + if x != nil && x.SplitByNumber != nil { + return *x.SplitByNumber + } + return Default_TrainerSpec_SplitByNumber +} + +func (x *TrainerSpec) GetSplitByWhitespace() bool { + if x != nil && x.SplitByWhitespace != nil { + return *x.SplitByWhitespace + } + return Default_TrainerSpec_SplitByWhitespace +} + +func (x *TrainerSpec) GetTreatWhitespaceAsSuffix() bool { + if x != nil && x.TreatWhitespaceAsSuffix != nil { + return *x.TreatWhitespaceAsSuffix + } + return Default_TrainerSpec_TreatWhitespaceAsSuffix +} + +func (x *TrainerSpec) GetAllowWhitespaceOnlyPieces() bool { + if x != nil && x.AllowWhitespaceOnlyPieces != nil { + return *x.AllowWhitespaceOnlyPieces + } + return Default_TrainerSpec_AllowWhitespaceOnlyPieces +} + +func (x *TrainerSpec) GetSplitDigits() bool { + if x != nil && x.SplitDigits != nil { + return *x.SplitDigits + } + return Default_TrainerSpec_SplitDigits +} + +func (x *TrainerSpec) GetPretokenizationDelimiter() string { + if x != nil && x.PretokenizationDelimiter != nil { + return *x.PretokenizationDelimiter + } + return Default_TrainerSpec_PretokenizationDelimiter +} + +func (x *TrainerSpec) GetControlSymbols() []string { + if x != nil { + return x.ControlSymbols + } + return nil +} + +func (x *TrainerSpec) GetUserDefinedSymbols() []string { + if x != nil { + return x.UserDefinedSymbols + } + return nil +} + +func (x *TrainerSpec) GetRequiredChars() string { + if x != nil && x.RequiredChars != nil { + return *x.RequiredChars + } + return "" +} + +func (x *TrainerSpec) GetByteFallback() bool { + if x != nil && x.ByteFallback != nil { + return *x.ByteFallback + } + return Default_TrainerSpec_ByteFallback +} + +func (x *TrainerSpec) GetVocabularyOutputPieceScore() bool { + if x != nil && x.VocabularyOutputPieceScore != nil { + return *x.VocabularyOutputPieceScore + } + return Default_TrainerSpec_VocabularyOutputPieceScore +} + +func (x *TrainerSpec) GetHardVocabLimit() bool { + if x != nil && x.HardVocabLimit != nil { + return *x.HardVocabLimit + } + return 
Default_TrainerSpec_HardVocabLimit
+}
+
+func (x *TrainerSpec) GetUseAllVocab() bool {
+	if x != nil && x.UseAllVocab != nil {
+		return *x.UseAllVocab
+	}
+	return Default_TrainerSpec_UseAllVocab
+}
+
+func (x *TrainerSpec) GetUnkId() int32 {
+	if x != nil && x.UnkId != nil {
+		return *x.UnkId
+	}
+	return Default_TrainerSpec_UnkId
+}
+
+func (x *TrainerSpec) GetBosId() int32 {
+	if x != nil && x.BosId != nil {
+		return *x.BosId
+	}
+	return Default_TrainerSpec_BosId
+}
+
+func (x *TrainerSpec) GetEosId() int32 {
+	if x != nil && x.EosId != nil {
+		return *x.EosId
+	}
+	return Default_TrainerSpec_EosId
+}
+
+func (x *TrainerSpec) GetPadId() int32 {
+	if x != nil && x.PadId != nil {
+		return *x.PadId
+	}
+	return Default_TrainerSpec_PadId
+}
+
+func (x *TrainerSpec) GetUnkPiece() string {
+	if x != nil && x.UnkPiece != nil {
+		return *x.UnkPiece
+	}
+	return Default_TrainerSpec_UnkPiece
+}
+
+func (x *TrainerSpec) GetBosPiece() string {
+	if x != nil && x.BosPiece != nil {
+		return *x.BosPiece
+	}
+	return Default_TrainerSpec_BosPiece
+}
+
+func (x *TrainerSpec) GetEosPiece() string {
+	if x != nil && x.EosPiece != nil {
+		return *x.EosPiece
+	}
+	return Default_TrainerSpec_EosPiece
+}
+
+func (x *TrainerSpec) GetPadPiece() string {
+	if x != nil && x.PadPiece != nil {
+		return *x.PadPiece
+	}
+	return Default_TrainerSpec_PadPiece
+}
+
+func (x *TrainerSpec) GetUnkSurface() string {
+	if x != nil && x.UnkSurface != nil {
+		return *x.UnkSurface
+	}
+	return Default_TrainerSpec_UnkSurface
+}
+
+func (x *TrainerSpec) GetTrainExtremelyLargeCorpus() bool {
+	if x != nil && x.TrainExtremelyLargeCorpus != nil {
+		return *x.TrainExtremelyLargeCorpus
+	}
+	return Default_TrainerSpec_TrainExtremelyLargeCorpus
+}
+
+func (x *TrainerSpec) GetSeedSentencepiecesFile() string {
+	if x != nil && x.SeedSentencepiecesFile != nil {
+		return *x.SeedSentencepiecesFile
+	}
+	return Default_TrainerSpec_SeedSentencepiecesFile
+}
+
+// NormalizerSpec encodes various parameters for string normalization.
+type NormalizerSpec struct {
+	state           protoimpl.MessageState
+	sizeCache       protoimpl.SizeCache
+	unknownFields   protoimpl.UnknownFields
+	extensionFields protoimpl.ExtensionFields
+
+	// Name of the normalization rule.
+	Name *string `protobuf:"bytes,1,opt,name=name" json:"name,omitempty"`
+	// Pre-compiled normalization rule created by the
+	// Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+	// Usually this field is set by the Builder::GetNormalizerSpec() method.
+	PrecompiledCharsmap []byte `protobuf:"bytes,2,opt,name=precompiled_charsmap,json=precompiledCharsmap" json:"precompiled_charsmap,omitempty"`
+	// Adds dummy whitespace at the beginning of text in order to
+	// treat "world" in "world" and "hello world" in the same way.
+	AddDummyPrefix *bool `protobuf:"varint,3,opt,name=add_dummy_prefix,json=addDummyPrefix,def=1" json:"add_dummy_prefix,omitempty"`
+	// Removes leading, trailing, and duplicate internal whitespace.
+	RemoveExtraWhitespaces *bool `protobuf:"varint,4,opt,name=remove_extra_whitespaces,json=removeExtraWhitespaces,def=1" json:"remove_extra_whitespaces,omitempty"`
+	// Replaces whitespace with the meta symbol.
+	// This field must be true to train a sentence piece model.
+	EscapeWhitespaces *bool `protobuf:"varint,5,opt,name=escape_whitespaces,json=escapeWhitespaces,def=1" json:"escape_whitespaces,omitempty"`
+	// Custom normalization rule file in TSV format.
+ // https://github.com/google/sentencepiece/blob/master/doc/normalization.md + // This field is only used in SentencePieceTrainer::Train() method, which + // compiles the rule into the binary rule stored in `precompiled_charsmap`. + NormalizationRuleTsv *string `protobuf:"bytes,6,opt,name=normalization_rule_tsv,json=normalizationRuleTsv" json:"normalization_rule_tsv,omitempty"` +} + +// Default values for NormalizerSpec fields. +const ( + Default_NormalizerSpec_AddDummyPrefix = bool(true) + Default_NormalizerSpec_RemoveExtraWhitespaces = bool(true) + Default_NormalizerSpec_EscapeWhitespaces = bool(true) +) + +func (x *NormalizerSpec) Reset() { + *x = NormalizerSpec{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *NormalizerSpec) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*NormalizerSpec) ProtoMessage() {} + +func (x *NormalizerSpec) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor instead. +func (*NormalizerSpec) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{1} +} + +var extRange_NormalizerSpec = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use NormalizerSpec.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*NormalizerSpec) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_NormalizerSpec +} + +func (x *NormalizerSpec) GetName() string { + if x != nil && x.Name != nil { + return *x.Name + } + return "" +} + +func (x *NormalizerSpec) GetPrecompiledCharsmap() []byte { + if x != nil { + return x.PrecompiledCharsmap + } + return nil +} + +func (x *NormalizerSpec) GetAddDummyPrefix() bool { + if x != nil && x.AddDummyPrefix != nil { + return *x.AddDummyPrefix + } + return Default_NormalizerSpec_AddDummyPrefix +} + +func (x *NormalizerSpec) GetRemoveExtraWhitespaces() bool { + if x != nil && x.RemoveExtraWhitespaces != nil { + return *x.RemoveExtraWhitespaces + } + return Default_NormalizerSpec_RemoveExtraWhitespaces +} + +func (x *NormalizerSpec) GetEscapeWhitespaces() bool { + if x != nil && x.EscapeWhitespaces != nil { + return *x.EscapeWhitespaces + } + return Default_NormalizerSpec_EscapeWhitespaces +} + +func (x *NormalizerSpec) GetNormalizationRuleTsv() string { + if x != nil && x.NormalizationRuleTsv != nil { + return *x.NormalizationRuleTsv + } + return "" +} + +// Proto to store samples for self-testing. 
+type SelfTestData struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Samples []*SelfTestData_Sample `protobuf:"bytes,1,rep,name=samples" json:"samples,omitempty"` +} + +func (x *SelfTestData) Reset() { + *x = SelfTestData{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData) ProtoMessage() {} + +func (x *SelfTestData) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData.ProtoReflect.Descriptor instead. +func (*SelfTestData) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2} +} + +var extRange_SelfTestData = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use SelfTestData.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*SelfTestData) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_SelfTestData +} + +func (x *SelfTestData) GetSamples() []*SelfTestData_Sample { + if x != nil { + return x.Samples + } + return nil +} + +// ModelProto stores model parameters. +// SentencePieceProcessor is supposed to be self-contained. +// All settings/parameters which may change the behavior must be encoded +// in ModelProto. +type ModelProto struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + // Sentence pieces with scores. + Pieces []*ModelProto_SentencePiece `protobuf:"bytes,1,rep,name=pieces" json:"pieces,omitempty"` + // Spec used to generate this model file. + TrainerSpec *TrainerSpec `protobuf:"bytes,2,opt,name=trainer_spec,json=trainerSpec" json:"trainer_spec,omitempty"` + // Spec for text normalization. + NormalizerSpec *NormalizerSpec `protobuf:"bytes,3,opt,name=normalizer_spec,json=normalizerSpec" json:"normalizer_spec,omitempty"` + // Stores sample input and its expected segmentation to verify the model. + SelfTestData *SelfTestData `protobuf:"bytes,4,opt,name=self_test_data,json=selfTestData" json:"self_test_data,omitempty"` + // Spec for text de-normalization. + DenormalizerSpec *NormalizerSpec `protobuf:"bytes,5,opt,name=denormalizer_spec,json=denormalizerSpec" json:"denormalizer_spec,omitempty"` +} + +func (x *ModelProto) Reset() { + *x = ModelProto{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto) ProtoMessage() {} + +func (x *ModelProto) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto.ProtoReflect.Descriptor instead. 
+func (*ModelProto) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3} +} + +var extRange_ModelProto = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use ModelProto.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*ModelProto) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_ModelProto +} + +func (x *ModelProto) GetPieces() []*ModelProto_SentencePiece { + if x != nil { + return x.Pieces + } + return nil +} + +func (x *ModelProto) GetTrainerSpec() *TrainerSpec { + if x != nil { + return x.TrainerSpec + } + return nil +} + +func (x *ModelProto) GetNormalizerSpec() *NormalizerSpec { + if x != nil { + return x.NormalizerSpec + } + return nil +} + +func (x *ModelProto) GetSelfTestData() *SelfTestData { + if x != nil { + return x.SelfTestData + } + return nil +} + +func (x *ModelProto) GetDenormalizerSpec() *NormalizerSpec { + if x != nil { + return x.DenormalizerSpec + } + return nil +} + +type SelfTestData_Sample struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Input *string `protobuf:"bytes,1,opt,name=input" json:"input,omitempty"` + Expected *string `protobuf:"bytes,2,opt,name=expected" json:"expected,omitempty"` +} + +func (x *SelfTestData_Sample) Reset() { + *x = SelfTestData_Sample{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *SelfTestData_Sample) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SelfTestData_Sample) ProtoMessage() {} + +func (x *SelfTestData_Sample) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SelfTestData_Sample.ProtoReflect.Descriptor instead. +func (*SelfTestData_Sample) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{2, 0} +} + +func (x *SelfTestData_Sample) GetInput() string { + if x != nil && x.Input != nil { + return *x.Input + } + return "" +} + +func (x *SelfTestData_Sample) GetExpected() string { + if x != nil && x.Expected != nil { + return *x.Expected + } + return "" +} + +type ModelProto_SentencePiece struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + extensionFields protoimpl.ExtensionFields + + Piece *string `protobuf:"bytes,1,opt,name=piece" json:"piece,omitempty"` // piece must not be empty. + Score *float32 `protobuf:"fixed32,2,opt,name=score" json:"score,omitempty"` + Type *ModelProto_SentencePiece_Type `protobuf:"varint,3,opt,name=type,enum=sentencepiece.ModelProto_SentencePiece_Type,def=1" json:"type,omitempty"` +} + +// Default values for ModelProto_SentencePiece fields. 
+const ( + Default_ModelProto_SentencePiece_Type = ModelProto_SentencePiece_NORMAL +) + +func (x *ModelProto_SentencePiece) Reset() { + *x = ModelProto_SentencePiece{} + if protoimpl.UnsafeEnabled { + mi := &file_sentencepiece_model_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelProto_SentencePiece) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelProto_SentencePiece) ProtoMessage() {} + +func (x *ModelProto_SentencePiece) ProtoReflect() protoreflect.Message { + mi := &file_sentencepiece_model_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor instead. +func (*ModelProto_SentencePiece) Descriptor() ([]byte, []int) { + return file_sentencepiece_model_proto_rawDescGZIP(), []int{3, 0} +} + +var extRange_ModelProto_SentencePiece = []protoiface.ExtensionRangeV1{ + {Start: 200, End: 536870911}, +} + +// Deprecated: Use ModelProto_SentencePiece.ProtoReflect.Descriptor.ExtensionRanges instead. +func (*ModelProto_SentencePiece) ExtensionRangeArray() []protoiface.ExtensionRangeV1 { + return extRange_ModelProto_SentencePiece +} + +func (x *ModelProto_SentencePiece) GetPiece() string { + if x != nil && x.Piece != nil { + return *x.Piece + } + return "" +} + +func (x *ModelProto_SentencePiece) GetScore() float32 { + if x != nil && x.Score != nil { + return *x.Score + } + return 0 +} + +func (x *ModelProto_SentencePiece) GetType() ModelProto_SentencePiece_Type { + if x != nil && x.Type != nil { + return *x.Type + } + return Default_ModelProto_SentencePiece_Type +} + +var File_sentencepiece_model_proto protoreflect.FileDescriptor + +var file_sentencepiece_model_proto_rawDesc = []byte{ + 0x0a, 0x19, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, + 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x0d, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x22, 0xc6, 0x12, 0x0a, 0x0b, 0x54, + 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x03, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x12, 0x21, 0x0a, 0x0c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, + 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x46, 0x6f, 0x72, + 0x6d, 0x61, 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x70, 0x72, 0x65, + 0x66, 0x69, 0x78, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x4c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, + 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x24, 0x2e, 0x73, 0x65, 0x6e, + 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x72, 0x61, 0x69, 0x6e, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, + 0x3a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 0x4d, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x54, 0x79, 0x70, 0x65, 0x12, 0x23, 0x0a, 0x0a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x5f, 0x73, 0x69, + 0x7a, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x38, 0x30, 0x30, 0x30, 0x52, 0x09, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x53, 
0x69, 0x7a, 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x61, 0x63, 0x63, + 0x65, 0x70, 0x74, 0x5f, 0x6c, 0x61, 0x6e, 0x67, 0x75, 0x61, 0x67, 0x65, 0x18, 0x05, 0x20, 0x03, + 0x28, 0x09, 0x52, 0x0e, 0x61, 0x63, 0x63, 0x65, 0x70, 0x74, 0x4c, 0x61, 0x6e, 0x67, 0x75, 0x61, + 0x67, 0x65, 0x12, 0x34, 0x0a, 0x15, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, + 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, + 0x05, 0x3a, 0x01, 0x30, 0x52, 0x12, 0x73, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x53, 0x61, + 0x6d, 0x70, 0x6c, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x45, 0x0a, 0x1b, 0x65, 0x6e, 0x61, 0x62, + 0x6c, 0x65, 0x5f, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x18, 0x32, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, + 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x44, 0x69, 0x66, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x12, + 0x4a, 0x0a, 0x20, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, + 0x70, 0x72, 0x69, 0x76, 0x61, 0x63, 0x79, 0x5f, 0x6e, 0x6f, 0x69, 0x73, 0x65, 0x5f, 0x6c, 0x65, + 0x76, 0x65, 0x6c, 0x18, 0x33, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x01, 0x30, 0x52, 0x1d, 0x64, 0x69, + 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, 0x76, 0x61, 0x63, + 0x79, 0x4e, 0x6f, 0x69, 0x73, 0x65, 0x4c, 0x65, 0x76, 0x65, 0x6c, 0x12, 0x58, 0x0a, 0x27, 0x64, + 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x5f, 0x70, 0x72, 0x69, 0x76, + 0x61, 0x63, 0x79, 0x5f, 0x63, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x5f, 0x74, 0x68, 0x72, + 0x65, 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x18, 0x34, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, + 0x24, 0x64, 0x69, 0x66, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x50, 0x72, 0x69, + 0x76, 0x61, 0x63, 0x79, 0x43, 0x6c, 0x69, 0x70, 0x70, 0x69, 0x6e, 0x67, 0x54, 0x68, 0x72, 0x65, + 0x73, 0x68, 0x6f, 0x6c, 0x64, 0x12, 0x35, 0x0a, 0x12, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, 0x74, + 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x18, 0x0a, 0x20, 0x01, 0x28, + 0x02, 0x3a, 0x06, 0x30, 0x2e, 0x39, 0x39, 0x39, 0x35, 0x52, 0x11, 0x63, 0x68, 0x61, 0x72, 0x61, + 0x63, 0x74, 0x65, 0x72, 0x43, 0x6f, 0x76, 0x65, 0x72, 0x61, 0x67, 0x65, 0x12, 0x31, 0x0a, 0x13, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x04, 0x3a, 0x01, 0x30, 0x52, 0x11, 0x69, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, + 0x3a, 0x0a, 0x16, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x18, 0x13, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, 0x73, 0x68, 0x75, 0x66, 0x66, 0x6c, 0x65, 0x49, 0x6e, + 0x70, 0x75, 0x74, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x12, 0x34, 0x0a, 0x14, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, + 0x69, 0x7a, 0x65, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x12, 0x6d, + 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, + 0x65, 0x12, 0x38, 0x0a, 0x16, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x69, 0x7a, 
0x65, 0x18, 0x0d, 0x20, 0x01, 0x28, + 0x05, 0x42, 0x02, 0x18, 0x01, 0x52, 0x14, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x53, + 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x3f, 0x0a, 0x17, 0x73, + 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x5f, 0x73, 0x69, 0x7a, 0x65, 0x18, 0x0e, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x07, 0x31, 0x30, + 0x30, 0x30, 0x30, 0x30, 0x30, 0x52, 0x15, 0x73, 0x65, 0x65, 0x64, 0x53, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x53, 0x69, 0x7a, 0x65, 0x12, 0x2f, 0x0a, 0x10, + 0x73, 0x68, 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x5f, 0x66, 0x61, 0x63, 0x74, 0x6f, 0x72, + 0x18, 0x0f, 0x20, 0x01, 0x28, 0x02, 0x3a, 0x04, 0x30, 0x2e, 0x37, 0x35, 0x52, 0x0f, 0x73, 0x68, + 0x72, 0x69, 0x6e, 0x6b, 0x69, 0x6e, 0x67, 0x46, 0x61, 0x63, 0x74, 0x6f, 0x72, 0x12, 0x34, 0x0a, + 0x13, 0x6d, 0x61, 0x78, 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x6c, 0x65, + 0x6e, 0x67, 0x74, 0x68, 0x18, 0x12, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x04, 0x34, 0x31, 0x39, 0x32, + 0x52, 0x11, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x4c, 0x65, 0x6e, + 0x67, 0x74, 0x68, 0x12, 0x23, 0x0a, 0x0b, 0x6e, 0x75, 0x6d, 0x5f, 0x74, 0x68, 0x72, 0x65, 0x61, + 0x64, 0x73, 0x18, 0x10, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, 0x0a, 0x6e, 0x75, + 0x6d, 0x54, 0x68, 0x72, 0x65, 0x61, 0x64, 0x73, 0x12, 0x2f, 0x0a, 0x12, 0x6e, 0x75, 0x6d, 0x5f, + 0x73, 0x75, 0x62, 0x5f, 0x69, 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x11, + 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x10, 0x6e, 0x75, 0x6d, 0x53, 0x75, 0x62, 0x49, + 0x74, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x3c, 0x0a, 0x18, 0x6d, 0x61, 0x78, + 0x5f, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x6c, + 0x65, 0x6e, 0x67, 0x74, 0x68, 0x18, 0x14, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x31, 0x36, 0x52, + 0x16, 0x6d, 0x61, 0x78, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x4c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x12, 0x3b, 0x0a, 0x17, 0x73, 0x70, 0x6c, 0x69, 0x74, + 0x5f, 0x62, 0x79, 0x5f, 0x75, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x5f, 0x73, 0x63, 0x72, 0x69, + 0x70, 0x74, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x14, + 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x55, 0x6e, 0x69, 0x63, 0x6f, 0x64, 0x65, 0x53, 0x63, + 0x72, 0x69, 0x70, 0x74, 0x12, 0x2c, 0x0a, 0x0f, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, + 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x17, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, + 0x72, 0x75, 0x65, 0x52, 0x0d, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x4e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x12, 0x34, 0x0a, 0x13, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x62, 0x79, 0x5f, 0x77, + 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x18, 0x16, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x11, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x42, 0x79, 0x57, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x42, 0x0a, 0x1a, 0x74, 0x72, 0x65, 0x61, + 0x74, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x61, 0x73, 0x5f, + 0x73, 0x75, 0x66, 0x66, 0x69, 0x78, 0x18, 0x18, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, + 0x6c, 0x73, 0x65, 0x52, 0x17, 0x74, 0x72, 0x65, 0x61, 0x74, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, + 0x70, 0x61, 0x63, 0x65, 0x41, 0x73, 0x53, 0x75, 0x66, 0x66, 0x69, 0x78, 0x12, 0x46, 
0x0a, 0x1c, + 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, + 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x18, 0x1a, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x61, 0x6c, 0x6c, 0x6f, 0x77, + 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x4f, 0x6e, 0x6c, 0x79, 0x50, 0x69, + 0x65, 0x63, 0x65, 0x73, 0x12, 0x28, 0x0a, 0x0c, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x5f, 0x64, 0x69, + 0x67, 0x69, 0x74, 0x73, 0x18, 0x19, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, + 0x65, 0x52, 0x0b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x69, 0x67, 0x69, 0x74, 0x73, 0x12, 0x3d, + 0x0a, 0x19, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x64, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x18, 0x35, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x00, 0x52, 0x18, 0x70, 0x72, 0x65, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x69, 0x7a, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x44, 0x65, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x65, 0x72, 0x12, 0x27, 0x0a, + 0x0f, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, + 0x18, 0x1e, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0e, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x53, + 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x30, 0x0a, 0x14, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x64, + 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, 0x73, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x18, 0x1f, + 0x20, 0x03, 0x28, 0x09, 0x52, 0x12, 0x75, 0x73, 0x65, 0x72, 0x44, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x64, 0x53, 0x79, 0x6d, 0x62, 0x6f, 0x6c, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x72, 0x65, 0x71, 0x75, + 0x69, 0x72, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x18, 0x24, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x0d, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x12, + 0x2a, 0x0a, 0x0d, 0x62, 0x79, 0x74, 0x65, 0x5f, 0x66, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, + 0x18, 0x23, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x0c, 0x62, + 0x79, 0x74, 0x65, 0x46, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x12, 0x47, 0x0a, 0x1d, 0x76, + 0x6f, 0x63, 0x61, 0x62, 0x75, 0x6c, 0x61, 0x72, 0x79, 0x5f, 0x6f, 0x75, 0x74, 0x70, 0x75, 0x74, + 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x5f, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x18, 0x20, 0x20, 0x01, + 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x1a, 0x76, 0x6f, 0x63, 0x61, 0x62, 0x75, + 0x6c, 0x61, 0x72, 0x79, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x50, 0x69, 0x65, 0x63, 0x65, 0x53, + 0x63, 0x6f, 0x72, 0x65, 0x12, 0x2e, 0x0a, 0x10, 0x68, 0x61, 0x72, 0x64, 0x5f, 0x76, 0x6f, 0x63, + 0x61, 0x62, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x21, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, + 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x68, 0x61, 0x72, 0x64, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x4c, + 0x69, 0x6d, 0x69, 0x74, 0x12, 0x29, 0x0a, 0x0d, 0x75, 0x73, 0x65, 0x5f, 0x61, 0x6c, 0x6c, 0x5f, + 0x76, 0x6f, 0x63, 0x61, 0x62, 0x18, 0x22, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, + 0x73, 0x65, 0x52, 0x0b, 0x75, 0x73, 0x65, 0x41, 0x6c, 0x6c, 0x56, 0x6f, 0x63, 0x61, 0x62, 0x12, + 0x18, 0x0a, 0x06, 0x75, 0x6e, 0x6b, 0x5f, 0x69, 0x64, 0x18, 0x28, 0x20, 0x01, 0x28, 0x05, 0x3a, + 0x01, 0x30, 0x52, 0x05, 0x75, 0x6e, 0x6b, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x62, 0x6f, 0x73, + 0x5f, 0x69, 0x64, 0x18, 0x29, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x31, 0x52, 0x05, 0x62, 0x6f, + 0x73, 0x49, 0x64, 0x12, 0x18, 0x0a, 0x06, 0x65, 0x6f, 0x73, 0x5f, 0x69, 0x64, 0x18, 0x2a, 0x20, + 0x01, 0x28, 
0x05, 0x3a, 0x01, 0x32, 0x52, 0x05, 0x65, 0x6f, 0x73, 0x49, 0x64, 0x12, 0x19, 0x0a, + 0x06, 0x70, 0x61, 0x64, 0x5f, 0x69, 0x64, 0x18, 0x2b, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x02, 0x2d, + 0x31, 0x52, 0x05, 0x70, 0x61, 0x64, 0x49, 0x64, 0x12, 0x22, 0x0a, 0x09, 0x75, 0x6e, 0x6b, 0x5f, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2d, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x75, 0x6e, + 0x6b, 0x3e, 0x52, 0x08, 0x75, 0x6e, 0x6b, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x20, 0x0a, 0x09, + 0x62, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x09, 0x3a, + 0x03, 0x3c, 0x73, 0x3e, 0x52, 0x08, 0x62, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x21, + 0x0a, 0x09, 0x65, 0x6f, 0x73, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x2f, 0x20, 0x01, 0x28, + 0x09, 0x3a, 0x04, 0x3c, 0x2f, 0x73, 0x3e, 0x52, 0x08, 0x65, 0x6f, 0x73, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x22, 0x0a, 0x09, 0x70, 0x61, 0x64, 0x5f, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x30, + 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x3c, 0x70, 0x61, 0x64, 0x3e, 0x52, 0x08, 0x70, 0x61, 0x64, + 0x50, 0x69, 0x65, 0x63, 0x65, 0x12, 0x26, 0x0a, 0x0b, 0x75, 0x6e, 0x6b, 0x5f, 0x73, 0x75, 0x72, + 0x66, 0x61, 0x63, 0x65, 0x18, 0x2c, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x05, 0x20, 0xe2, 0x81, 0x87, + 0x20, 0x52, 0x0a, 0x75, 0x6e, 0x6b, 0x53, 0x75, 0x72, 0x66, 0x61, 0x63, 0x65, 0x12, 0x46, 0x0a, + 0x1c, 0x74, 0x72, 0x61, 0x69, 0x6e, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, + 0x5f, 0x6c, 0x61, 0x72, 0x67, 0x65, 0x5f, 0x63, 0x6f, 0x72, 0x70, 0x75, 0x73, 0x18, 0x31, 0x20, + 0x01, 0x28, 0x08, 0x3a, 0x05, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x52, 0x19, 0x74, 0x72, 0x61, 0x69, + 0x6e, 0x45, 0x78, 0x74, 0x72, 0x65, 0x6d, 0x65, 0x6c, 0x79, 0x4c, 0x61, 0x72, 0x67, 0x65, 0x43, + 0x6f, 0x72, 0x70, 0x75, 0x73, 0x12, 0x3a, 0x0a, 0x18, 0x73, 0x65, 0x65, 0x64, 0x5f, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x5f, 0x66, 0x69, 0x6c, + 0x65, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x3a, 0x00, 0x52, 0x16, 0x73, 0x65, 0x65, 0x64, 0x53, + 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x46, 0x69, 0x6c, + 0x65, 0x22, 0x35, 0x0a, 0x09, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x79, 0x70, 0x65, 0x12, 0x0b, + 0x0a, 0x07, 0x55, 0x4e, 0x49, 0x47, 0x52, 0x41, 0x4d, 0x10, 0x01, 0x12, 0x07, 0x0a, 0x03, 0x42, + 0x50, 0x45, 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 0x57, 0x4f, 0x52, 0x44, 0x10, 0x03, 0x12, 0x08, + 0x0a, 0x04, 0x43, 0x48, 0x41, 0x52, 0x10, 0x04, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0xbd, 0x02, 0x0a, 0x0e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, + 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x31, 0x0a, 0x14, 0x70, 0x72, + 0x65, 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x5f, 0x63, 0x68, 0x61, 0x72, 0x73, 0x6d, + 0x61, 0x70, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x13, 0x70, 0x72, 0x65, 0x63, 0x6f, 0x6d, + 0x70, 0x69, 0x6c, 0x65, 0x64, 0x43, 0x68, 0x61, 0x72, 0x73, 0x6d, 0x61, 0x70, 0x12, 0x2e, 0x0a, + 0x10, 0x61, 0x64, 0x64, 0x5f, 0x64, 0x75, 0x6d, 0x6d, 0x79, 0x5f, 0x70, 0x72, 0x65, 0x66, 0x69, + 0x78, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x0e, 0x61, + 0x64, 0x64, 0x44, 0x75, 0x6d, 0x6d, 0x79, 0x50, 0x72, 0x65, 0x66, 0x69, 0x78, 0x12, 0x3e, 0x0a, + 0x18, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x5f, 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, 0x77, 0x68, + 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 
0x63, 0x65, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x3a, + 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, 0x16, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x78, 0x74, + 0x72, 0x61, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73, 0x12, 0x33, 0x0a, + 0x12, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x5f, 0x77, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, + 0x63, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x3a, 0x04, 0x74, 0x72, 0x75, 0x65, 0x52, + 0x11, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x57, 0x68, 0x69, 0x74, 0x65, 0x73, 0x70, 0x61, 0x63, + 0x65, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, + 0x69, 0x6f, 0x6e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x5f, 0x74, 0x73, 0x76, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x14, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x54, 0x73, 0x76, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x22, 0x93, 0x01, 0x0a, 0x0c, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, + 0x44, 0x61, 0x74, 0x61, 0x12, 0x3c, 0x0a, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x73, 0x18, + 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, + 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, 0x74, 0x44, 0x61, + 0x74, 0x61, 0x2e, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x52, 0x07, 0x73, 0x61, 0x6d, 0x70, 0x6c, + 0x65, 0x73, 0x1a, 0x3a, 0x0a, 0x06, 0x53, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x12, 0x14, 0x0a, 0x05, + 0x69, 0x6e, 0x70, 0x75, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x69, 0x6e, 0x70, + 0x75, 0x74, 0x12, 0x1a, 0x0a, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2a, 0x09, + 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x22, 0xd7, 0x04, 0x0a, 0x0a, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x3f, 0x0a, 0x06, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, + 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x52, 0x06, 0x70, 0x69, 0x65, 0x63, 0x65, 0x73, 0x12, 0x3d, 0x0a, 0x0c, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, + 0x1a, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, + 0x54, 0x72, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x0b, 0x74, 0x72, 0x61, + 0x69, 0x6e, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x12, 0x46, 0x0a, 0x0f, 0x6e, 0x6f, 0x72, 0x6d, + 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, + 0x65, 0x2e, 0x4e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x52, 0x0e, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, + 0x12, 0x41, 0x0a, 0x0e, 0x73, 0x65, 0x6c, 0x66, 0x5f, 0x74, 0x65, 0x73, 0x74, 0x5f, 0x64, 0x61, + 0x74, 0x61, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1b, 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, + 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x53, 0x65, 0x6c, 0x66, 0x54, 0x65, 0x73, + 0x74, 0x44, 0x61, 0x74, 0x61, 0x52, 0x0c, 0x73, 0x65, 0x6c, 
0x66, 0x54, 0x65, 0x73, 0x74, 0x44, + 0x61, 0x74, 0x61, 0x12, 0x4a, 0x0a, 0x11, 0x64, 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, + 0x7a, 0x65, 0x72, 0x5f, 0x73, 0x70, 0x65, 0x63, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, + 0x2e, 0x73, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4e, + 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x52, 0x10, 0x64, + 0x65, 0x6e, 0x6f, 0x72, 0x6d, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x72, 0x53, 0x70, 0x65, 0x63, 0x1a, + 0xe6, 0x01, 0x0a, 0x0d, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, 0x69, 0x65, 0x63, + 0x65, 0x12, 0x14, 0x0a, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x05, 0x70, 0x69, 0x65, 0x63, 0x65, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, + 0x18, 0x02, 0x20, 0x01, 0x28, 0x02, 0x52, 0x05, 0x73, 0x63, 0x6f, 0x72, 0x65, 0x12, 0x48, 0x0a, + 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x2c, 0x2e, 0x73, 0x65, + 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x70, 0x69, 0x65, 0x63, 0x65, 0x2e, 0x4d, 0x6f, 0x64, 0x65, + 0x6c, 0x50, 0x72, 0x6f, 0x74, 0x6f, 0x2e, 0x53, 0x65, 0x6e, 0x74, 0x65, 0x6e, 0x63, 0x65, 0x50, + 0x69, 0x65, 0x63, 0x65, 0x2e, 0x54, 0x79, 0x70, 0x65, 0x3a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, + 0x4c, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x22, 0x54, 0x0a, 0x04, 0x54, 0x79, 0x70, 0x65, 0x12, + 0x0a, 0x0a, 0x06, 0x4e, 0x4f, 0x52, 0x4d, 0x41, 0x4c, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x55, + 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x02, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x4f, 0x4e, 0x54, + 0x52, 0x4f, 0x4c, 0x10, 0x03, 0x12, 0x10, 0x0a, 0x0c, 0x55, 0x53, 0x45, 0x52, 0x5f, 0x44, 0x45, + 0x46, 0x49, 0x4e, 0x45, 0x44, 0x10, 0x04, 0x12, 0x08, 0x0a, 0x04, 0x42, 0x59, 0x54, 0x45, 0x10, + 0x06, 0x12, 0x0a, 0x0a, 0x06, 0x55, 0x4e, 0x55, 0x53, 0x45, 0x44, 0x10, 0x05, 0x2a, 0x09, 0x08, + 0xc8, 0x01, 0x10, 0x80, 0x80, 0x80, 0x80, 0x02, 0x2a, 0x09, 0x08, 0xc8, 0x01, 0x10, 0x80, 0x80, + 0x80, 0x80, 0x02, 0x42, 0x02, 0x48, 0x03, +} + +var ( + file_sentencepiece_model_proto_rawDescOnce sync.Once + file_sentencepiece_model_proto_rawDescData = file_sentencepiece_model_proto_rawDesc +) + +func file_sentencepiece_model_proto_rawDescGZIP() []byte { + file_sentencepiece_model_proto_rawDescOnce.Do(func() { + file_sentencepiece_model_proto_rawDescData = protoimpl.X.CompressGZIP(file_sentencepiece_model_proto_rawDescData) + }) + return file_sentencepiece_model_proto_rawDescData +} + +var file_sentencepiece_model_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_sentencepiece_model_proto_msgTypes = make([]protoimpl.MessageInfo, 6) +var file_sentencepiece_model_proto_goTypes = []interface{}{ + (TrainerSpec_ModelType)(0), // 0: sentencepiece.TrainerSpec.ModelType + (ModelProto_SentencePiece_Type)(0), // 1: sentencepiece.ModelProto.SentencePiece.Type + (*TrainerSpec)(nil), // 2: sentencepiece.TrainerSpec + (*NormalizerSpec)(nil), // 3: sentencepiece.NormalizerSpec + (*SelfTestData)(nil), // 4: sentencepiece.SelfTestData + (*ModelProto)(nil), // 5: sentencepiece.ModelProto + (*SelfTestData_Sample)(nil), // 6: sentencepiece.SelfTestData.Sample + (*ModelProto_SentencePiece)(nil), // 7: sentencepiece.ModelProto.SentencePiece +} +var file_sentencepiece_model_proto_depIdxs = []int32{ + 0, // 0: sentencepiece.TrainerSpec.model_type:type_name -> sentencepiece.TrainerSpec.ModelType + 6, // 1: sentencepiece.SelfTestData.samples:type_name -> sentencepiece.SelfTestData.Sample + 7, // 2: sentencepiece.ModelProto.pieces:type_name -> 
sentencepiece.ModelProto.SentencePiece + 2, // 3: sentencepiece.ModelProto.trainer_spec:type_name -> sentencepiece.TrainerSpec + 3, // 4: sentencepiece.ModelProto.normalizer_spec:type_name -> sentencepiece.NormalizerSpec + 4, // 5: sentencepiece.ModelProto.self_test_data:type_name -> sentencepiece.SelfTestData + 3, // 6: sentencepiece.ModelProto.denormalizer_spec:type_name -> sentencepiece.NormalizerSpec + 1, // 7: sentencepiece.ModelProto.SentencePiece.type:type_name -> sentencepiece.ModelProto.SentencePiece.Type + 8, // [8:8] is the sub-list for method output_type + 8, // [8:8] is the sub-list for method input_type + 8, // [8:8] is the sub-list for extension type_name + 8, // [8:8] is the sub-list for extension extendee + 0, // [0:8] is the sub-list for field type_name +} + +func init() { file_sentencepiece_model_proto_init() } +func file_sentencepiece_model_proto_init() { + if File_sentencepiece_model_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_sentencepiece_model_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*TrainerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*NormalizerSpec); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*SelfTestData_Sample); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_sentencepiece_model_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ModelProto_SentencePiece); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + case 3: + return &v.extensionFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_sentencepiece_model_proto_rawDesc, + NumEnums: 2, + NumMessages: 6, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_sentencepiece_model_proto_goTypes, + DependencyIndexes: file_sentencepiece_model_proto_depIdxs, + EnumInfos: file_sentencepiece_model_proto_enumTypes, + MessageInfos: file_sentencepiece_model_proto_msgTypes, + }.Build() + File_sentencepiece_model_proto = out.File + file_sentencepiece_model_proto_rawDesc = nil + file_sentencepiece_model_proto_goTypes = nil + file_sentencepiece_model_proto_depIdxs = nil +} diff --git a/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto 
b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
new file mode 100644
index 000000000000..a48f7cc845d8
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/model/sentencepiece_model.proto
@@ -0,0 +1,332 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+// TODO(taku): Needs to use LITE RUNTIME in OSS release.
+option optimize_for = LITE_RUNTIME;
+
+package sentencepiece;
+
+// TrainerSpec encodes various parameters for SentencePiece training.
+// Next id: 55
+message TrainerSpec {
+  ///////////////////////////////////////////////////////////////////
+  // General parameters
+  //
+  // Input corpus files.
+  // Trainer accepts the following two formats:
+  // A) Monolingual: plain text, one sentence per line.
+  // B) Bilingual: TSV, source sentence <tab> target sentence
+  // When bilingual data is passed, a shared vocabulary model is built.
+  // Note that the input file must be raw corpus, not a preprocessed corpus.
+  // Trainer only loads the first `input_sentence_size` sentences specified
+  // with this parameter.
+  repeated string input = 1;
+
+  // Input corpus format:
+  // "text": one-sentence-per-line text format (default)
+  // "tsv":  sentence <tab> freq
+  optional string input_format = 7;
+
+  // Output model file prefix.
+  // <model_prefix>.model and <model_prefix>.vocab are generated.
+  optional string model_prefix = 2;
+
+  // Model type. UNIGRAM is the default.
+  enum ModelType {
+    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
+    BPE = 2;      // Byte Pair Encoding
+    WORD = 3;     // Delimited by whitespace.
+    CHAR = 4;     // Tokenizes into character sequence.
+  }
+  optional ModelType model_type = 3 [default = UNIGRAM];
+
+  // Vocabulary size. 8k is the default size.
+  optional int32 vocab_size = 4 [default = 8000];
+
+  // List of the languages this model can accept.
+  // Since the model is language-agnostic, this field is used as a reference.
+  repeated string accept_language = 5;
+
+  // Size of self-test samples, which are encoded in the model file.
+  optional int32 self_test_sample_size = 6 [default = 0];
+
+  // Whether to use the DP version of sentencepiece. Use it with TSV input
+  // format (requires precomputed word tab counts to work).
+  optional bool enable_differential_privacy = 50 [default = false];
+  // Set these parameters if you need the DP version of sentencepiece.
+  // Std of the noise to add.
+  optional float differential_privacy_noise_level = 51 [default = 0.0];
+  // Clipping threshold to apply after adding noise. All the words with
+  // frequency less than this value are dropped.
+  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];
+
+  ///////////////////////////////////////////////////////////////////
+  // Training parameters.
+  //
+  // Uses characters which cover the corpus with the ratio of
+  // `character_coverage`. This parameter determines the basic alphabet of
+  // sentence pieces. 1.0 - `character_coverage` characters are treated as UNK.
+  // See also required_chars field.
+  optional float character_coverage = 10 [default = 0.9995];
+
+  // Maximum size of sentences the trainer loads from the `input` parameter.
+  // Trainer simply loads the `input` files in sequence.
+  // It is better to shuffle the input corpus randomly.
+  optional uint64 input_sentence_size = 11 [default = 0];
+  optional bool shuffle_input_sentence = 19 [default = true];
+
+  // Maximum size of sentences to make seed sentence pieces.
+  // An extended suffix array is constructed to extract frequent
+  // sub-strings from the corpus. This uses 20N working space,
+  // where N is the size of the corpus.
+  optional int32 mining_sentence_size = 12 [deprecated = true];
+
+  // Maximum size of sentences to train sentence pieces.
+  optional int32 training_sentence_size = 13 [deprecated = true];
+
+  // The size of seed sentencepieces.
+  // `seed_sentencepiece_size` must be larger than `vocab_size`.
+  optional int32 seed_sentencepiece_size = 14 [default = 1000000];
+
+  // In every EM sub-iteration, keeps the top
+  // `shrinking_factor` * `current sentencepiece size` pieces with respect to
+  // the loss of each sentence piece. This value should be smaller than 1.0.
+  optional float shrinking_factor = 15 [default = 0.75];
+
+  // The maximum sentence length in bytes. Sentences longer than
+  // `max_sentence_length` are simply ignored.
+  // Longer input tends to bring the following risks:
+  //  * Overflow during EM training (unigram language model only)
+  //  * Performance drop because of the O(n log n) cost in BPE.
+  optional int32 max_sentence_length = 18 [default = 4192];
+
+  // Number of threads in the training.
+  optional int32 num_threads = 16 [default = 16];
+
+  // Number of EM sub-iterations.
+  optional int32 num_sub_iterations = 17 [default = 2];
+
+  ///////////////////////////////////////////////////////////////////
+  // SentencePiece parameters which control the shapes of sentence pieces.
+  //
+  // Maximum length of a sentencepiece.
+  optional int32 max_sentencepiece_length = 20 [default = 16];
+
+  // Uses Unicode script to split sentence pieces.
+  // When `split_by_unicode_script` is true, we do not allow a sentence piece
+  // to include multiple Unicode scripts, e.g. "F1" is not a valid piece.
+  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
+  // as one script type, since a Japanese word can consist of multiple scripts.
+  // This exception is always applied regardless of the accept-language
+  // parameter.
+  optional bool split_by_unicode_script = 21 [default = true];
+
+  // When `split_by_number` is true, puts a boundary at each transition
+  // between number and non-number characters. To treat "F1" as one token,
+  // set this flag to false.
+  optional bool split_by_number = 23 [default = true];
+
+  // Uses whitespace to split sentence pieces.
+  // When `split_by_whitespace` is false, a piece may contain whitespace
+  // in the middle, e.g., "in_the".
+  optional bool split_by_whitespace = 22 [default = true];
+
+  // Adds the whitespace symbol (_) as a suffix instead of a prefix, e.g.,
+  // _hello => hello_. When `treat_whitespace_as_suffix` is true,
+  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
+  // of the sentence.
+  optional bool treat_whitespace_as_suffix = 24 [default = false];
+
+  // Allows pieces that only contain whitespace instead of appearing only as
+  // a prefix or suffix of other pieces.
+  optional bool allow_whitespace_only_pieces = 26 [default = false];
+
+  // Split all digits (0-9) into separate pieces.
+  optional bool split_digits = 25 [default = false];
+
+  // Defines the pre-tokenization delimiter.
+  // When specified, no piece crossing this delimiter is included in the
+  // vocab; the delimiter string is then virtually ignored during training.
+  // This field allows constraints on the vocabulary selection. Note that
+  // this field is only available in unigram mode.
+  optional string pretokenization_delimiter = 53 [default = ""];
+
+  ///////////////////////////////////////////////////////////////////
+  // Vocabulary management
+  //
+  // Defines control symbols used as an indicator to
+  // change the behavior of the decoder. <s> and </s> are pre-defined.
+  // We can use this field to encode various meta information,
+  // including a language indicator in multilingual models.
+  // These symbols are not visible to users, but visible to
+  // the decoder. Note that when the input sentence contains control symbols,
+  // they are not treated as one token, but segmented into normal pieces.
+  // Control symbols must be inserted independently from the segmentation.
+  repeated string control_symbols = 30;
+
+  // Defines user-defined symbols.
+  // These symbols are added with an extremely high score
+  // so they are always treated as one unique symbol in any context.
+  // Typical usage of user_defined_symbols is as placeholders for named
+  // entities.
+  repeated string user_defined_symbols = 31;
+
+  // Defines required characters. Each UTF8 character in this string is
+  // included in the character set regardless of the character_coverage value.
+  // Unlike user_defined_symbols, these characters have scores based on their
+  // frequency in the input sentences, and the model can form subwords using
+  // characters in this field.
+  optional string required_chars = 36;
+
+  // Decomposes unknown pieces into UTF-8 bytes.
+  optional bool byte_fallback = 35 [default = false];
+
+  // When creating the vocabulary file, defines whether or not to additionally
+  // output the score for each piece.
+  optional bool vocabulary_output_piece_score = 32 [default = true];
+
+  // `vocab_size` is treated as a hard limit: training crashes if
+  // the model cannot produce a vocab of size `vocab_size`.
+  // When `hard_vocab_limit` is false, vocab_size is treated
+  // as a soft limit. Note that when model_type=char,
+  // hard_vocab_limit = false is always assumed.
+  optional bool hard_vocab_limit = 33 [default = true];
+
+  // Uses all symbols for vocab extraction. This flag is valid
+  // if the model type is either CHAR or WORD.
+  optional bool use_all_vocab = 34 [default = false];
+
+  ///////////////////////////////////////////////////////////////////
+  // Reserved special meta tokens.
+  // * -1 is not used.
+  // * unk_id must not be -1.
+  // Ids must start with 0 and be contiguous.
+  optional int32 unk_id = 40 [default = 0];   // <unk>
+  optional int32 bos_id = 41 [default = 1];   // <s>
+  optional int32 eos_id = 42 [default = 2];   // </s>
+  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
+  optional string unk_piece = 45 [default = "<unk>"];
+  optional string bos_piece = 46 [default = "<s>"];
+  optional string eos_piece = 47 [default = "</s>"];
+  optional string pad_piece = 48 [default = "<pad>"];
+
+  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
+  // since this character can be useful both for user and
+  // developer. We can easily figure out that <unk> is emitted.
+  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];
+
+  // Increases bit depth to allow unigram model training on large
+  // (>10M sentences) corpora. A side-effect of enabling this flag
+  // is increased memory usage.
+  optional bool train_extremely_large_corpus = 49 [default = false];
+
+  // Path to a seed sentencepieces file, with one tab-separated
+  // seed sentencepiece <tab> frequency per line.
+  optional string seed_sentencepieces_file = 54 [default = ""];
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// NormalizerSpec encodes various parameters for string normalization.
+message NormalizerSpec {
+  // Name of the normalization rule.
+  optional string name = 1;
+
+  // Pre-compiled normalization rule created by the
+  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
+  // Usually this field is set by the Builder::GetNormalizerSpec() method.
+  optional bytes precompiled_charsmap = 2;
+
+  // Adds dummy whitespace at the beginning of text in order to
+  // treat "world" in "world" and "hello world" in the same way.
+  optional bool add_dummy_prefix = 3 [default = true];
+
+  // Removes leading, trailing, and duplicate internal whitespace.
+  optional bool remove_extra_whitespaces = 4 [default = true];
+
+  // Replaces whitespace with the meta symbol.
+  // This field must be true to train a sentence piece model.
+  optional bool escape_whitespaces = 5 [default = true];
+
+  // Custom normalization rule file in TSV format.
+  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
+  // This field is only used in SentencePieceTrainer::Train() method, which
+  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
+  optional string normalization_rule_tsv = 6;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// Proto to store samples for self-testing.
+message SelfTestData {
+  message Sample {
+    optional string input = 1;
+    optional string expected = 2;
+  }
+  repeated Sample samples = 1;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
+
+// ModelProto stores model parameters.
+// SentencePieceProcessor is supposed to be self-contained.
+// All settings/parameters which may change the behavior must be encoded
+// in ModelProto.
+message ModelProto {
+  message SentencePiece {
+    enum Type {
+      NORMAL = 1;        // normal symbol
+      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
+      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
+      USER_DEFINED = 4;  // user defined symbols.
+                         // Typical usage of USER_DEFINED symbols
+                         // is as placeholders.
+      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
+      UNUSED = 5;        // this piece is not used.
+    }
+    optional string piece = 1;  // piece must not be empty.
+    optional float score = 2;
+    optional Type type = 3 [default = NORMAL];
+
+    // Customized extensions: this range of field numbers
+    // is open to third-party extensions.
+    extensions 200 to max;
+  }
+
+  // Sentence pieces with scores.
+  repeated SentencePiece pieces = 1;
+
+  // Spec used to generate this model file.
+  optional TrainerSpec trainer_spec = 2;
+
+  // Spec for text normalization.
+  optional NormalizerSpec normalizer_spec = 3;
+
+  // Stores sample input and its expected segmentation to verify the model.
+  optional SelfTestData self_test_data = 4;
+
+  // Spec for text de-normalization.
+  optional NormalizerSpec denormalizer_spec = 5;
+
+  // Customized extensions: this range of field numbers
+  // is open to third-party extensions.
+  extensions 200 to max;
+}
diff --git a/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go b/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
new file mode 100644
index 000000000000..540b1774df13
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/prefixmatcher/prefixmatcher.go
@@ -0,0 +1,82 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package prefixmatcher
+
+import (
+	"unicode/utf8"
+)
+
+// PrefixMatcher helps find longest prefixes. See [FindPrefixLen].
+type PrefixMatcher struct {
+	root *trieNode
+}
+
+type trieNode struct {
+	children map[rune]*trieNode
+	final    bool
+}
+
+// NewFromSet creates a new [PrefixMatcher] from a set of strings that
+// represent the vocabulary.
+func NewFromSet(vocab map[string]bool) *PrefixMatcher {
+	pm := &PrefixMatcher{root: newNode()}
+	for word := range vocab {
+		pm.add(word)
+	}
+	return pm
+}
+
+// FindPrefixLen finds the longest prefix of text that matches a vocabulary
+// word, and returns its length in bytes. If 0 is returned, no prefix was
+// found.
+func (pm *PrefixMatcher) FindPrefixLen(text string) int {
+	node := pm.root
+	maxLen := 0
+
+	for i, r := range text {
+		child := node.children[r]
+		if child == nil {
+			// r not found in this node, so we're done.
+			return maxLen
+		}
+		if child.final {
+			maxLen = i + utf8.RuneLen(r)
+		}
+		node = child
+	}
+
+	return maxLen
+}
+
+func (pm *PrefixMatcher) add(word string) {
+	node := pm.root
+
+	for _, r := range word {
+		child := node.children[r]
+		if child == nil {
+			child = newNode()
+			node.children[r] = child
+		}
+		node = child
+	}
+
+	node.final = true
+}
+
+func newNode() *trieNode {
+	return &trieNode{
+		children: make(map[rune]*trieNode),
+		final:    false,
+	}
+}
diff --git a/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go b/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
new file mode 100644
index 000000000000..ed8e14e27b90
--- /dev/null
+++ b/vertexai/internal/sentencepiece/internal/priorityqueue/priorityqueue.go
@@ -0,0 +1,108 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package priorityqueue provides a generic priority queue with Insert
+// and PopMax operations.
+package priorityqueue
+
+// PriorityQueue is a generic priority queue with a configurable comparison
+// function.
+type PriorityQueue[T any] struct {
+	cmp func(a, b T) int
+
+	// items holds the queue's items as a binary heap.
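+	//
+	// For example (values made up for illustration), inserting 5, 3 and 4
+	// with a numeric cmp yields:
+	//
+	//	items: [_, 5, 3, 4]
+	//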
+	// items[0] is a dummy element that's not used. If the queue has N elements,
+	// they are stored at indices 1...N (N == len(items)-1).
+	// For an element at index i, its parent is at index i/2, and its children
+	// are at indices 2i and 2i+1. The root of the heap is at index 1.
+	items []T
+}
+
+// New creates a new PriorityQueue, configured with a function that
+// compares the priorities of two items a and b; it should return a number > 0
+// if the priority of a is higher, 0 if the priorities are equal, and a
+// number < 0 otherwise.
+func New[T any](cmp func(a, b T) int) *PriorityQueue[T] {
+	return &PriorityQueue[T]{cmp: cmp, items: make([]T, 1)}
+}
+
+// Len returns the length (number of items) of the priority queue.
+func (pq *PriorityQueue[T]) Len() int {
+	return len(pq.items) - 1
+}
+
+// Insert inserts a new element into the priority queue.
+func (pq *PriorityQueue[T]) Insert(elem T) {
+	pq.items = append(pq.items, elem)
+	pq.siftup(len(pq.items) - 1)
+}
+
+// PopMax returns the element with the maximal priority in the queue, and
+// removes it from the queue. Warning: to maintain a clean API, PopMax panics
+// if the queue is empty. Make sure to check Len() first.
+func (pq *PriorityQueue[T]) PopMax() T {
+	if len(pq.items) < 2 {
+		panic("popping from empty priority queue")
+	}
+	maxItem := pq.items[1]
+	pq.items[1] = pq.items[len(pq.items)-1]
+	pq.items = pq.items[:len(pq.items)-1]
+	pq.siftdown()
+	return maxItem
+}
+
+func (pq *PriorityQueue[T]) siftup(n int) {
+	i := n
+	for {
+		if i == 1 {
+			// Reached root, we're done.
+			return
+		}
+		// p is the index of i's parent.
+		// If the parent's priority is at least as high as i's, we're done.
+		p := i / 2
+		if pq.cmp(pq.items[p], pq.items[i]) >= 0 {
+			return
+		}
+		pq.items[i], pq.items[p] = pq.items[p], pq.items[i]
+		i = p
+	}
+}
+
+func (pq *PriorityQueue[T]) siftdown() {
+	i := 1
+	for {
+		c := 2 * i
+		if c >= len(pq.items) {
+			return
+		}
+		// c is not out of bounds, so it's the index of the left child of i.
+
+		// Figure out the child index with the maximal priority.
+		maxChild := c
+		if c+1 < len(pq.items) {
+			// c+1 is not out of bounds, so it's the index of the right child of i.
+			if pq.cmp(pq.items[c+1], pq.items[c]) > 0 {
+				maxChild = c + 1
+			}
+		}
+		if pq.cmp(pq.items[i], pq.items[maxChild]) >= 0 {
+			// i's priority is at least as high as its children's, so we're done.
+			return
+		}
+
+		pq.items[i], pq.items[maxChild] = pq.items[maxChild], pq.items[i]
+		i = maxChild
+	}
+}
diff --git a/vertexai/internal/sentencepiece/normalize.go b/vertexai/internal/sentencepiece/normalize.go
new file mode 100644
index 000000000000..6fb4f8674675
--- /dev/null
+++ b/vertexai/internal/sentencepiece/normalize.go
@@ -0,0 +1,34 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sentencepiece
+
+import "strings"
+
+// normalize performs Unicode normalization.
+//
+// SentencePiece has a feature to perform configurable Unicode normalization on
+// the input text and has some options for adding dummy whitespace prefixes or
+// trimming whitespace. However, the model we're working with has a very simple
+// normalizer that does none of this. These options can be added in the future
+// if needed.
+func normalize(text string) string {
+	return replaceSeparator(text)
+}
+
+// replaceSeparator replaces spaces with the whitespace separator used by
+// the model.
+func replaceSeparator(text string) string {
+	return strings.ReplaceAll(text, " ", "▁")
+}
diff --git a/vertexai/internal/sentencepiece/token.go b/vertexai/internal/sentencepiece/token.go
new file mode 100644
index 000000000000..1af86755ea16
--- /dev/null
+++ b/vertexai/internal/sentencepiece/token.go
@@ -0,0 +1,29 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sentencepiece
+
+import "fmt"
+
+// Token represents a single token from the input text. ID is a unique token
+// identifier that the model uses in its internal representation. Text is
+// the piece of text this token represents.
+type Token struct {
+	ID   int
+	Text string
+}
+
+func (t Token) String() string {
+	return fmt.Sprintf("Token{ID: %v, Text: %q}", t.ID, t.Text)
+}
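
For a quick sanity check of the two vendored internal helpers, here is a
minimal usage sketch (not part of the patch). The vocabulary and numbers are
made up for illustration, and since these packages live under internal/, the
imports below only resolve from inside this module:

	package main

	import (
		"fmt"

		"cloud.google.com/go/vertexai/internal/sentencepiece/internal/prefixmatcher"
		"cloud.google.com/go/vertexai/internal/sentencepiece/internal/priorityqueue"
	)

	func main() {
		// Longest-prefix matching against a toy vocabulary.
		pm := prefixmatcher.NewFromSet(map[string]bool{
			"he": true, "hell": true, "hello": true,
		})
		fmt.Println(pm.FindPrefixLen("hello world")) // 5, i.e. "hello"
		fmt.Println(pm.FindPrefixLen("xyz"))         // 0, no vocab prefix

		// Max-priority queue over ints with a plain numeric comparison.
		pq := priorityqueue.New(func(a, b int) int { return a - b })
		pq.Insert(3)
		pq.Insert(7)
		pq.Insert(5)
		for pq.Len() > 0 {
			fmt.Println(pq.PopMax()) // prints 7, then 5, then 3
		}
	}

The encoder wires these helpers together with the proto model; the sketch
above only exercises them in isolation.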