From 210b6e1739c0d3ea5958ccdd554cb926d507142f Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Thu, 5 May 2022 05:37:21 +0900 Subject: [PATCH 1/9] =?UTF-8?q?load=5Fmodel=E9=96=A2=E6=95=B0=E3=82=92?= =?UTF-8?q?=E5=88=87=E3=82=8A=E5=87=BA=E3=81=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/core.cpp | 112 +++++++++++++++++++++++++++++++--------------- 1 file changed, 76 insertions(+), 36 deletions(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index ee0315714..90d64e600 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #include "core.h" #define NOT_INITIALIZED_ERR "Call initialize() first." +#define NOT_LOADED_ERR "Model is not loaded." #define NOT_FOUND_ERR "No such file or directory: " #define FAILED_TO_OPEN_MODEL_ERR "Unable to open model files." #define FAILED_TO_OPEN_METAS_ERR "Unable to open metas.json." @@ -43,13 +45,19 @@ EMBED_DECL(YUKARIN_S); EMBED_DECL(YUKARIN_SA); EMBED_DECL(DECODE); -const struct { +/** + * 3種類のモデルを一纏めにしたもの + */ +struct VVMODEL { embed::EMBED_RES (*YUKARIN_S)(); embed::EMBED_RES (*YUKARIN_SA)(); embed::EMBED_RES (*DECODE)(); -} MODELS_LIST[] = {{YUKARIN_S, YUKARIN_SA, DECODE}}; +}; +const VVMODEL VVMODEL_LIST[] = { + {YUKARIN_S, YUKARIN_SA, DECODE}, +}; } // namespace EMBED_DECL_NAMESPACE -using EMBED_DECL_NAMESPACE::MODELS_LIST; +using EMBED_DECL_NAMESPACE::VVMODEL_LIST; // 複数モデルある場合のspeaker_idマッピング // {元のspeaker_id: {モデル番号, 新しいspeaker_id}} @@ -76,8 +84,23 @@ SupportedDevices get_supported_devices() { } struct Status { - Status(bool use_gpu_) - : use_gpu(use_gpu_), memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU)) {} + Status(int model_count, bool use_gpu, int cpu_num_threads) + : memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU)) { + yukarin_s_list = std::vector>(model_count); + yukarin_sa_list = std::vector>(model_count); + decode_list = std::vector>(model_count); + + session_options.SetInterOpNumThreads(cpu_num_threads).SetIntraOpNumThreads(cpu_num_threads); + if (use_gpu) { +#ifdef DIRECTML + session_options.DisableMemPattern().SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(session_options, 0)); +#else + const OrtCUDAProviderOptions cuda_options; + session_options.AppendExecutionProvider_CUDA(cuda_options); +#endif + } + } /** * Loads the metas.json. * @@ -89,7 +112,7 @@ struct Status { * version: string * }] */ - bool load(int cpu_num_threads) { + bool load_metas() { embed::Resource metas_file = METAS(); metas = nlohmann::json::parse(metas_file.data, metas_file.data + metas_file.size); @@ -100,36 +123,31 @@ struct Status { supported_styles.insert(style["id"].get()); } } + } - for (const auto MODELS : MODELS_LIST) { - embed::Resource yukarin_s_model = MODELS.YUKARIN_S(); - embed::Resource yukarin_sa_model = MODELS.YUKARIN_SA(); - embed::Resource decode_model = MODELS.DECODE(); - - Ort::SessionOptions session_options; - session_options.SetInterOpNumThreads(cpu_num_threads).SetIntraOpNumThreads(cpu_num_threads); - yukarin_s_list.push_back(Ort::Session(env, yukarin_s_model.data, yukarin_s_model.size, session_options)); - yukarin_sa_list.push_back(Ort::Session(env, yukarin_sa_model.data, yukarin_sa_model.size, session_options)); - if (use_gpu) { -#ifdef DIRECTML - session_options.DisableMemPattern().SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(session_options, 0)); -#else - const OrtCUDAProviderOptions cuda_options; - session_options.AppendExecutionProvider_CUDA(cuda_options); -#endif - } - decode_list.push_back(Ort::Session(env, decode_model.data, decode_model.size, session_options)); - } + /** + * モデルを読み込む + */ + bool load_model(int model_index, int cpu_num_threads) { + const auto VVMODEL = VVMODEL_LIST[model_index]; + embed::Resource yukarin_s_model = VVMODEL.YUKARIN_S(); + embed::Resource yukarin_sa_model = VVMODEL.YUKARIN_SA(); + embed::Resource decode_model = VVMODEL.DECODE(); + + yukarin_s_list[model_index] = + std::move(Ort::Session(env, yukarin_s_model.data, yukarin_s_model.size, session_options)); + yukarin_sa_list[model_index] = + std::move(Ort::Session(env, yukarin_sa_model.data, yukarin_sa_model.size, session_options)); + decode_list[model_index] = std::move(Ort::Session(env, decode_model.data, decode_model.size, session_options)); return true; } std::string root_dir_path; - bool use_gpu; + Ort::SessionOptions session_options; Ort::MemoryInfo memory_info; Ort::Env env{ORT_LOGGING_LEVEL_ERROR}; - std::vector yukarin_s_list, yukarin_sa_list, decode_list; + std::vector> yukarin_s_list, yukarin_sa_list, decode_list; nlohmann::json metas; std::string metas_str; @@ -178,12 +196,19 @@ bool initialize(bool use_gpu, int cpu_num_threads) { return false; } try { - status = std::make_unique(use_gpu); - if (!status->load(cpu_num_threads)) { + const int model_count = std::size(VVMODEL_LIST); + status = std::make_unique(model_count, use_gpu, cpu_num_threads); + if (!status->load_metas()) { return false; } + for (int model_index = 0; model_index < model_count; model_index++) { + if (!status->load_model(model_index, cpu_num_threads)) { + return false; + } + } if (use_gpu) { // 一回走らせて十分なGPUメモリを確保させる + // TODO: 全MODELに対して行う int length = 500; int phoneme_size = 45; std::vector phoneme(length * phoneme_size), f0(length); @@ -231,6 +256,11 @@ bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_i return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); + auto model = std::move(status->yukarin_s_list[model_index]); + if (!model) { + error_message = NOT_LOADED_ERR; + return false; + } try { const char *inputs[] = {"phoneme_list", "speaker_id"}; const char *outputs[] = {"phoneme_length"}; @@ -240,8 +270,8 @@ bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_i to_tensor(&model_speaker_id, speaker_shape)}; Ort::Value output_tensor = to_tensor(output, phoneme_shape); - status->yukarin_s_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), - input_tensors.size(), outputs, &output_tensor, 1); + model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), input_tensors.size(), outputs, + &output_tensor, 1); for (int64_t i = 0; i < length; i++) { if (output[i] < PHONEME_LENGTH_MINIMAL) output[i] = PHONEME_LENGTH_MINIMAL; @@ -266,6 +296,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, int64_t *co return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); + auto model = std::move(status->yukarin_sa_list[model_index]); + if (!model) { + error_message = NOT_LOADED_ERR; + return false; + } try { const char *inputs[] = { "length", "vowel_phoneme_list", "consonant_phoneme_list", "start_accent_list", @@ -283,8 +318,8 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, int64_t *co to_tensor(&model_speaker_id, speaker_shape)}; Ort::Value output_tensor = to_tensor(output, phoneme_shape); - status->yukarin_sa_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), - input_tensors.size(), outputs, &output_tensor, 1); + model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), input_tensors.size(), outputs, + &output_tensor, 1); } catch (const Ort::Exception &e) { error_message = ONNX_ERR; error_message += e.what(); @@ -346,6 +381,11 @@ bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phon return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); + auto model = std::move(status->decode_list[model_index]); + if (!model) { + error_message = NOT_LOADED_ERR; + return false; + } try { // 音が途切れてしまうのを避けるworkaround処理が入っている // TODO: 改善したらここのpadding処理を取り除く @@ -381,8 +421,8 @@ bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phon const char *inputs[] = {"f0", "phoneme", "speaker_id"}; const char *outputs[] = {"wave"}; - status->decode_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensor.data(), input_tensor.size(), - outputs, &output_tensor, 1); + model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensor.data(), input_tensor.size(), outputs, + &output_tensor, 1); // TODO: 改善したらここのcopy処理を取り除く copy_output_with_padding_to_output(output_with_padding, output, padding_f0_size); From eafac3397aebd1a83c6af3d072b7c994f98923b9 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Thu, 5 May 2022 06:18:37 +0900 Subject: [PATCH 2/9] =?UTF-8?q?load=5Fmodel=E3=81=A8is=5Fmodel=5Floaded?= =?UTF-8?q?=E3=82=92=E8=B6=B3=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/core.cpp | 45 ++++++++++++++++++++++++++++++--------------- core/src/core.h | 20 +++++++++++++++++++- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index 90d64e600..1d335664e 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -128,7 +128,7 @@ struct Status { /** * モデルを読み込む */ - bool load_model(int model_index, int cpu_num_threads) { + bool load_model(int model_index) { const auto VVMODEL = VVMODEL_LIST[model_index]; embed::Resource yukarin_s_model = VVMODEL.YUKARIN_S(); embed::Resource yukarin_sa_model = VVMODEL.YUKARIN_SA(); @@ -184,7 +184,7 @@ std::pair get_model_index_and_speaker_id(int64_t speaker_id) { return found->second; } -bool initialize(bool use_gpu, int cpu_num_threads) { +bool initialize(bool use_gpu, int cpu_num_threads, bool load_all_models) { initialized = false; #ifdef DIRECTML @@ -201,20 +201,24 @@ bool initialize(bool use_gpu, int cpu_num_threads) { if (!status->load_metas()) { return false; } - for (int model_index = 0; model_index < model_count; model_index++) { - if (!status->load_model(model_index, cpu_num_threads)) { - return false; + + if (load_all_models) { + for (int model_index = 0; model_index < model_count; model_index++) { + if (!status->load_model(model_index)) { + return false; + } + } + + if (use_gpu) { + // 一回走らせて十分なGPUメモリを確保させる + // TODO: 全MODELに対して行う + int length = 500; + int phoneme_size = 45; + std::vector phoneme(length * phoneme_size), f0(length); + int64_t speaker_id = 0; + std::vector output(length * 256); + decode_forward(length, phoneme_size, f0.data(), phoneme.data(), &speaker_id, output.data()); } - } - if (use_gpu) { - // 一回走らせて十分なGPUメモリを確保させる - // TODO: 全MODELに対して行う - int length = 500; - int phoneme_size = 45; - std::vector phoneme(length * phoneme_size), f0(length); - int64_t speaker_id = 0; - std::vector output(length * 256); - decode_forward(length, phoneme_size, f0.data(), phoneme.data(), &speaker_id, output.data()); } } catch (const Ort::Exception &e) { error_message = ONNX_ERR; @@ -233,6 +237,17 @@ bool initialize(bool use_gpu, int cpu_num_threads) { return true; } +bool load_model(int64_t speaker_id) { + auto [model_index, _] = get_model_index_and_speaker_id(speaker_id); + return status->load_model(model_index); +} + +bool is_model_loaded(int64_t speaker_id) { + auto [model_index, _] = get_model_index_and_speaker_id(speaker_id); + return (status->yukarin_s_list[model_index].has_value() && status->yukarin_sa_list[model_index].has_value() && + status->decode_list[model_index].has_value()); +} + void finalize() { initialized = false; status.reset(); diff --git a/core/src/core.h b/core/src/core.h index f87ca5ba8..d66d349b7 100644 --- a/core/src/core.h +++ b/core/src/core.h @@ -38,12 +38,30 @@ typedef enum { * @brief 音声合成するための初期化を行う。他の関数を正しく実行するには先に初期化が必要 * @param use_gpu trueならGPU用、falseならCPU用の初期化を行う * @param cpu_num_threads 推論に用いるスレッド数を設定する。0の場合論理コア数の半分か、物理コア数が設定される + * @param load_all_models trueなら全てのモデルをロードする * @return 成功したらtrue、失敗したらfalse * @detail * 何度も実行可能。use_gpuを変更して実行しなおすことも可能。 * 最後に実行したuse_gpuに従って他の関数が実行される。 */ -VOICEVOX_CORE_API bool initialize(bool use_gpu, int cpu_num_threads = 0); +VOICEVOX_CORE_API bool initialize(bool use_gpu, int cpu_num_threads = 0, bool load_all_models = true); + +/** + * モデルをロードする + * @param speaker_id 話者番号 + * @return 成功したらtrue、失敗したらfalse + * @detail + * 必ずしも話者とモデルが1:1対応しているわけではない。 + */ +VOICEVOX_CORE_API bool load_model(int64_t speaker_id); + +/** + * @fn + * モデルがロード済みかどうか + * @param speaker_id 話者番号 + * @return ロード済みならtrue、そうでないならfalse + */ +VOICEVOX_CORE_API bool is_model_loaded(int64_t speaker_id); /** * @fn From a0c353993ac5e4f9718a6519dff4f21424ec298d Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Thu, 5 May 2022 06:22:32 +0900 Subject: [PATCH 3/9] =?UTF-8?q?=E4=BD=BF=E3=82=8F=E3=82=8C=E3=81=A6?= =?UTF-8?q?=E3=81=AA=E3=81=84=E3=82=A8=E3=83=A9=E3=83=BC=E3=83=A1=E3=83=83?= =?UTF-8?q?=E3=82=BB=E3=83=BC=E3=82=B8=E3=82=92=E6=B6=88=E5=8E=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/core.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index 1d335664e..1e431316e 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -21,9 +21,6 @@ #define NOT_INITIALIZED_ERR "Call initialize() first." #define NOT_LOADED_ERR "Model is not loaded." -#define NOT_FOUND_ERR "No such file or directory: " -#define FAILED_TO_OPEN_MODEL_ERR "Unable to open model files." -#define FAILED_TO_OPEN_METAS_ERR "Unable to open metas.json." #define ONNX_ERR "ONNX raise exception: " #define JSON_ERR "JSON parser raise exception: " #define GPU_NOT_SUPPORTED_ERR "This library is CPU version. GPU is not supported." From 619e403c964af3433164be6292bfe4901c47721d Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:08:33 +0900 Subject: [PATCH 4/9] Update core/src/core.cpp Co-authored-by: qwerty2501 <939468+qwerty2501@users.noreply.github.com> --- core/src/core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index 1e431316e..21c2417d6 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -268,7 +268,7 @@ bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_i return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); - auto model = std::move(status->yukarin_s_list[model_index]); + auto &model = status->yukarin_s_list[model_index]; if (!model) { error_message = NOT_LOADED_ERR; return false; From 49c8e6f863cbf19bc95e3bc2146b18b65c89ba49 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:08:41 +0900 Subject: [PATCH 5/9] Update core/src/core.cpp Co-authored-by: qwerty2501 <939468+qwerty2501@users.noreply.github.com> --- core/src/core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index 21c2417d6..7a0b4a17b 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -308,7 +308,7 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, int64_t *co return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); - auto model = std::move(status->yukarin_sa_list[model_index]); + auto &model = status->yukarin_sa_list[model_index]; if (!model) { error_message = NOT_LOADED_ERR; return false; From deb77c9b992836d0484f7bc72c6ce26874d2434c Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:08:46 +0900 Subject: [PATCH 6/9] Update core/src/core.cpp Co-authored-by: qwerty2501 <939468+qwerty2501@users.noreply.github.com> --- core/src/core.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/core.cpp b/core/src/core.cpp index 7a0b4a17b..04bb30ae0 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -393,7 +393,7 @@ bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phon return false; } auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id); - auto model = std::move(status->decode_list[model_index]); + auto &model = status->decode_list[model_index]; if (!model) { error_message = NOT_LOADED_ERR; return false; From 9daf2e426c8f6476909e6efc1e7ca61b4f95b2a1 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:16:57 +0900 Subject: [PATCH 7/9] =?UTF-8?q?python=E3=81=AE=E3=83=A9=E3=83=83=E3=83=91?= =?UTF-8?q?=E3=83=BC=E3=81=AE=E5=9E=8B=E3=82=92=E5=A4=89=E6=9B=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/_core.py b/core/_core.py index a6fefa448..0899a247b 100644 --- a/core/_core.py +++ b/core/_core.py @@ -27,7 +27,7 @@ lib = cdll.LoadLibrary(str(core_dll_path)) # 関数型定義 -lib.initialize.argtypes = (c_bool, c_int) +lib.initialize.argtypes = (c_bool, c_int, c_bool) lib.initialize.restype = c_bool lib.finalize.argtypes = () From dcfa19f3e9b18d90023c0be50397d633a3760549 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:23:52 +0900 Subject: [PATCH 8/9] =?UTF-8?q?load=5Fall=5Fmodels=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/_core.py b/core/_core.py index 0899a247b..f9306db4b 100644 --- a/core/_core.py +++ b/core/_core.py @@ -52,8 +52,8 @@ # ラッパー関数 -def initialize(use_gpu: bool, cpu_num_threads=0): - success = lib.initialize(use_gpu, cpu_num_threads) +def initialize(use_gpu: bool, cpu_num_threads=0, load_all_models=True): + success = lib.initialize(use_gpu, cpu_num_threads, load_all_models) if not success: raise Exception(lib.last_error_message().decode()) From 662147ca637ef6896d8bba95289390db5950c1f2 Mon Sep 17 00:00:00 2001 From: Hiroshiba Date: Fri, 6 May 2022 03:37:51 +0900 Subject: [PATCH 9/9] return true --- core/src/core.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/core.cpp b/core/src/core.cpp index 04bb30ae0..e7b7cc4d8 100644 --- a/core/src/core.cpp +++ b/core/src/core.cpp @@ -120,6 +120,7 @@ struct Status { supported_styles.insert(style["id"].get()); } } + return true; } /**