Merge pull request #104 from VOICEVOX/cpp-library
Merge the C++ TTS library into `main`
Hiroshiba authored Mar 23, 2022
2 parents b8651d8 + 5041219 commit 7b36377
Showing 21 changed files with 1,726 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/build.yml
@@ -7,6 +7,7 @@ on:
release:
types:
- published
pull_request:
workflow_dispatch:

env:
@@ -129,6 +130,8 @@ jobs:

steps:
- uses: actions/checkout@v2
  with:
    submodules: true

- name: Setup Python
if: matrix.python_architecture != ''
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "open_jtalk"]
	path = open_jtalk
	url = https://github.com/VOICEVOX/open_jtalk
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -3,6 +3,7 @@ project(VoiceVoxCore)

# TODO: download onnxruntime
set(ONNXRUNTIME_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onnxruntime" CACHE PATH "Path to ONNX Runtime")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(DIRECTML "Enables building for DirectML" OFF)
set(DIRECTML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/directml" CACHE PATH "Path to DirectML")
@@ -14,3 +15,4 @@ if(DIRECTML)
endif()

add_subdirectory(core)
add_subdirectory(open_jtalk/src)
31 changes: 28 additions & 3 deletions core/CMakeLists.txt
@@ -11,7 +11,15 @@ endif(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
message("core will be installed to: ${CMAKE_INSTALL_PREFIX}")

# Build settings for the core library
add_library(core SHARED src/core.cpp)
add_library(core
  SHARED src/core.cpp
  src/engine/full_context_label.cpp
  src/engine/acoustic_feature_extractor.cpp
  src/engine/openjtalk.cpp
  src/engine/kana_parser.cpp
  src/engine/mora_list.cpp
  src/engine/synthesis_engine.cpp
  src/engine.cpp)

# Locate the onnxruntime library from the path given via -DONNXRUNTIME_DIR; fail with a fatal error if it cannot be found
get_filename_component(ONNXRUNTIME_DIR ${ONNXRUNTIME_DIR} ABSOLUTE)
@@ -81,10 +89,27 @@ elseif (UNIX)
endif ()

target_compile_options(core PRIVATE
$<$<CXX_COMPILER_ID:MSVC>: /W4 /O2 /utf-8>
$<$<CXX_COMPILER_ID:GNU>: -Wall -Wextra -O2>
$<$<CXX_COMPILER_ID:MSVC>: /W4 /O2 /utf-8 /DVOICEVOX_CORE_EXPORTS>
$<$<CXX_COMPILER_ID:GNU>: -Wall -Wextra -O2 -DVOICEVOX_CORE_EXPORTS>
)

target_include_directories(core
  PRIVATE ../open_jtalk/src/jpcommon
  PRIVATE ../open_jtalk/src/mecab/src
  PRIVATE ../open_jtalk/src/mecab2njd
  PRIVATE ../open_jtalk/src/mecab-naist-jdic
  PRIVATE ../open_jtalk/src/njd
  PRIVATE ../open_jtalk/src/njd2jpcommon
  PRIVATE ../open_jtalk/src/njd_set_accent_phrase
  PRIVATE ../open_jtalk/src/njd_set_accent_type
  PRIVATE ../open_jtalk/src/njd_set_digit
  PRIVATE ../open_jtalk/src/njd_set_long_vowel
  PRIVATE ../open_jtalk/src/njd_set_pronunciation
  PRIVATE ../open_jtalk/src/njd_set_unvoiced_vowel
  PRIVATE ../open_jtalk/src/text2mecab)
target_link_libraries(core PUBLIC openjtalk)


# GCC 9.0 and earlier need an extra link library to use std::filesystem (https://gitlab.kitware.com/cmake/cmake/-/issues/17834)
target_link_libraries(core PRIVATE $<$<AND:$<CXX_COMPILER_ID:GNU>,$<VERSION_LESS:$<CXX_COMPILER_VERSION>,9.0>>:stdc++fs>)

3 changes: 2 additions & 1 deletion core/src/core.cpp
@@ -5,7 +5,6 @@
#endif

#include <array>
#include <cstdlib>
#include <exception>
#include <filesystem>
#include <fstream>
@@ -15,7 +14,9 @@

#include "nlohmann/json.hpp"

#ifndef VOICEVOX_CORE_EXPORTS
#define VOICEVOX_CORE_EXPORTS
#endif // VOICEVOX_CORE_EXPORTS
#include "core.h"

#define NOT_INITIALIZED_ERR "Call initialize() first."
100 changes: 86 additions & 14 deletions core/src/core.h
@@ -10,6 +10,28 @@
#define VOICEVOX_CORE_API
#endif // _WIN32

#ifdef __cplusplus
#include <cstdint>
#else
#include <stdint.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

/**
* @enum
* Result code
* Add a new definition whenever a new kind of error is introduced.
* Always assign each error value explicitly.
*/
typedef enum {
  // Success
  VOICEVOX_RESULT_SUCCEED = 0,
  // The Open JTalk dictionary has not been loaded
  VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT = 1,
} VoicevoxResultCode;
/**
* @fn
* Initialize the library
@@ -22,7 +44,7 @@
* Can be called any number of times; it can also be re-run with a different use_gpu value.
* The other functions follow the use_gpu value from the most recent call.
*/
extern "C" VOICEVOX_CORE_API bool initialize(const char *root_dir_path, bool use_gpu, int cpu_num_threads = 0);
VOICEVOX_CORE_API bool initialize(const char *root_dir_path, bool use_gpu, int cpu_num_threads = 0);

/**
* @fn
@@ -32,23 +54,23 @@ extern "C" VOICEVOX_CORE_API bool initialize(const char *root_dir_path, bool use
* Can be called any number of times. Exiting without calling it is usually fine,
* but when CUDA is in use, skipping it can occasionally trigger an exception.
*/
extern "C" VOICEVOX_CORE_API void finalize();
VOICEVOX_CORE_API void finalize();

/**
* @fn
* Get metadata
* @brief Get the list of speaker names and speaker IDs
* @return JSON-formatted string containing the metadata
*/
extern "C" VOICEVOX_CORE_API const char *metas();
VOICEVOX_CORE_API const char *metas();

/**
* @fn
* Get supported device information
* @brief Get which of cpu and cuda are available
* @return JSON-formatted string holding a bool for whether each device is available
*/
extern "C" VOICEVOX_CORE_API const char *supported_devices();
VOICEVOX_CORE_API const char *supported_devices();

/**
* @fn
@@ -59,8 +81,7 @@ extern "C" VOICEVOX_CORE_API const char *supported_devices();
* @param speaker_id Speaker ID
* @return Duration of each phoneme
*/
extern "C" VOICEVOX_CORE_API bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_id,
float *output);
VOICEVOX_CORE_API bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_id, float *output);

/**
* @fn
@@ -76,11 +97,10 @@ extern "C" VOICEVOX_CORE_API bool yukarin_s_forward(int64_t length, int64_t *pho
* @param speaker_id Speaker ID
* @return Pitch of each mora
*/
extern "C" VOICEVOX_CORE_API bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list,
int64_t *consonant_phoneme_list, int64_t *start_accent_list,
int64_t *end_accent_list, int64_t *start_accent_phrase_list,
int64_t *end_accent_phrase_list, int64_t *speaker_id,
float *output);
VOICEVOX_CORE_API bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, int64_t *consonant_phoneme_list,
int64_t *start_accent_list, int64_t *end_accent_list,
int64_t *start_accent_phrase_list, int64_t *end_accent_phrase_list,
int64_t *speaker_id, float *output);

/**
* @fn
@@ -93,12 +113,64 @@ extern "C" VOICEVOX_CORE_API bool yukarin_sa_forward(int64_t length, int64_t *vo
* @param speaker_id Speaker ID
* @return Audio waveform
*/
extern "C" VOICEVOX_CORE_API bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phoneme,
int64_t *speaker_id, float *output);
VOICEVOX_CORE_API bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phoneme,
int64_t *speaker_id, float *output);

/**
* @fn
* Get the message of the most recent error
* @return Error message
*/
extern "C" VOICEVOX_CORE_API const char *last_error_message();
VOICEVOX_CORE_API const char *last_error_message();

/**
* @fn
* Load the Open JTalk dictionary
* @return Result code
*/
VOICEVOX_CORE_API VoicevoxResultCode voicevox_load_openjtalk_dict(const char *dict_path);

/**
* @fn
* Run text-to-speech
* @param text Text to convert into audio data
* @param speaker_id Speaker ID
* @param output_binary_size Pointer that receives the size of the audio data
* @param output_wav Pointer that receives the audio data; it must be released with voicevox_wav_free once no longer needed
* @return Result code
*/
VOICEVOX_CORE_API VoicevoxResultCode voicevox_tts(const char *text, int64_t speaker_id, int *output_binary_size,
uint8_t **output_wav);

/**
* @fn
* Run text-to-speech using AquesTalk-style notation
* @param text Text to convert into audio data
* @param speaker_id Speaker ID
* @param output_binary_size Pointer that receives the size of the audio data
* @param output_wav Pointer that receives the audio data; it must be released with voicevox_wav_free once no longer needed
* @return Result code
*/
VOICEVOX_CORE_API VoicevoxResultCode voicevox_tts_from_kana(const char *text, int64_t speaker_id,
int *output_binary_size, uint8_t **output_wav);

/**
* @fn
* Free audio data generated by voicevox_tts
* @param wav Pointer to the audio data to free
*/
VOICEVOX_CORE_API void voicevox_wav_free(uint8_t *wav);

/**
* @fn
* Convert a result code returned on error into a message
* @return Error message string
*/
VOICEVOX_CORE_API const char *voicevox_error_result_to_message(VoicevoxResultCode result_code);

#ifdef __cplusplus
}
#endif

// The macro is no longer needed past this point, so undefine it
#undef VOICEVOX_CORE_API
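
For orientation, here is a minimal caller-side sketch of the C API declared above. It is not part of the diff: the paths "./model" and "./open_jtalk_dic_utf_8-1.11" are placeholders, and the sketch simply exercises initialize, voicevox_load_openjtalk_dict, voicevox_tts, voicevox_wav_free, and finalize in the order the header implies.

// Hypothetical caller of the API in core.h; all paths are placeholders.
#include <cstdint>
#include <cstdio>

#include "core.h"

int main() {
  // Assumed layout: model files in "./model", CPU inference, default thread count.
  if (!initialize("./model", false)) {
    std::fprintf(stderr, "initialize failed: %s\n", last_error_message());
    return 1;
  }
  // voicevox_tts reports VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT until this succeeds.
  VoicevoxResultCode result = voicevox_load_openjtalk_dict("./open_jtalk_dic_utf_8-1.11");
  if (result != VOICEVOX_RESULT_SUCCEED) {
    std::fprintf(stderr, "%s\n", voicevox_error_result_to_message(result));
    return 1;
  }

  int size = 0;
  uint8_t *wav = nullptr;
  result = voicevox_tts("こんにちは", 0, &size, &wav);
  if (result == VOICEVOX_RESULT_SUCCEED) {
    if (std::FILE *out = std::fopen("audio.wav", "wb")) {
      std::fwrite(wav, 1, static_cast<size_t>(size), out);
      std::fclose(out);
    }
    voicevox_wav_free(wav);  // always hand the buffer back to the library
  } else {
    std::fprintf(stderr, "%s\n", voicevox_error_result_to_message(result));
  }
  finalize();
  return 0;
}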
63 changes: 63 additions & 0 deletions core/src/engine.cpp
@@ -0,0 +1,63 @@
#include <cstdlib>
#include <memory>
#include <stdexcept>
#include <vector>

#include "core.h"
#include "engine/kana_parser.h"
#include "engine/model.h"
#include "engine/synthesis_engine.h"

using namespace voicevox::core::engine;

static SynthesisEngine engine;

VoicevoxResultCode voicevox_load_openjtalk_dict(const char *dict_path) {
  // TODO: error handling
  engine.load_openjtalk_dict(dict_path);
  return VOICEVOX_RESULT_SUCCEED;
}

VoicevoxResultCode voicevox_tts(const char *text, int64_t speaker_id, int *output_binary_size, uint8_t **output_wav) {
  if (!engine.is_openjtalk_dict_loaded()) {
    return VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT;
  }

  std::vector<AccentPhraseModel> accent_phrases = engine.create_accent_phrases(std::string(text), &speaker_id);
  const AudioQueryModel audio_query = {
      accent_phrases, 1.0f, 0.0f, 1.0f, 1.0f, 0.1f, 0.1f, engine.default_sampling_rate, false, "",
  };

  const auto wav = engine.synthesis_wave_format(audio_query, &speaker_id, output_binary_size);
  auto *wav_heap = new uint8_t[*output_binary_size];
  std::copy(wav.begin(), wav.end(), wav_heap);
  *output_wav = wav_heap;
  return VOICEVOX_RESULT_SUCCEED;
}

VoicevoxResultCode voicevox_tts_from_kana(const char *text, int64_t speaker_id, int *output_binary_size,
                                          uint8_t **output_wav) {
  std::vector<AccentPhraseModel> accent_phrases = parse_kana(std::string(text));
  accent_phrases = engine.replace_mora_data(accent_phrases, &speaker_id);
  const AudioQueryModel audio_query = {
      accent_phrases, 1.0f, 0.0f, 1.0f, 1.0f, 0.1f, 0.1f, engine.default_sampling_rate, false, "",
  };

  const auto wav = engine.synthesis_wave_format(audio_query, &speaker_id, output_binary_size);
  auto *wav_heap = new uint8_t[*output_binary_size];
  std::copy(wav.begin(), wav.end(), wav_heap);
  *output_wav = wav_heap;
  return VOICEVOX_RESULT_SUCCEED;
}

// The buffer is allocated with new[], so release it with delete[]
void voicevox_wav_free(uint8_t *wav) { delete[] wav; }

const char *voicevox_error_result_to_message(VoicevoxResultCode result_code) {
  switch (result_code) {
    case VOICEVOX_RESULT_NOT_LOADED_OPENJTALK_DICT:
      return "Call voicevox_load_openjtalk_dict() first.";

    default:
      throw std::runtime_error("Unexpected error result code.");
  }
}
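
Because voicevox_tts hands back a raw buffer allocated inside the library that the caller must pass to voicevox_wav_free, a caller-side RAII wrapper keeps that pairing automatic. The sketch below is illustrative only, assumes nothing beyond the declarations in core.h, and the helper name tts_to_buffer is hypothetical.

// Illustrative caller-side helper, not part of this commit: tie the buffer's
// lifetime to a unique_ptr whose deleter calls voicevox_wav_free.
#include <cstdint>
#include <memory>

#include "core.h"

struct WavDeleter {
  void operator()(uint8_t *wav) const { voicevox_wav_free(wav); }
};
using WavBuffer = std::unique_ptr<uint8_t, WavDeleter>;

// Returns an owning buffer (empty on failure); `size` receives the byte count.
inline WavBuffer tts_to_buffer(const char *text, int64_t speaker_id, int &size) {
  uint8_t *raw = nullptr;
  if (voicevox_tts(text, speaker_id, &size, &raw) != VOICEVOX_RESULT_SUCCEED) {
    return WavBuffer();
  }
  return WavBuffer(raw);
}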
18 changes: 18 additions & 0 deletions core/src/engine/acoustic_feature_extractor.cpp
@@ -0,0 +1,18 @@
#include "acoustic_feature_extractor.h"

namespace voicevox::core::engine {
long OjtPhoneme::phoneme_id() const {
  if (phoneme.empty()) return (long)-1;
  return (long)phoneme_map().at(phoneme);
}

std::vector<OjtPhoneme> OjtPhoneme::convert(std::vector<OjtPhoneme> phonemes) {
  if (phonemes[0].phoneme.find("sil") != std::string::npos) {
    phonemes[0].phoneme = OjtPhoneme::space_phoneme();
  }
  if (phonemes.back().phoneme.find("sil") != std::string::npos) {
    phonemes.back().phoneme = OjtPhoneme::space_phoneme();
  }
  return phonemes;
}
} // namespace voicevox::core::engine
42 changes: 42 additions & 0 deletions core/src/engine/acoustic_feature_extractor.h
@@ -0,0 +1,42 @@
#pragma once

#include <map>
#include <string>
#include <vector>

namespace voicevox::core::engine {
// TODO: The current VOICEVOX/voicevox_engine only uses Ojt, so only this class is implemented for now
class OjtPhoneme {
 public:
  std::string phoneme;
  float start;
  float end;

  static const std::map<std::string, int> phoneme_map() {
    std::map<std::string, int> phoneme_map = {
        {"pau", 0}, {"A", 1}, {"E", 2}, {"I", 3}, {"N", 4}, {"O", 5}, {"U", 6}, {"a", 7}, {"b", 8},
        {"by", 9}, {"ch", 10}, {"cl", 11}, {"d", 12}, {"dy", 13}, {"e", 14}, {"f", 15}, {"g", 16}, {"gw", 17},
        {"gy", 18}, {"h", 19}, {"hy", 20}, {"i", 21}, {"j", 22}, {"k", 23}, {"kw", 24}, {"ky", 25}, {"m", 26},
        {"my", 27}, {"n", 28}, {"ny", 29}, {"o", 30}, {"p", 31}, {"py", 32}, {"r", 33}, {"ry", 34}, {"s", 35},
        {"sh", 36}, {"t", 37}, {"ts", 38}, {"ty", 39}, {"u", 40}, {"v", 41}, {"w", 42}, {"y", 43}, {"z", 44}};
    return phoneme_map;
  }
  static int num_phoneme() { return (int)phoneme_map().size(); }
  static const std::string space_phoneme() { return std::string("pau"); }

  OjtPhoneme() {
    phoneme = "";
    start = 0.0;
    end = 0.0;
  }

  OjtPhoneme(std::string c_phoneme, float c_start, float c_end) {
    phoneme = c_phoneme;
    start = c_start;
    end = c_end;
  }

  long phoneme_id() const;
  static std::vector<OjtPhoneme> convert(std::vector<OjtPhoneme> phonemes);
};
} // namespace voicevox::core::engine
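
The class above maps Open JTalk phoneme labels to integer IDs and normalizes leading and trailing "sil" labels to the space phoneme "pau". A small illustrative sketch of that behavior, assuming only this header and its implementation file:

// Hedged example, not part of the commit: exercise OjtPhoneme::convert and phoneme_id.
#include <cassert>
#include <vector>

#include "acoustic_feature_extractor.h"

using voicevox::core::engine::OjtPhoneme;

int main() {
  // "sil" labels at both ends become the space phoneme "pau" (ID 0);
  // an empty/unknown phoneme reports ID -1.
  std::vector<OjtPhoneme> phonemes = {
      OjtPhoneme("sil", 0.0f, 0.1f), OjtPhoneme("k", 0.1f, 0.2f),
      OjtPhoneme("o", 0.2f, 0.3f), OjtPhoneme("sil", 0.3f, 0.4f)};
  phonemes = OjtPhoneme::convert(phonemes);
  assert(phonemes.front().phoneme == "pau");
  assert(phonemes.front().phoneme_id() == 0);
  assert(phonemes[1].phoneme_id() == 23);  // "k" maps to 23 in phoneme_map()
  assert(OjtPhoneme().phoneme_id() == -1);
  return 0;
}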
