diff --git a/deps/is_utf8/.gitignore b/deps/is_utf8/.gitignore new file mode 100644 index 0000000..91151f2 --- /dev/null +++ b/deps/is_utf8/.gitignore @@ -0,0 +1,2 @@ +build/ +src/dependencies/ diff --git a/deps/is_utf8/CMakeLists.txt b/deps/is_utf8/CMakeLists.txt index 39b8411..4c5bd50 100644 --- a/deps/is_utf8/CMakeLists.txt +++ b/deps/is_utf8/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.15) project(is_utf8 DESCRIPTION "Fast UTF-8 Validation" LANGUAGES CXX - VERSION 1.2.1 + VERSION 1.3.0 ) include(GNUInstallDirs) @@ -25,8 +25,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_MACOSX_RPATH OFF) -set(IS_UTF8_LIB_VERSION "1.2.1" CACHE STRING "is_utf8 library version") -set(IS_UTF8_LIB_SOVERSION "1" CACHE STRING "is_utf8 library soversion") +set(IS_UTF8_LIB_VERSION "1.3.0" CACHE STRING "is_utf8 library version") +set(IS_UTF8_LIB_SOVERSION "2" CACHE STRING "is_utf8 library soversion") set(IS_UTF8_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src) add_subdirectory(src) diff --git a/deps/is_utf8/README.md b/deps/is_utf8/README.md index acc6895..fcce935 100644 --- a/deps/is_utf8/README.md +++ b/deps/is_utf8/README.md @@ -3,12 +3,10 @@ Most strings online are in unicode using the UTF-8 encoding. Validating strings quickly before accepting them is important. - - - ## How to use is_utf8 -This is a simple one-source file library to validate UTF-8 strings at high speeds using SIMD instructions. It works on all platforms (ARM, x64). +This is a simple one-source file library to validate UTF-8 strings at high +speeds using SIMD instructions. It works on all platforms (ARM, x64). Build and link `is_utf8.cpp` with your project. Code usage: @@ -21,13 +19,66 @@ Build and link `is_utf8.cpp` with your project. Code usage: It should be able to validate strings using less than 1 cycle per input byte. +## Requirements + +- C++11 compatible compiler. We support LLVM clang, GCC, Visual Studio. (Our + optional benchmark tool requires C++17.) 
+- For high speed, you should have a recent 64-bit system (e.g., ARM or x64). +- If you rely on CMake, you should use a recent CMake (at least 3.15). +- AVX-512 support requires a processor with AVX512-VBMI2 (Ice Lake or better) and + a recent compiler (GCC 8 or better, Visual Studio 2019 or better, LLVM clang 6 + or better). You need a correspondingly recent assembler such as gas (2.30+) or + nasm (2.14+): recent compilers usually come with recent assemblers. If you mix + a recent compiler with an incompatible/old assembler (e.g., when using a + recent compiler with an old Linux distribution), you may get errors at build + time because the compiler produces instructions that the assembler does not + recognize: you should update your assembler to match your compiler (e.g., + upgrade binutils to version 2.30 or better under Linux) or use an older + compiler matching the capabilities of your assembler. + +## Build with CMake + +``` +cmake -B build +cmake --build build +cd build +ctest . +``` + +Visual Studio users must specify whether they want to build the Release or Debug +version. + +To run benchmarks, build and execute the `bench` command. + +``` +cmake -B build +cmake --build build +./build/benchmarks/bench +``` + +Instructions are similar for Visual Studio users. + +## Real-world usage + +This C++ library is part of the JavaScript package +[utf-8-validate](https://github.com/websockets/utf-8-validate). The +utf-8-validate package is routinely downloaded more than +[a million times per week](https://www.npmjs.com/package/utf-8-validate). + +If you are using Node.js (19.4.0 or better), you already have access to this +function as +[`buffer.isUtf8(input)`](https://nodejs.org/api/buffer.html#bufferisutf8input). 
+ ## Reference -- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice & Experience 51 (5), 2021 +- John Keiser, Daniel Lemire, + [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), + Software: Practice & Experience 51 (5), 2021 -### Want more? +## Want more? -If you want a wide range of fast Unicode function for production use, you can rely on the simdutf library. It is as simple as the following: +If you want a wide range of fast Unicode function for production use, you can +rely on the simdutf library. It is as simple as the following: ```C++ #include "simdutf.cpp" @@ -48,12 +99,11 @@ int main(int argc, char *argv[]) { See https://github.com/simdutf/ - ## License -This library is distributed under the terms of any of the following -licenses, at your option: +This library is distributed under the terms of any of the following licenses, at +your option: -* Apache License (Version 2.0) [LICENSE-APACHE](LICENSE-APACHE), -* Boost Software License [LICENSE-BOOST](LICENSE-BOOST), or -* MIT License [LICENSE-MIT](LICENSE-MIT). +- Apache License (Version 2.0) [LICENSE-APACHE](LICENSE-APACHE), +- Boost Software License [LICENSE-BOOST](LICENSE-BOOST), or +- MIT License [LICENSE-MIT](LICENSE-MIT). diff --git a/deps/is_utf8/src/is_utf8.cpp b/deps/is_utf8/src/is_utf8.cpp index bfd7612..e6cec38 100644 --- a/deps/is_utf8/src/is_utf8.cpp +++ b/deps/is_utf8/src/is_utf8.cpp @@ -688,7 +688,7 @@ class implementation { virtual uint32_t required_instruction_sets() const { return _required_instruction_sets; - }; + } /** * Validate the UTF-8 string. @@ -826,8 +826,7 @@ template class atomic_ptr { /** * The list of available implementations compiled into simdutf. 
*/ -extern IS_UTF8_DLLIMPORTEXPORT const internal::available_implementation_list - available_implementations; +extern IS_UTF8_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations(); /** * The active implementation. @@ -835,8 +834,7 @@ extern IS_UTF8_DLLIMPORTEXPORT const internal::available_implementation_list * Automatically initialized on first use to the most advanced implementation * supported by this hardware. */ -extern IS_UTF8_DLLIMPORTEXPORT internal::atomic_ptr - active_implementation; +extern IS_UTF8_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation(); } // namespace is_utf8_internals @@ -4640,33 +4638,39 @@ detect_best_supported_implementation_on_first_use::set_best() const noexcept { if (force_implementation_name) { auto force_implementation = - available_implementations[force_implementation_name]; + get_available_implementations()[force_implementation_name]; if (force_implementation) { - return active_implementation = force_implementation; + return get_active_implementation() = force_implementation; } else { // Note: abort() and stderr usage within the library is forbidden. 
- return active_implementation = &unsupported_singleton; + return get_active_implementation() = &unsupported_singleton; } } - return active_implementation = - available_implementations.detect_best_supported(); + return get_active_implementation() = + get_available_implementations().detect_best_supported(); } } // namespace internal -IS_UTF8_DLLIMPORTEXPORT const internal::available_implementation_list - available_implementations{}; -IS_UTF8_DLLIMPORTEXPORT internal::atomic_ptr - active_implementation{ - &internal::detect_best_supported_implementation_on_first_use_singleton}; +IS_UTF8_DLLIMPORTEXPORT const internal::available_implementation_list& get_available_implementations() { + static const internal::available_implementation_list available_implementations{}; + return available_implementations; +} + +IS_UTF8_DLLIMPORTEXPORT internal::atomic_ptr& get_active_implementation() { + static const internal::detect_best_supported_implementation_on_first_use detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{&detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +} + is_utf8_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { - return active_implementation->validate_utf8(buf, len); + return get_active_implementation()->validate_utf8(buf, len); } const implementation *builtin_implementation() { static const implementation *builtin_impl = - available_implementations[IS_UTF8_STRINGIFY( + get_available_implementations()[IS_UTF8_STRINGIFY( IS_UTF8_BUILTIN_IMPLEMENTATION)]; return builtin_impl; }