Skip to content

Commit

Permalink
feat: support match UTF-16 and UTF-32 haystacks
Browse files Browse the repository at this point in the history
  • Loading branch information
Chaoses-Ib committed Dec 16, 2023
1 parent 50162b9 commit 27266f2
Show file tree
Hide file tree
Showing 12 changed files with 392 additions and 45 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ keywords.workspace = true
arraystring = "0.3.0"
bitflags = "2.4.1"
regex = "1.10.2"
widestring = { version = "1.0.2", optional = true }

[features]
inmut-data = []
minimal = ["inmut-data"]
encoding = ["dep:widestring"]
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@ assert!(matcher.is_match("拼音搜索Everything"));
#include <ib_pinyin/ib_pinyin.h>
#include <ib_pinyin/notation.h>

// UTF-8
bool is_match = ib_pinyin_is_match_u8c(u8"pysousuoeve", u8"拼音搜索Everything", PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII);

// UTF-16
bool is_match = ib_pinyin_is_match_u16c(u"pysousuoeve", u"拼音搜索Everything", PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII);

// UTF-32
bool is_match = ib_pinyin_is_match_u32c(U"pysousuoeve", U"拼音搜索Everything", PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII);
```

### C++
Expand Down
3 changes: 2 additions & 1 deletion bindings/c/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ crate-type = ["staticlib", "cdylib"]
[dependencies]
diplomat = "0.7.0"
diplomat-runtime = "0.7.0"
ib-pinyin = { path = "../..", features = ["minimal"] }
ib-pinyin = { path = "../..", features = ["minimal", "encoding"] }
widestring = "1.0.2"
46 changes: 43 additions & 3 deletions bindings/c/examples/cmake/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,56 @@
#include <ib_pinyin/ib_pinyin.h>
#include <ib_pinyin/notation.h>

int main()
{
const char *pattern = u8"pysousuoeve";
void test_u8() {
const char *pattern = u8"pysousuoeve";
const char *haystack = u8"拼音搜索Everything";
// 0x3
const PinyinNotation notations = PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII;

printf("%d\n", ib_pinyin_is_match_u8(pattern, strlen(pattern), haystack, strlen(haystack), notations));

printf("%d\n", ib_pinyin_is_match_u8c(pattern, haystack, notations));
}

void test_u16() {
const char16_t *pattern = u"pysousuoeve";
const char16_t *haystack = u"拼音搜索Everything";
// 0x3
const PinyinNotation notations = PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII;

printf("%d\n", ib_pinyin_is_match_u16(
pattern,
sizeof(u"pysousuoeve") / sizeof(char16_t) - 1,
haystack,
sizeof(u"拼音搜索Everything") / sizeof(char16_t) - 1,
notations
));

printf("%d\n", ib_pinyin_is_match_u16c(pattern, haystack, notations));
}

void test_u32() {
const char32_t *pattern = U"pysousuoeve";
const char32_t *haystack = U"拼音搜索Everything";
// 0x3
const PinyinNotation notations = PINYIN_NOTATION_ASCII_FIRST_LETTER | PINYIN_NOTATION_ASCII;

printf("%d\n", ib_pinyin_is_match_u32(
pattern,
sizeof(U"pysousuoeve") / sizeof(char32_t) - 1,
haystack,
sizeof(U"拼音搜索Everything") / sizeof(char32_t) - 1,
notations
));

printf("%d\n", ib_pinyin_is_match_u32c(pattern, haystack, notations));
}

int main()
{
test_u8();
test_u16();
test_u32();

return 0;
}
8 changes: 8 additions & 0 deletions bindings/c/include/ib_pinyin/ib_pinyin.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ extern "C" {
// Naming convention for the matchers below:
//   - `_u8` / `_u16` / `_u32` select the code-unit width of both strings.
//   - Variants without the `c` suffix take an explicit length per string;
//     variants with the `c` suffix take NUL-terminated strings instead.
//   - `pinyin_notations` is a bit set of PINYIN_NOTATION_* flags.

// UTF-8, explicit lengths. The bundled example passes strlen() for both
// lengths, i.e. byte counts excluding the terminating NUL.
bool ib_pinyin_is_match_u8(const char* pattern_data, size_t pattern_len, const char* haystack_data, size_t haystack_len, uint32_t pinyin_notations);

// UTF-8, NUL-terminated.
bool ib_pinyin_is_match_u8c(const uint8_t* pattern, const uint8_t* haystack, uint32_t pinyin_notations);

// UTF-16, explicit lengths counted in 16-bit elements (not bytes).
bool ib_pinyin_is_match_u16(const uint16_t* pattern, size_t pattern_len, const uint16_t* haystack, size_t haystack_len, uint32_t pinyin_notations);

// UTF-16, NUL-terminated.
bool ib_pinyin_is_match_u16c(const uint16_t* pattern, const uint16_t* haystack, uint32_t pinyin_notations);

// UTF-32, explicit lengths counted in 32-bit elements (not bytes).
bool ib_pinyin_is_match_u32(const uint32_t* pattern, size_t pattern_len, const uint32_t* haystack, size_t haystack_len, uint32_t pinyin_notations);

// UTF-32, NUL-terminated.
bool ib_pinyin_is_match_u32c(const uint32_t* pattern, const uint32_t* haystack, uint32_t pinyin_notations);
// NOTE(review): presumably the destructor emitted by the binding generator for
// the opaque `ib_pinyin` type — confirm against the generator's documentation.
void ib_pinyin_destroy(ib_pinyin* self);

#ifdef __cplusplus
Expand Down
46 changes: 46 additions & 0 deletions bindings/c/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ mod ffi {
use std::ffi::CStr;

use ::ib_pinyin::{minimal, pinyin::PinyinNotation};
use widestring::{U16CStr, U16Str, U32CStr, U32Str};

/// https://github.com/rust-diplomat/diplomat/issues/392
#[allow(non_camel_case_types)]
Expand All @@ -20,6 +21,7 @@ mod ffi {
)
}

/// TODO: Lossy decoding?
pub fn is_match_u8c(pattern: &u8, haystack: &u8, pinyin_notations: u32) -> bool {
(|| -> Result<bool, std::str::Utf8Error> {
Ok(Self::is_match_u8(
Expand All @@ -30,5 +32,49 @@ mod ffi {
})()
.unwrap_or(false)
}

/// Matches a UTF-16 `pattern` against a UTF-16 `haystack`, both given as a
/// pointer to the first code unit plus a length counted in `u16` elements.
///
/// Bits of `pinyin_notations` that do not correspond to a known
/// [`PinyinNotation`] flag are ignored.
pub fn is_match_u16(
    pattern: &u16,
    pattern_len: usize,
    haystack: &u16,
    haystack_len: usize,
    pinyin_notations: u32,
) -> bool {
    let pattern_ptr: *const u16 = pattern;
    let haystack_ptr: *const u16 = haystack;

    // SAFETY: relies on the foreign caller passing pointers that are valid for
    // reads of `pattern_len` / `haystack_len` consecutive `u16` values for the
    // duration of this call; the reference types only stand in for raw pointers.
    let pattern = unsafe { U16Str::from_ptr(pattern_ptr, pattern_len) };
    let haystack = unsafe { U16Str::from_ptr(haystack_ptr, haystack_len) };
    let notations = PinyinNotation::from_bits_truncate(pinyin_notations);

    minimal::is_pinyin_match_u16(pattern, haystack, notations)
}

/// Matches a NUL-terminated UTF-16 `pattern` against a NUL-terminated UTF-16
/// `haystack`; each argument points at the first code unit of its string.
///
/// Bits of `pinyin_notations` that do not correspond to a known
/// [`PinyinNotation`] flag are ignored.
pub fn is_match_u16c(pattern: &u16, haystack: &u16, pinyin_notations: u32) -> bool {
    let pattern_ptr: *const u16 = pattern;
    let haystack_ptr: *const u16 = haystack;

    // SAFETY: relies on the foreign caller passing pointers to NUL-terminated
    // `u16` buffers that stay valid and unmodified for the duration of this call.
    let pattern = unsafe { U16CStr::from_ptr_str(pattern_ptr) };
    let haystack = unsafe { U16CStr::from_ptr_str(haystack_ptr) };
    let notations = PinyinNotation::from_bits_truncate(pinyin_notations);

    minimal::is_pinyin_match_u16(pattern.as_ustr(), haystack.as_ustr(), notations)
}

/// Matches a UTF-32 `pattern` against a UTF-32 `haystack`, both given as a
/// pointer to the first element plus a length counted in `u32` elements.
///
/// Bits of `pinyin_notations` that do not correspond to a known
/// [`PinyinNotation`] flag are ignored.
pub fn is_match_u32(
    pattern: &u32,
    pattern_len: usize,
    haystack: &u32,
    haystack_len: usize,
    pinyin_notations: u32,
) -> bool {
    let pattern_ptr: *const u32 = pattern;
    let haystack_ptr: *const u32 = haystack;

    // SAFETY: relies on the foreign caller passing pointers that are valid for
    // reads of `pattern_len` / `haystack_len` consecutive `u32` values for the
    // duration of this call; the reference types only stand in for raw pointers.
    let pattern = unsafe { U32Str::from_ptr(pattern_ptr, pattern_len) };
    let haystack = unsafe { U32Str::from_ptr(haystack_ptr, haystack_len) };
    let notations = PinyinNotation::from_bits_truncate(pinyin_notations);

    minimal::is_pinyin_match_u32(pattern, haystack, notations)
}

/// Matches a NUL-terminated UTF-32 `pattern` against a NUL-terminated UTF-32
/// `haystack`; each argument points at the first element of its string.
///
/// Bits of `pinyin_notations` that do not correspond to a known
/// [`PinyinNotation`] flag are ignored.
pub fn is_match_u32c(pattern: &u32, haystack: &u32, pinyin_notations: u32) -> bool {
    let pattern_ptr: *const u32 = pattern;
    let haystack_ptr: *const u32 = haystack;

    // SAFETY: relies on the foreign caller passing pointers to NUL-terminated
    // `u32` buffers that stay valid and unmodified for the duration of this call.
    let pattern = unsafe { U32CStr::from_ptr_str(pattern_ptr) };
    let haystack = unsafe { U32CStr::from_ptr_str(haystack_ptr) };
    let notations = PinyinNotation::from_bits_truncate(pinyin_notations);

    minimal::is_pinyin_match_u32(pattern.as_ustr(), haystack.as_ustr(), notations)
}
}
}
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#![feature(return_position_impl_trait_in_trait)]

pub mod matcher;
#[cfg(feature = "minimal")]
pub mod minimal;
Expand Down
90 changes: 90 additions & 0 deletions src/matcher/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/// A borrowed string in some Unicode encoding, viewed as a sequence of
/// fixed-size elements (code units) that can be decoded into `char`s.
///
/// Implemented for `str`, and for the `widestring` UTF-16 / UTF-32 string
/// slices when the `encoding` feature is enabled.
///
/// TODO: Extended ASCII code pages
/// TODO: Index/SliceIndex
pub trait EncodedStr {
    /// Size in bytes of one element (code unit) of the encoding.
    const ELEMENT_LEN_BYTE: usize;

    /// Whether the string is ASCII.
    fn is_ascii(&self) -> bool;
    /// The raw bytes backing the string, without any re-encoding.
    fn as_bytes(&self) -> &[u8];

    /// Iterates over the decoded characters, yielding for each one
    /// `(index, char, rest)`: the element index where the character starts,
    /// the character itself, and the sub-string beginning at that character.
    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)>;
    /// Iterates over the decoded characters, yielding for each one
    /// `(char, len, next)`: the character itself, its length in elements, and
    /// the sub-string beginning right after that character.
    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)>;
}

/// UTF-8 implementation: elements are bytes and indices are byte offsets.
impl EncodedStr for str {
    const ELEMENT_LEN_BYTE: usize = core::mem::size_of::<u8>();

    fn is_ascii(&self) -> bool {
        // Fully qualified to make it explicit that this is the inherent
        // `str` method, not a recursive call into this trait method.
        str::is_ascii(self)
    }

    fn as_bytes(&self) -> &[u8] {
        // Inherent `str::as_bytes`, see `is_ascii` above.
        str::as_bytes(self)
    }

    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
        self.char_indices()
            .map(move |(start, ch)| (start, ch, &self[start..]))
    }

    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
        self.char_indices().map(move |(start, ch)| {
            let width = ch.len_utf8();
            // `start + width` is the byte offset of the following character
            // (or the end of the string), so it is always a char boundary.
            let rest = &self[start + width..];
            (ch, width, rest)
        })
    }
}

/// UTF-16 implementation: elements are `u16` code units and indices are
/// code-unit offsets. Ill-formed sequences are decoded lossily.
#[cfg(feature = "encoding")]
impl EncodedStr for widestring::U16Str {
    const ELEMENT_LEN_BYTE: usize = core::mem::size_of::<u16>();

    /// Returns `true` only if every code unit is in the ASCII range.
    ///
    /// The check must be done per code unit, not per byte: a non-ASCII unit
    /// such as `0x4E00` is stored as the bytes `0x00` and `0x4E`, which are
    /// both below `0x80` and would pass a byte-wise ASCII test.
    fn is_ascii(&self) -> bool {
        self.as_slice().iter().all(|&unit| unit <= 0x7F)
    }

    fn as_bytes(&self) -> &[u8] {
        let units = self.as_slice();
        // SAFETY: `units` is a valid, initialized `[u16]`; reinterpreting it
        // as bytes covers exactly the same memory (`size_of_val` bytes), `u8`
        // has no alignment requirement, and the returned slice borrows `self`.
        unsafe {
            core::slice::from_raw_parts(
                units.as_ptr() as *const u8,
                core::mem::size_of_val(units),
            )
        }
    }

    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
        self.char_indices_lossy()
            .map(move |(start, ch)| (start, ch, &self[start..]))
    }

    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
        self.char_indices_lossy().map(move |(start, ch)| {
            // One unit for BMP characters (including the replacement character
            // produced for an unpaired surrogate), two for a surrogate pair.
            let width = ch.len_utf16();
            (ch, width, &self[start + width..])
        })
    }
}

/// UTF-32 implementation: elements are `u32` values, one per character, and
/// indices are element offsets. Invalid values are decoded lossily.
#[cfg(feature = "encoding")]
impl EncodedStr for widestring::U32Str {
    const ELEMENT_LEN_BYTE: usize = core::mem::size_of::<u32>();

    /// Returns `true` only if every element is in the ASCII range.
    ///
    /// The check must be done per element, not per byte: a non-ASCII value
    /// such as `0x0000_4E00` is stored as the bytes `0x00`, `0x4E`, `0x00`,
    /// `0x00`, which are all below `0x80` and would pass a byte-wise ASCII
    /// test.
    fn is_ascii(&self) -> bool {
        self.as_slice().iter().all(|&unit| unit <= 0x7F)
    }

    fn as_bytes(&self) -> &[u8] {
        let units = self.as_slice();
        // SAFETY: `units` is a valid, initialized `[u32]`; reinterpreting it
        // as bytes covers exactly the same memory (`size_of_val` bytes), `u8`
        // has no alignment requirement, and the returned slice borrows `self`.
        unsafe {
            core::slice::from_raw_parts(
                units.as_ptr() as *const u8,
                core::mem::size_of_val(units),
            )
        }
    }

    fn char_index_strs(&self) -> impl Iterator<Item = (usize, char, &Self)> {
        self.char_indices_lossy()
            .map(move |(start, ch)| (start, ch, &self[start..]))
    }

    fn char_len_next_strs(&self) -> impl Iterator<Item = (char, usize, &Self)> {
        // Every character occupies exactly one element in UTF-32.
        self.char_indices_lossy()
            .map(move |(start, ch)| (ch, 1, &self[start + 1..]))
    }
}
Loading

0 comments on commit 27266f2

Please sign in to comment.