Skip to content

Commit

Permalink
exclude typo
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 committed Jun 30, 2024
1 parent ddd1d26 commit 68dc0a6
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 19 deletions.
2 changes: 2 additions & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ba = "ba"
nd = "nd"
opt_nd = "opt_nd"
ANDed = "ANDed"
ody = "ody"

[default.extend-words]
arange = "arange"
Expand All @@ -19,6 +20,7 @@ strat = "strat"
wee = "wee"
ser = "ser"
ND = "ND"
ody = "ody"

[type.csv]
extend-glob = ["*.csv"]
Expand Down
78 changes: 60 additions & 18 deletions crates/polars-ops/src/chunked_array/strings/find_many.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
use arrow::array::Utf8ViewArray;
use polars_core::prelude::*;
use polars_core::utils::align_chunks_binary;

fn build_ac(patterns: &StringChunked, ascii_case_insensitive: bool) -> PolarsResult<AhoCorasick> {
AhoCorasickBuilder::new()
Expand All @@ -8,6 +10,16 @@ fn build_ac(patterns: &StringChunked, ascii_case_insensitive: bool) -> PolarsRes
.map_err(|e| polars_err!(ComputeError: "could not build aho corasick automaton {}", e))
}

/// Builds an Aho-Corasick automaton from the string values of a single Arrow array.
///
/// Null entries in `patterns` are skipped; only the non-null strings become patterns.
/// `ascii_case_insensitive` is forwarded unchanged to the automaton builder.
///
/// # Errors
///
/// Returns a `ComputeError` wrapping the builder's error when the automaton
/// cannot be constructed.
fn build_ac_arr(
    patterns: &Utf8ViewArray,
    ascii_case_insensitive: bool,
) -> PolarsResult<AhoCorasick> {
    // `flatten` drops the `None` slots, leaving only the valid pattern strings.
    let valid_patterns = patterns.into_iter().flatten();

    let mut ac_builder = AhoCorasickBuilder::new();
    ac_builder.ascii_case_insensitive(ascii_case_insensitive);

    match ac_builder.build(valid_patterns) {
        Ok(automaton) => Ok(automaton),
        Err(e) => Err(polars_err!(ComputeError: "could not build aho corasick automaton {}", e)),
    }
}

pub fn contains_any(
ca: &StringChunked,
patterns: &StringChunked,
Expand Down Expand Up @@ -43,32 +55,62 @@ pub fn replace_all(
Ok(ca.apply_generic(|opt_val| opt_val.map(|val| ac.replace_all(val, replace_with.as_slice()))))
}

/// Searches `val` with `ac` and hands every matched substring to `builder`.
///
/// When `overlapping` is true the overlapping search is used, otherwise the
/// regular (non-overlapping) one. The matches are passed to the builder in a
/// single `append_values_iter` call (presumably producing one list row per
/// call; confirm against the builder's documentation).
fn push(val: &str, builder: &mut ListStringChunkedBuilder, ac: &AhoCorasick, overlapping: bool) {
    if !overlapping {
        let hits = ac.find_iter(val).map(|hit| &val[hit.start()..hit.end()]);
        builder.append_values_iter(hits);
        return;
    }

    let hits = ac
        .find_overlapping_iter(val)
        .map(|hit| &val[hit.start()..hit.end()]);
    builder.append_values_iter(hits);
}

pub fn extract_many(
ca: &StringChunked,
patterns: &StringChunked,
patterns: &Series,
ascii_case_insensitive: bool,
overlapping: bool,
) -> PolarsResult<ListChunked> {
let ac = build_ac(patterns, ascii_case_insensitive)?;
match patterns.dtype() {
DataType::List(inner) if inner.is_string() => {
let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.len() * 2);
let patterns = patterns.list().unwrap();
let (ca, patterns) = align_chunks_binary(ca, patterns);

let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.len() * 2);
for (arr, pat_arr) in ca.downcast_iter().zip(patterns.downcast_iter()) {
for z in arr.into_iter().zip(pat_arr.into_iter()) {
match z {
(None, _) | (_, None) => builder.append_null(),
(Some(val), Some(pat)) => {
let pat = pat.as_any().downcast_ref::<Utf8ViewArray>().unwrap();
let ac = build_ac_arr(pat, ascii_case_insensitive)?;
push(val, &mut builder, &ac, overlapping);
},
}
}
}
Ok(builder.finish())
},
DataType::String => {
let patterns = patterns.str().unwrap();
let ac = build_ac(patterns, ascii_case_insensitive)?;
let mut builder = ListStringChunkedBuilder::new(ca.name(), ca.len(), ca.len() * 2);

for arr in ca.downcast_iter() {
for opt_val in arr.into_iter() {
if let Some(val) = opt_val {
if overlapping {
let iter = ac.find_overlapping_iter(val);
let iter = iter.map(|m| &val[m.start()..m.end()]);
builder.append_values_iter(iter);
} else {
let iter = ac.find_iter(val);
let iter = iter.map(|m| &val[m.start()..m.end()]);
builder.append_values_iter(iter);
for arr in ca.downcast_iter() {
for opt_val in arr.into_iter() {
if let Some(val) = opt_val {
push(val, &mut builder, &ac, overlapping);
} else {
builder.append_null();
}
}
} else {
builder.append_null();
}
}
Ok(builder.finish())
},
_ => {
polars_bail!(InvalidOperation: "expected 'String/List<String>' datatype for 'patterns' argument")
},
}
Ok(builder.finish())
}
3 changes: 2 additions & 1 deletion crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ impl Display for StringFunction {
ContainsMany { .. } => "contains_many",
#[cfg(feature = "find_many")]
ReplaceMany { .. } => "replace_many",
#[cfg(feature = "find_many")]
ExtractMany { .. } => "extract_many",
};
write!(f, "str.{s}")
Expand Down Expand Up @@ -432,7 +433,7 @@ fn extract_many(
overlapping: bool,
) -> PolarsResult<Series> {
let ca = s[0].str()?;
let patterns = s[1].str()?;
let patterns = &s[1];

polars_ops::chunked_array::strings::extract_many(
ca,
Expand Down

0 comments on commit 68dc0a6

Please sign in to comment.