diff --git a/build/Cargo.toml b/build/Cargo.toml
index 6fb737d..d215e63 100644
--- a/build/Cargo.toml
+++ b/build/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "nlprule-build"
 version = "0.4.5-pre"
-authors = ["Benjamin Minixhofer "]
+authors = ["Benjamin Minixhofer ", "Bernhard Schuster "]
 edition = "2018"
 license = "MIT OR Apache-2.0"
 description = "Build tools for a fast, low-resource Natural Language Processing and Error Correction library."
@@ -17,7 +17,9 @@ directories = "3"
 reqwest = { version = "0.11", default_features = false, features = ["blocking", "rustls-tls"]}
 nlprule = { path = "../nlprule", features = ["compile"], version = "0.4.5-pre" } # BUILD_BINDINGS_COMMENT
 # nlprule = { package = "nlprule-core", path = "../nlprule", features = ["compile"] } # BUILD_BINDINGS_UNCOMMENT
+fs-err = "2.5"

 [dev-dependencies]
 tempdir = "0.3"
-smush = "0.1.5"
\ No newline at end of file
+smush = "0.1.5"
+env_logger = "0.8"
diff --git a/build/src/lib.rs b/build/src/lib.rs
index 8fa7af7..344667c 100644
--- a/build/src/lib.rs
+++ b/build/src/lib.rs
@@ -1,28 +1,58 @@
 //! This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule).
 //! See `README.md` for details.

-use flate2::read::GzDecoder;
+use flate2::bufread::GzDecoder;
+use fs::File;
+use fs_err as fs;
 use nlprule::{compile, rules_filename, tokenizer_filename};
+use std::fs::Permissions;
 use std::{
-    fs::{self, File},
-    io::{self, BufWriter, Cursor, Read},
+    io::{self, BufReader, BufWriter, Cursor, Read, Seek, SeekFrom, Write},
     path::{Path, PathBuf},
+    result,
 };
-use thiserror::Error;
 use zip::result::ZipError;

-#[derive(Debug, Error)]
+pub type OtherError = Box<dyn std::error::Error + Send + Sync + 'static>;
+
+#[derive(Debug, thiserror::Error)]
 pub enum Error {
-    #[error("request error: {0}")]
+    #[error(transparent)]
     RequestError(#[from] reqwest::Error),
-    #[error("i/o error: {0}")]
+    #[error("Binaries were not found on the remote")]
+    BinariesNotFound,
+    #[error("Failed to validate {1:?} binary for lang {0}")]
+    ValidationFailed(String, Binary, #[source] nlprule::Error),
+    #[error(transparent)]
     IOError(#[from] io::Error),
-    #[error("zip error: {0}")]
+    #[error(transparent)]
     ZipError(#[from] ZipError),
     #[error("error postprocessing binaries: {0}")]
-    PostprocessingError(Box<dyn std::error::Error>),
+    PostprocessingError(#[source] OtherError),
+    #[error("error transforming binaries: {0}")]
+    TransformError(#[source] OtherError),
+    #[error("Collation failed")]
+    CollationFailed(#[source] nlprule::compile::Error),
+}
+
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Definition of the data transformation for the network-retrieved, binencoded rules and tokenizer datasets.
+pub trait TransformDataFn:
+    for<'w> Fn(Box<dyn Read>, Box<dyn Write + 'w>) -> result::Result<(), OtherError>
+{
+}
+
+impl<T> TransformDataFn for T where
+    T: for<'w> Fn(Box<dyn Read>, Box<dyn Write + 'w>) -> result::Result<(), OtherError>
+{
+}
+
+/// Definition of the path transformation for the network-retrieved, binencoded rules and tokenizer datasets.
+pub trait TransformPathFn: Fn(PathBuf) -> result::Result<PathBuf, OtherError> {}
+
+impl<T> TransformPathFn for T where T: Fn(PathBuf) -> result::Result<PathBuf, OtherError> {}
+
 #[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
 pub enum Binary {
     Tokenizer,
@@ -40,50 +70,141 @@ impl Binary {

 /// Stores the binaries for the given language and version in out_dir.
 /// Tries downloading the binaries from their distribution source.
-/// Optionally caches them at some directory.
-pub fn get_binary<P: AsRef<Path>>(
+///
+/// This implicitly unpacks the originally gzip'd sources and returns
+/// an in-memory buffer.
+fn obtain_binary_from_github_release(
     version: &str,
     lang_code: &str,
     binary: Binary,
-    cache_dir: Option<P>,
-) -> Result<Cursor<Vec<u8>>, Error> {
+) -> Result<Cursor<Vec<u8>>> {
     let filename = binary.filename(lang_code);

-    let mut cache_path: Option<PathBuf> = None;
-
-    if let Some(dir) = cache_dir {
-        cache_path = Some(dir.as_ref().join(version).join(lang_code).join(&filename));
-    }
-
-    // if the file can be read, the data is already cached
-    if let Some(path) = &cache_path {
-        if let Ok(bytes) = fs::read(path) {
-            return Ok(Cursor::new(bytes));
-        }
-    }
-
-    // ... otherwise, request the data from the URL ...
     let bytes = reqwest::blocking::get(&format!(
         "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz",
         version, filename
     ))?
-    .error_for_status()?
+    .error_for_status()
+    .map_err(|e| {
+        if let Some(404) = e.status().map(|x| x.as_u16()) {
+            Error::BinariesNotFound
+        } else {
+            e.into()
+        }
+    })?
    .bytes()?;

     let mut gz = GzDecoder::new(&bytes[..]);
     let mut buffer = Vec::new();
     gz.read_to_end(&mut buffer)?;

-    // ... and then cache the data at the provided file, if one was found
-    if let Some(path) = &cache_path {
-        fs::create_dir_all(path.parent().expect("path must have parent"))?;
-        fs::write(path, &buffer)?;
+    Ok(Cursor::new(buffer))
+}
+
+fn construct_cache_path(
+    version: &str,
+    lang_code: &str,
+    binary: Binary,
+    cache_dir: Option<&PathBuf>,
+    transform_path_fn: Option<&dyn TransformPathFn>,
+) -> Result<Option<PathBuf>> {
+    let filename = binary.filename(lang_code);
+
+    cache_dir
+        .map(move |dir| {
+            let path = dir.join(version).join(lang_code).join(&filename);
+            Ok(if let Some(transform_path_fn) = transform_path_fn {
+                transform_path_fn(path).map_err(Error::TransformError)?
+            } else {
+                path
+            })
+        })
+        .transpose()
+}
+
+/// Returns a `dyn Read` type that is either directly fed
+/// from an in-memory slice or from the on-disk cache.
+/// If the on-disk cache is disabled or is not present,
+/// it will attempt to download it via [`obtain_binary_from_github_release`].
+/// The data is then written into `dest`.
+/// Also updates the cache.
+fn obtain_binary_cache_or_github(
+    version: &str,
+    lang_code: &str,
+    binary: Binary,
+    cache_dir: Option<&PathBuf>,
+    transform_path_fn: Option<&dyn TransformPathFn>,
+    transform_data_fn: Option<&dyn TransformDataFn>,
+) -> Result<Box<dyn Read>> {
+    let cache_path =
+        construct_cache_path(version, lang_code, binary, cache_dir, transform_path_fn)?;
+
+    // if the file can be read, the data is already cached and the transform was applied before
+    if let Some(ref cache_path) = cache_path {
+        if let Ok(bytes) = fs::read(cache_path) {
+            return Ok(Box::new(Cursor::new(bytes)));
+        }
     }

-    Ok(Cursor::new(buffer))
+    // the binencoded data from github
+    let reader_binenc = obtain_binary_from_github_release(version, lang_code, binary)?;
+
+    // apply the transform if any to an intermediate buffer
+    let mut reader_transformed = if let Some(transform_data_fn) = transform_data_fn {
+        // TODO this is not optimal, the additional copy is a bit annoying
+        let mut intermediate = Box::new(Cursor::new(Vec::<u8>::new()));
+        transform_data_fn(Box::new(reader_binenc), Box::new(&mut intermediate))
+            .map_err(Error::TransformError)?;
+        intermediate
+    } else {
+        Box::new(reader_binenc)
+    };
+
+    // update the cache entry
+    if let Some(ref cache_path) = cache_path {
+        fs::create_dir_all(cache_path.parent().expect("path must have parent"))?;
+        let mut cache_file = fs::OpenOptions::new()
+            .truncate(true)
+            .create(true)
+            .write(true)
+            .open(cache_path)?;
+        io::copy(&mut reader_transformed, &mut cache_file)?;
+    }
+
+    // move the cursor back to the beginning
+    reader_transformed.seek(SeekFrom::Start(0_u64))?;
+
+    Ok(reader_transformed)
+}
+
+fn assure_binary_availability(
+    version: &str,
+    lang_code: &str,
+    binary: Binary,
+    cache_dir: Option<&PathBuf>,
+    transform_path_fn: Option<&dyn TransformPathFn>,
+    transform_data_fn: Option<&dyn TransformDataFn>,
+    out: PathBuf,
+) -> Result<()> {
+    let mut source = obtain_binary_cache_or_github(
+        version,
+        lang_code,
+        binary,
+        cache_dir,
+        transform_path_fn,
+        transform_data_fn,
+    )?;
+
+    let mut out_file = fs::OpenOptions::new()
+        .truncate(true)
+        .create(true)
+        .write(true)
+        .open(out)?;
+    io::copy(&mut source, &mut out_file)?;
+    Ok(())
 }

-pub fn get_build_dir<P: AsRef<Path>>(lang_code: &str, out_dir: P) -> Result<(), Error> {
+pub fn get_build_dir<P: AsRef<Path>>(lang_code: &str, out_dir: P) -> Result<()> {
     let bytes = reqwest::blocking::get(&format!(
         "https://f000.backblazeb2.com/file/nlprule/{}.zip",
         lang_code
@@ -123,7 +244,7 @@ pub fn get_build_dir<P: AsRef<Path>>(lang_code: &str, out_dir: P) -> Result<(),
                 use std::os::unix::fs::PermissionsExt;

                 if let Some(mode) = file.unix_mode() {
-                    fs::set_permissions(&outpath, fs::Permissions::from_mode(mode))?;
+                    fs::set_permissions(&outpath, Permissions::from_mode(mode))?;
                 }
             }
         }
@@ -139,7 +260,6 @@ pub fn supported_language_codes() -> Vec<&'static str> {
 }

 /// Places all nlprule binaries for the given languages in some directory.
-#[derive(Debug)]
 pub struct BinaryBuilder {
     language_codes: Vec<String>,
     out_dir: PathBuf,
@@ -148,52 +268,62 @@ pub struct BinaryBuilder {
     fallback_to_build_dir: bool,
     build_dir: Option<PathBuf>,
     outputs: Vec<PathBuf>,
+    transform_path_fn: Option<Box<dyn TransformPathFn>>,
+    transform_data_fn: Option<Box<dyn TransformDataFn>>,
 }

 impl BinaryBuilder {
+    /// ```plain
+    /// github release resource --[fn transform]--> $cache_dir --[fn postprocess]--> $OUT_DIR/
+    /// ```
+    ///
+    /// ```plain
+    /// github release resource --[fn transform]--> $OUT_DIR --[fn postprocess]--> $OUT_DIR/
+    /// ```
+    ///
     /// Acquires the rule and tokenizer binaries for one language by:
     /// - Trying to download them from their distribution source (or load them from the local cache).
     /// - If they are not found (i.e. a dev version of nlprule is used) and `fallback_to_build_dir` is true,
     ///   downloads the latest build directory and builds the binaries from it.
     ///   This can still fail if the dev version is sufficiently outdated for the latest build dir.
-    ///   In that case, the user can't be helped.
-    fn build_language(&mut self, lang_code: &str) {
-        let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code));
-        let rules_out = self.out_dir.join(rules_filename(lang_code));
+    ///   In that case, the user is encouraged to update to a release or a newer git sha.
+    fn build_language(&mut self, lang_code: &str) -> Result<()> {
+        // adjust the destination path for now
+        let path_transform = |out: PathBuf| -> Result<PathBuf> {
+            Ok(
+                if let Some(ref transform_path_fn) = self.transform_path_fn {
+                    transform_path_fn(out).map_err(Error::TransformError)?
+                } else {
+                    out
+                },
+            )
+        };
+
+        let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?;
+        let rules_out = path_transform(self.out_dir.join(rules_filename(lang_code)))?;

         let mut did_not_find_binaries = false;

-        for (binary, out) in [
+        for (binary, out) in &[
             (Binary::Tokenizer, &tokenizer_out),
             (Binary::Rules, &rules_out),
-        ]
-        .iter()
-        {
-            let response = get_binary(&self.version, lang_code, *binary, self.cache_dir.as_ref());
-
-            (match response {
-                Ok(mut reader) => {
-                    let mut bytes = Vec::new();
-                    reader
-                        .read_to_end(&mut bytes)
-                        .expect("reading binary bytes failed");
-
-                    fs::write(out, bytes).expect("writing binary to file failed");
-
-                    Ok(())
-                }
-                Err(Error::RequestError(error)) => {
-                    if let Some(404) = error.status().map(|x| x.as_u16()) {
-                        // binaries for this version are not distributed, fall back to building them
-                        did_not_find_binaries = true;
-                        break;
-                    }
-
-                    Err(error.into())
+        ] {
+            let out = out.to_owned().to_owned();
+            match assure_binary_availability(
+                &self.version,
+                lang_code,
+                *binary,
+                self.cache_dir.as_ref(),
+                self.transform_path_fn.as_ref().map(|x| x.as_ref()),
+                self.transform_data_fn.as_ref().map(|x| x.as_ref()),
+                out,
+            ) {
+                Err(Error::BinariesNotFound) => {
+                    did_not_find_binaries = true;
+                    break;
                 }
-                Err(x) => Err(x),
-            })
-            .expect("error loading binary")
+                res => res?,
+            }
         }

         if did_not_find_binaries && self.fallback_to_build_dir {
@@ -210,12 +340,42 @@ impl BinaryBuilder {
             get_build_dir(lang_code, &build_dir).expect("error loading build directory");
         }

-        compile::compile(&compile::BuildOptions {
-            build_dir,
-            rules_out: rules_out.clone(),
-            tokenizer_out: tokenizer_out.clone(),
-        })
-        .expect("Compiling from build directory failed. Upgrading to a more recent development version of NLPRule might fix this problem.");
+        let mut rules_sink = BufWriter::new(
+            fs::OpenOptions::new()
+                .truncate(true)
+                .create(true)
+                .write(true)
+                .open(&rules_out)?,
+        );
+        let mut tokenizer_sink = BufWriter::new(
+            fs::OpenOptions::new()
+                .truncate(true)
+                .create(true)
+                .write(true)
+                .open(&tokenizer_out)?,
+        );
+        if let Some(ref transform_data_fn) = self.transform_data_fn {
+            let mut transfer_buffer_rules = Cursor::new(Vec::new());
+            let mut transfer_buffer_tokenizer = Cursor::new(Vec::new());
+
+            compile::compile(
+                build_dir,
+                &mut transfer_buffer_rules,
+                &mut transfer_buffer_tokenizer,
+            )
+            .map_err(Error::CollationFailed)?;
+
+            // rewind the freshly written buffers so the transform reads them from the start
+            transfer_buffer_rules.set_position(0);
+            transfer_buffer_tokenizer.set_position(0);
+
+            transform_data_fn(Box::new(transfer_buffer_rules), Box::new(rules_sink))
+                .map_err(Error::TransformError)?;
+            transform_data_fn(
+                Box::new(transfer_buffer_tokenizer),
+                Box::new(tokenizer_sink),
+            )
+            .map_err(Error::TransformError)?;
+        } else {
+            compile::compile(build_dir, &mut rules_sink, &mut tokenizer_sink)
+                .map_err(Error::CollationFailed)?;
+        };
         } else if did_not_find_binaries {
             panic!(
                 "Did not find binaries for version {}. \
@@ -227,21 +387,25 @@ impl BinaryBuilder {

         self.outputs.push(tokenizer_out);
         self.outputs.push(rules_out);
+        Ok(())
     }

     /// Creates a new binary builder. `language_codes` must be in ISO 639-1 (two-letter) format.
-    /// If `language_codes` is `None`, uses all supported languages.
+    /// If `language_codes` is `&[]`, uses all supported languages.
     /// If this is used in a `build.rs`, `out_dir` should probably be the OUT_DIR environment variable.
-    pub fn new<P: AsRef<Path>>(language_codes: Option<&[&str]>, out_dir: P) -> Self {
-        let language_codes: Vec<_> = language_codes.map_or_else(
-            || {
-                supported_language_codes()
-                    .into_iter()
-                    .map(|x| x.to_owned())
-                    .collect()
-            },
-            |x| x.iter().map(|x| x.to_string()).collect(),
-        );
+    pub fn new<P: AsRef<Path>>(language_codes: &[&str], out_dir: P) -> Self {
+        let language_codes: Vec<_> = if language_codes.is_empty() {
+            supported_language_codes()
+                .into_iter()
+                .map(ToOwned::to_owned)
+                .collect()
+        } else {
+            language_codes
+                .iter()
+                .map(ToOwned::to_owned)
+                .map(ToOwned::to_owned)
+                .collect::<Vec<String>>()
+        };

         let project_dir = directories::ProjectDirs::from("", "", "nlprule");
         // this should be CARGO_ARTIFACT_DIR once it is merged: https://github.com/rust-lang/rfcs/pull/3035
@@ -258,6 +422,8 @@ impl BinaryBuilder {
             fallback_to_build_dir: false,
             build_dir,
             outputs: Vec::new(),
+            transform_data_fn: None,
+            transform_path_fn: None,
         }
     }
@@ -297,36 +463,27 @@ impl BinaryBuilder {
     }

     /// Builds by {downloading, copying, building} the binaries to the out directory.
-    pub fn build(mut self) -> Self {
-        for lang_code in self.language_codes[..].to_vec() {
-            self.build_language(&lang_code);
-        }
-        self
+    pub fn build(mut self) -> Result<Self> {
+        self.language_codes
+            .clone()
+            .into_iter()
+            .try_for_each(|lang_code| self.build_language(&lang_code))?;
+        Ok(self)
     }

     /// Validates the binaries by checking if they can be loaded by nlprule.
-    pub fn validate(self) -> Self {
+    pub fn validate(self) -> Result<Self> {
         for lang_code in &self.language_codes {
             let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code));
             let rules_out = self.out_dir.join(rules_filename(lang_code));

-            nlprule::Rules::new(rules_out).unwrap_or_else(|e| {
-                panic!(
-                    "failed to validate rules binary for {lang_code}: {error}",
-                    lang_code = lang_code,
-                    error = e
-                )
-            });
-            nlprule::Tokenizer::new(tokenizer_out).unwrap_or_else(|e| {
-                panic!(
-                    "failed to validate tokenizer binary for {lang_code}: {error}",
-                    lang_code = lang_code,
-                    error = e
-                )
-            });
+            nlprule::Rules::new(rules_out)
+                .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?;
+            nlprule::Tokenizer::new(tokenizer_out)
+                .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?;
         }

-        self
+        Ok(self)
     }

     /// Gets the paths to all files this builder created.
@@ -334,6 +491,27 @@ impl BinaryBuilder {
         &self.outputs
     }

+    /// Applies the given transformation function to the binary immediately after obtaining it.
+    /// This happens before placing the file in the cache (if any), so by using a compression
+    /// function the size of the cache directory can be reduced.
+    /// Modifies the path of the cached binaries by the given `path_fn`.
+    /// If no cache directory is set or the binaries are built from the build dir, the `path_fn` does nothing.
+    ///
+    /// The resulting files will then reside in the given cache dir, if any.
+    ///
+    /// Attention: Any compression applied here must be undone in the closure
+    /// provided to `fn postprocess` to retain the original binenc file
+    /// to be consumed by the application code.
+    pub fn transform(
+        mut self,
+        proc_fn: &'static (dyn TransformDataFn),
+        path_fn: &'static (dyn TransformPathFn),
+    ) -> Self {
+        self.transform_data_fn = Some(Box::new(proc_fn));
+        self.transform_path_fn = Some(Box::new(path_fn));
+        self
+    }
+
     /// Applies the given postprocessing function to the binaries, e.g. for compression.
     /// Modifies the output path by the given path function.
     ///
@@ -345,7 +523,7 @@ impl BinaryBuilder {
     /// # let tempdir = tempdir::TempDir::new("builder_test")?;
     /// # let tempdir = tempdir.path();
     /// #
-    /// # let mut builder = BinaryBuilder::new(Some(&["en"]), tempdir).version("0.3.0");
+    /// # let mut builder = BinaryBuilder::new(&["en"], tempdir).version("0.3.0");
     /// builder
     ///     .build()
     ///     .validate()
@@ -364,24 +542,21 @@ impl BinaryBuilder {
     ///     )?;
     /// # Ok::<(), nlprule_build::Error>(())
     /// ```
-    pub fn postprocess<C, F, P: AsRef<Path>>(
-        mut self,
-        proc_fn: C,
-        path_fn: F,
-    ) -> Result<Self, Error>
+    pub fn postprocess<C, F, P>(mut self, proc_fn: C, path_fn: F) -> Result<Self>
     where
-        C: Fn(Vec<u8>, BufWriter<File>) -> Result<(), Box<dyn std::error::Error>>,
+        C: Fn(BufReader<File>, BufWriter<File>) -> result::Result<(), OtherError>,
         F: Fn(PathBuf) -> P,
+        P: AsRef<Path>,
     {
-        for (i, path) in self.outputs.to_vec().iter().enumerate() {
-            let buffer = fs::read(&path)?;
+        for (i, path) in self.outputs.clone().into_iter().enumerate() {
+            let reader = BufReader::new(fs::File::open(&path)?);

             let new_path = path_fn(path.clone());
             let new_path = new_path.as_ref();
             let writer = BufWriter::new(File::create(new_path)?);

-            proc_fn(buffer, writer).map_err(Error::PostprocessingError)?;
+            proc_fn(reader, writer).map_err(Error::PostprocessingError)?;

             if new_path != path {
                 self.outputs[i] = new_path.to_path_buf();
@@ -400,15 +575,19 @@ mod tests {
     use super::*;

     #[test]
-    fn getting_binary_works() -> Result<(), Error> {
+    fn getting_binary_works() -> Result<()> {
         // this is nice to keep roughly in sync with the latest released version but it is not necessary
-        get_binary::<PathBuf>("0.3.0", "en", Binary::Rules, None)?;
+        let tempdir = tempdir::TempDir::new("build_dir")?;
+        let tempdir = tempdir.path().join("foo.bin");
+        assure_binary_availability("0.3.0", "en", Binary::Rules, None, None, None, tempdir)?;

         Ok(())
     }

     #[test]
-    fn getting_build_dir_works() -> Result<(), Error> {
+    fn getting_build_dir_works() -> Result<()> {
+        let _ = env_logger::builder().is_test(true).try_init();
+
         let tempdir = tempdir::TempDir::new("build_dir_test")?;
         let tempdir = tempdir.path();

@@ -420,42 +599,44 @@ mod tests {
     }

     #[test]
-    fn binary_builder_works() -> Result<(), Error> {
+    fn binary_builder_works() -> Result<()> {
         let tempdir = tempdir::TempDir::new("builder_test")?;
         let tempdir = tempdir.path();

-        BinaryBuilder::new(None, tempdir)
+        BinaryBuilder::new(&[], tempdir)
             .fallback_to_build_dir(true)
-            .build()
-            .validate();
+            .build()?
+            .validate()?;

         Ok(())
     }

     #[test]
-    fn binary_builder_works_with_released_version() -> Result<(), Error> {
+    fn binary_builder_works_with_released_version() -> Result<()> {
         let tempdir = tempdir::TempDir::new("builder_test")?;
         let tempdir = tempdir.path();

-        BinaryBuilder::new(Some(&["en"]), tempdir)
+        BinaryBuilder::new(&["en"], tempdir)
             .version("0.3.0")
-            .build();
+            .build()?;

         Ok(())
     }

     #[test]
-    fn binary_builder_works_with_smush() -> Result<(), Error> {
+    fn binary_builder_works_with_smush() -> Result<()> {
         let tempdir = tempdir::TempDir::new("builder_test")?;
         let tempdir = tempdir.path();

-        BinaryBuilder::new(Some(&["en"]), tempdir)
+        BinaryBuilder::new(&["en"], tempdir)
             .version("0.3.0")
-            .build()
+            .build()?
             .postprocess(
-                |buffer, mut writer| {
+                |mut buffer, mut writer| {
+                    let mut tmp = Vec::new();
+                    buffer.read_to_end(&mut tmp)?;
                     Ok(writer.write_all(&smush::encode(
-                        &buffer,
+                        &tmp,
                         smush::Codec::Gzip,
                         smush::Quality::Default,
                     )?)?)
@@ -477,18 +658,20 @@
     }

     #[test]
-    fn binary_builder_works_with_flate2() -> Result<(), Error> {
+    fn binary_builder_works_with_flate2() -> Result<()> {
         let tempdir = tempdir::TempDir::new("builder_test")?;
         let tempdir = tempdir.path();

-        let builder = BinaryBuilder::new(Some(&["en"]), tempdir)
+        let builder = BinaryBuilder::new(&["en"], tempdir)
             .version("0.3.0")
-            .build()
+            .build()?
             .postprocess(
-                |buffer, writer| {
+                |mut buffer, writer| {
+                    let mut tmp = Vec::new();
+                    buffer.read_to_end(&mut tmp)?;
                     Ok(
                         flate2::write::GzEncoder::new(writer, flate2::Compression::default())
-                            .write_all(&buffer)?,
+                            .write_all(&tmp)?,
                     )
                 },
                 |p| {
@@ -519,4 +702,76 @@ mod tests {

         Ok(())
     }
+
+    #[test]
+    fn build_with_zstd_transform() -> Result<()> {
+        let tempdir = tempdir::TempDir::new("builder_test")?;
+        let tempdir = tempdir.path();
+
+        let builder = BinaryBuilder::new(&["en"], tempdir)
+            .version("0.3.0")
+            .transform(
+                &|mut buffer, mut writer| {
+                    let mut data = Vec::new();
+                    buffer.read_to_end(&mut data)?;
+                    let data = smush::encode(
+                        data.as_slice(),
+                        smush::Codec::Zstd,
+                        smush::Quality::Maximum,
+                    )?;
+                    writer.write_all(&data)?;
+
+                    // XXX hangs forever, both with `smush` and `brotli` directly
+                    // let _ = brotli::BrotliCompress(&mut buffer, &mut sink, &Default::default())?;
+                    Ok(())
+                },
+                &|p: PathBuf| {
+                    let mut s = p.to_string_lossy().to_string();
+                    s.push_str(".zstd");
+                    Ok(PathBuf::from(s))
+                },
+            )
+            .build()?
+            .postprocess(
+                |mut buffer, mut writer| {
+                    dbg!("postprocessing");
+                    let mut tmp = Vec::new();
+                    buffer.read_to_end(&mut tmp)?;
+                    let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?;
+                    writer.write_all(data.as_slice())?;
+                    Ok(())
+                    // XXX
+                    // Ok(brotli::BrotliDecompress(&mut buffer, &mut writer)?)
+                },
+                |p| {
+                    let path = p.to_string_lossy();
+                    assert!(path.ends_with(".zstd"));
+                    let end = path.len().saturating_sub(".zstd".len());
+                    assert_ne!(end, 0);
+                    path[..end].to_owned()
+                },
+            )?;
+
+        assert_eq!(
+            builder.outputs(),
+            &[
+                tempdir.join("en_tokenizer.bin"),
+                tempdir.join("en_rules.bin")
+            ]
+        );
+
+        let rules_path = tempdir
+            .join(Path::new(&rules_filename("en")))
+            .with_extension("bin");
+        assert!(rules_path.exists());
+
+        // The following will always fail since the versions will mismatch and rebuilding does not make sense;
+        // `get_build_dir` is tested separately.
+        //
+        // ```rust,no_run
+        // let _ = nlprule::Rules::new(rules_path)
+        //     .map_err(|e| Error::ValidationFailed("en".to_owned(), Binary::Rules, e))?;
+        // ```
+        Ok(())
+    }
 }
diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml
index 68ee9b1..9b6a9bd 100644
--- a/nlprule/Cargo.toml
+++ b/nlprule/Cargo.toml
@@ -27,6 +27,7 @@ indexmap = { version = "1", features = ["serde"] }
 unicase = "2.6"
 derivative = "2.2"
 fst = "0.4"
+fs-err = "2.5"
 aho-corasick = "0.7"
 half = { version = "1.7", features = ["serde"] }
 srx = { version = "^0.1.2", features = ["serde"] }
diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs
index 0b52c1a..6f63412 100644
--- a/nlprule/src/bin/compile.rs
+++ b/nlprule/src/bin/compile.rs
@@ -1,9 +1,29 @@
 use clap::Clap;
-use nlprule::compile::{compile, BuildOptions, Error};
+use fs_err as fs;
+use nlprule::compile::{compile, Error};
+use std::io::BufWriter;
+use std::path::PathBuf;
+
+#[derive(clap::Clap)]
+#[clap(
+    version = env!("CARGO_PKG_VERSION"),
+    author = "Benjamin Minixhofer "
+)]
+pub struct BuildOptions {
+    #[clap(long, parse(from_os_str))]
+    pub build_dir: PathBuf,
+    #[clap(long, parse(from_os_str))]
+    pub tokenizer_out: PathBuf,
+    #[clap(long, parse(from_os_str))]
+    pub rules_out: PathBuf,
+}

 fn main() -> Result<(), Error> {
     env_logger::init();
     let opts = BuildOptions::parse();

-    compile(&opts)
+    let tokenizer_sink = BufWriter::new(fs::File::create(&opts.tokenizer_out)?);
+    let rules_sink = BufWriter::new(fs::File::create(&opts.rules_out)?);
+
+    compile(opts.build_dir, rules_sink, tokenizer_sink)
 }
diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs
index 34da90a..2ed3054 100644
--- a/nlprule/src/compile/mod.rs
+++ b/nlprule/src/compile/mod.rs
@@ -1,9 +1,11 @@
 //! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md.

+use fs::File;
+use fs_err as fs;
+
 use std::{
-    fs::{self, File},
     hash::{Hash, Hasher},
-    io::{BufReader, BufWriter},
+    io::{self, BufReader, BufWriter},
     num::ParseIntError,
     path::{Path, PathBuf},
     str::FromStr,
@@ -15,7 +17,6 @@ use crate::{
     tokenizer::{chunk::Chunker, multiword::MultiwordTagger, tag::Tagger, Tokenizer},
     types::DefaultHasher,
 };
-use clap::Clap;
 use log::info;

 use self::parse_structure::{BuildInfo, RegexCache};
@@ -26,20 +27,6 @@ mod parse_structure;
 mod structure;
 mod utils;

-#[derive(Clap)]
-#[clap(
-    version = env!("CARGO_PKG_VERSION"),
-    author = "Benjamin Minixhofer "
-)]
-pub struct BuildOptions {
-    #[clap(long, parse(from_os_str))]
-    pub build_dir: PathBuf,
-    #[clap(long, parse(from_os_str))]
-    pub tokenizer_out: PathBuf,
-    #[clap(long, parse(from_os_str))]
-    pub rules_out: PathBuf,
-}
-
 struct BuildFilePaths {
     lang_code_path: PathBuf,
     tag_paths: Vec<PathBuf>,
@@ -95,8 +82,12 @@ pub enum Error {
     ParseError(#[from] ParseIntError),
 }

-pub fn compile(opts: &BuildOptions) -> Result<(), Error> {
-    let paths = BuildFilePaths::new(&opts.build_dir);
+pub fn compile(
+    build_dir: impl AsRef<Path>,
+    mut rules_dest: impl io::Write,
+    mut tokenizer_dest: impl io::Write,
+) -> Result<(), Error> {
+    let paths = BuildFilePaths::new(&build_dir);

     let lang_code = fs::read_to_string(paths.lang_code_path)?;

@@ -185,14 +176,11 @@ pub fn compile(opts: &BuildOptions) -> Result<(), Error> {
         tokenizer_options,
     )?;

-    let f = BufWriter::new(File::create(&opts.tokenizer_out)?);
-    bincode::serialize_into(f, &tokenizer)?;
+    bincode::serialize_into(&mut tokenizer_dest, &tokenizer)?;

     info!("Creating grammar rules.");
     let rules = Rules::from_xml(&paths.grammar_path, &mut build_info, &rules_options);
-
-    let f = BufWriter::new(File::create(&opts.rules_out)?);
-    bincode::serialize_into(f, &rules)?;
+    bincode::serialize_into(&mut rules_dest, &rules)?;

     // we need to write the regex cache after building the rules, otherwise it isn't fully populated
     let f = BufWriter::new(File::create(&paths.regex_cache_path)?);
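---

For context (not part of the patch): below is a sketch of how a downstream `build.rs` could drive the reworked, `Result`-based builder API. The consumer crate is hypothetical; it assumes `nlprule-build` and `smush` are listed under `[build-dependencies]` and that release binaries exist on GitHub for the pinned nlprule version. The zstd round-trip mirrors the `build_with_zstd_transform` test above.

```rust
// build.rs of a hypothetical consumer crate (sketch, not the patch itself).
use std::io::{Read, Write};
use std::path::PathBuf;

fn main() -> Result<(), nlprule_build::Error> {
    let out_dir = PathBuf::from(std::env::var("OUT_DIR").expect("OUT_DIR is set by cargo"));

    nlprule_build::BinaryBuilder::new(&["en"], &out_dir)
        // Keep the cache small: store the binaries zstd-compressed and give
        // the cached files a distinguishing extension via the path transform.
        .transform(
            &|mut reader, mut writer| {
                let mut data = Vec::new();
                reader.read_to_end(&mut data)?;
                writer.write_all(&smush::encode(
                    &data,
                    smush::Codec::Zstd,
                    smush::Quality::Default,
                )?)?;
                Ok(())
            },
            &|p: PathBuf| {
                let mut s = p.to_string_lossy().to_string();
                s.push_str(".zstd");
                Ok(PathBuf::from(s))
            },
        )
        .build()?
        // Undo the compression so the application code can consume the
        // original binenc files from OUT_DIR, as the `transform` docs require.
        .postprocess(
            |mut reader, mut writer| {
                let mut data = Vec::new();
                reader.read_to_end(&mut data)?;
                writer.write_all(&smush::decode(&data, smush::Codec::Zstd)?)?;
                Ok(())
            },
            |p| {
                let s = p.to_string_lossy().to_string();
                PathBuf::from(s.trim_end_matches(".zstd"))
            },
        )?
        .validate()?;

    Ok(())
}
```

Note the symmetry the `transform` docs call out: `transform` runs when a binary is obtained and decides what lands in the cache (and under which name), while `postprocess` must invert that encoding before the binaries land under their final names in `OUT_DIR`.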