diff --git a/src/app.rs b/src/app.rs index bbe72db..ea32037 100644 --- a/src/app.rs +++ b/src/app.rs @@ -196,10 +196,16 @@ pub fn app() -> App<'static, 'static> { .long("fst-dir") .help("Emit the table as a FST in Rust source code.") .takes_value(true); + let flag_flat_table = Arg::with_name("flat-table").long("flat-table").help( + "When emitting a map of a single codepoint to multiple codepoints, emit \ + entries as `(u32, [u32; 3])` instead of as `(u32, &[u32])` (replacing \ + `u32` with `char` if `--chars` is passed). \ + Conceptually unoccupied indices of the array will contain `!0u32` (for \ + u32) or `\\u{0}` (for `char`)." + ); let ucd_dir = Arg::with_name("ucd-dir") .required(true) .help("Directory containing the Unicode character database files."); - // Subcommands. let cmd_bidi_class = SubCommand::with_name("bidi-class") .author(clap::crate_authors!()) @@ -506,7 +512,8 @@ pub fn app() -> App<'static, 'static> { .arg(Arg::with_name("all-pairs").long("all-pairs").help( "Emit a table where each codepoint includes all possible \ Simple mappings.", - )); + )) + .arg(flag_flat_table.clone().requires("all-pairs")); let cmd_case_mapping = SubCommand::with_name("case-mapping") .author(clap::crate_authors!()) .version(clap::crate_version!()) @@ -520,7 +527,8 @@ pub fn app() -> App<'static, 'static> { "Only emit the simple case mapping tables \ (emit maps of codepoint to codepoint, \ ignoring rules from SpecialCasing.txt)", - )); + )) + .arg(flag_flat_table.clone().conflicts_with("simple")); let cmd_grapheme_cluster_break = SubCommand::with_name("grapheme-cluster-break") diff --git a/src/case_folding.rs b/src/case_folding.rs index 3c6f5c5..9d09875 100644 --- a/src/case_folding.rs +++ b/src/case_folding.rs @@ -67,7 +67,8 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> { } wtr.codepoint_to_codepoint(args.name(), &equiv)?; } else if args.is_present("all-pairs") { - wtr.multi_codepoint_to_codepoint(args.name(), &table_all)?; + let flat = args.is_present("flat-table"); + wtr.multi_codepoint_to_codepoint(args.name(), &table_all, flat)?; } else { wtr.codepoint_to_codepoint(args.name(), &table)?; } diff --git a/src/case_mapping.rs b/src/case_mapping.rs index f70ee3f..2b9375a 100644 --- a/src/case_mapping.rs +++ b/src/case_mapping.rs @@ -60,9 +60,10 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> { ); } } - wtr.codepoint_to_codepoints("LOWER", &lower_map)?; - wtr.codepoint_to_codepoints("UPPER", &upper_map)?; - wtr.codepoint_to_codepoints("TITLE", &upper_map)?; + let flat = args.is_present("flat-table"); + wtr.codepoint_to_codepoints("LOWER", &lower_map, flat)?; + wtr.codepoint_to_codepoints("UPPER", &upper_map, flat)?; + wtr.codepoint_to_codepoints("TITLE", &upper_map, flat)?; } Ok(()) } diff --git a/src/writer.rs b/src/writer.rs index dfde2d8..bad925c 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -594,6 +594,7 @@ impl Writer { &mut self, name: &str, map: &BTreeMap>, + emit_flat_table: bool, ) -> Result<()> { if self.opts.fst_dir.is_some() { return err!("cannot emit codepoint multimaps as an FST"); @@ -604,7 +605,7 @@ impl Writer { let vs2 = vs.iter().cloned().collect(); map2.insert(k, vs2); } - self.codepoint_to_codepoints(name, &map2) + self.codepoint_to_codepoints(name, &map2, emit_flat_table) } /// Write a map that associates codepoints with a sequence of other @@ -615,6 +616,7 @@ impl Writer { &mut self, name: &str, map: &BTreeMap>, + emit_flat_table: bool, ) -> Result<()> { if self.opts.fst_dir.is_some() { return err!("cannot emit codepoint->codepoints map as an FST"); @@ -625,11 +627,19 @@ impl Writer { let name = rust_const_name(name); let ty = self.rust_codepoint_type(); - writeln!( - self.wtr, - "pub const {}: &'static [({}, &'static [{}])] = &[", - name, ty, ty - )?; + if !emit_flat_table { + writeln!( + self.wtr, + "pub const {}: &'static [({}, &'static [{}])] = &[", + name, ty, ty + )?; + } else { + writeln!( + self.wtr, + "pub const {}: &'static [({}, [{}; 3])] = &[", + name, ty, ty + )?; + } 'LOOP: for (&k, vs) in map { // Make sure both our keys and values can be represented in the // user's chosen codepoint format. @@ -637,15 +647,43 @@ impl Writer { None => continue 'LOOP, Some(k) => k, }; + + let (padded_vs, slice_prefix) = if emit_flat_table { + // These checks are for future-proofing and cannot be hit currently. + if vs.len() > 3 { + return err!( + "flat-table representation cannot be used when value \ + arrays may contain more than 3 entries" + ); + } + let flat_padding = + if self.opts.char_literals { 0 } else { !0 }; + if vs.contains(&flat_padding) { + return err!( + "flat-table --chars representation cannot be used when \ + the NUL character is present in the value array. (This \ + error probably can be fixed by removing `--chars`)" + ); + } + let res = vs + .iter() + .copied() + .chain(std::iter::repeat(flat_padding)) + .take(3) + .collect::>(); + (res, "") + } else { + (vs.clone(), "&") + }; let mut vstrs = vec![]; - for &v in vs { + for v in padded_vs { match self.rust_codepoint(v) { None => continue 'LOOP, Some(v) => vstrs.push(v), } } - self.wtr.write_str(&format!("({}, &[", kstr))?; + self.wtr.write_str(&format!("({}, {}[", kstr, slice_prefix))?; if vstrs.len() == 1 { self.wtr.write_str(&format!("{}", &vstrs[0]))?; } else { @@ -1168,6 +1206,12 @@ impl Writer { fn rust_codepoint(&self, cp: u32) -> Option { if self.opts.char_literals { char::from_u32(cp).map(|c| format!("{:?}", c)) + } else if cp == !0 { + // Used to represent missing entries in some cases (specifically + // --flat-table), and writing it as `!0` makes the whole table much + // easier to read while maintaining identical semantics, even if + // `--flat-table` is not in use. + Some("!0".to_string()) } else { Some(cp.to_string()) }