Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for emitting flattened case tables #34

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions src/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,16 @@ pub fn app() -> App<'static, 'static> {
.long("fst-dir")
.help("Emit the table as a FST in Rust source code.")
.takes_value(true);
let flag_flat_table = Arg::with_name("flat-table").long("flat-table").help(
"When emitting a map of a single codepoint to multiple codepoints, emit \
entries as `(u32, [u32; 3])` instead of as `(u32, &[u32])` (replacing \
`u32` with `char` if `--chars` is passed). \
Conceptually unoccupied indices of the array will contain `!0u32` (for \
u32) or `\\u{0}` (for `char`)."
);
let ucd_dir = Arg::with_name("ucd-dir")
.required(true)
.help("Directory containing the Unicode character database files.");

// Subcommands.
let cmd_bidi_class = SubCommand::with_name("bidi-class")
.author(clap::crate_authors!())
Expand Down Expand Up @@ -506,7 +512,8 @@ pub fn app() -> App<'static, 'static> {
.arg(Arg::with_name("all-pairs").long("all-pairs").help(
"Emit a table where each codepoint includes all possible \
Simple mappings.",
));
))
.arg(flag_flat_table.clone().requires("all-pairs"));
let cmd_case_mapping = SubCommand::with_name("case-mapping")
.author(clap::crate_authors!())
.version(clap::crate_version!())
Expand All @@ -520,7 +527,8 @@ pub fn app() -> App<'static, 'static> {
"Only emit the simple case mapping tables \
(emit maps of codepoint to codepoint, \
ignoring rules from SpecialCasing.txt)",
));
))
.arg(flag_flat_table.clone().conflicts_with("simple"));

let cmd_grapheme_cluster_break =
SubCommand::with_name("grapheme-cluster-break")
Expand Down
3 changes: 2 additions & 1 deletion src/case_folding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> {
}
wtr.codepoint_to_codepoint(args.name(), &equiv)?;
} else if args.is_present("all-pairs") {
wtr.multi_codepoint_to_codepoint(args.name(), &table_all)?;
let flat = args.is_present("flat-table");
wtr.multi_codepoint_to_codepoint(args.name(), &table_all, flat)?;
} else {
wtr.codepoint_to_codepoint(args.name(), &table)?;
}
Expand Down
7 changes: 4 additions & 3 deletions src/case_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,10 @@ pub fn command(args: ArgMatches<'_>) -> Result<()> {
);
}
}
wtr.codepoint_to_codepoints("LOWER", &lower_map)?;
wtr.codepoint_to_codepoints("UPPER", &upper_map)?;
wtr.codepoint_to_codepoints("TITLE", &upper_map)?;
let flat = args.is_present("flat-table");
wtr.codepoint_to_codepoints("LOWER", &lower_map, flat)?;
wtr.codepoint_to_codepoints("UPPER", &upper_map, flat)?;
wtr.codepoint_to_codepoints("TITLE", &upper_map, flat)?;
}
Ok(())
}
60 changes: 52 additions & 8 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@ impl Writer {
&mut self,
name: &str,
map: &BTreeMap<u32, BTreeSet<u32>>,
emit_flat_table: bool,
) -> Result<()> {
if self.opts.fst_dir.is_some() {
return err!("cannot emit codepoint multimaps as an FST");
Expand All @@ -604,7 +605,7 @@ impl Writer {
let vs2 = vs.iter().cloned().collect();
map2.insert(k, vs2);
}
self.codepoint_to_codepoints(name, &map2)
self.codepoint_to_codepoints(name, &map2, emit_flat_table)
}

/// Write a map that associates codepoints with a sequence of other
Expand All @@ -615,6 +616,7 @@ impl Writer {
&mut self,
name: &str,
map: &BTreeMap<u32, Vec<u32>>,
emit_flat_table: bool,
) -> Result<()> {
if self.opts.fst_dir.is_some() {
return err!("cannot emit codepoint->codepoints map as an FST");
Expand All @@ -625,27 +627,63 @@ impl Writer {

let name = rust_const_name(name);
let ty = self.rust_codepoint_type();
writeln!(
self.wtr,
"pub const {}: &'static [({}, &'static [{}])] = &[",
name, ty, ty
)?;
if !emit_flat_table {
writeln!(
self.wtr,
"pub const {}: &'static [({}, &'static [{}])] = &[",
name, ty, ty
)?;
} else {
writeln!(
self.wtr,
"pub const {}: &'static [({}, [{}; 3])] = &[",
name, ty, ty
)?;
}
'LOOP: for (&k, vs) in map {
// Make sure both our keys and values can be represented in the
// user's chosen codepoint format.
let kstr = match self.rust_codepoint(k) {
None => continue 'LOOP,
Some(k) => k,
};

let (padded_vs, slice_prefix) = if emit_flat_table {
// These checks are for future-proofing and cannot be hit currently.
if vs.len() > 3 {
return err!(
"flat-table representation cannot be used when value \
arrays may contain more than 3 entries"
);
}
let flat_padding =
if self.opts.char_literals { 0 } else { !0 };
if vs.contains(&flat_padding) {
return err!(
"flat-table --chars representation cannot be used when \
the NUL character is present in the value array. (This \
error probably can be fixed by removing `--chars`)"
);
}
let res = vs
.iter()
.copied()
.chain(std::iter::repeat(flat_padding))
.take(3)
.collect::<Vec<_>>();
(res, "")
} else {
(vs.clone(), "&")
};
let mut vstrs = vec![];
for &v in vs {
for v in padded_vs {
match self.rust_codepoint(v) {
None => continue 'LOOP,
Some(v) => vstrs.push(v),
}
}

self.wtr.write_str(&format!("({}, &[", kstr))?;
self.wtr.write_str(&format!("({}, {}[", kstr, slice_prefix))?;
if vstrs.len() == 1 {
self.wtr.write_str(&format!("{}", &vstrs[0]))?;
} else {
Expand Down Expand Up @@ -1168,6 +1206,12 @@ impl Writer {
fn rust_codepoint(&self, cp: u32) -> Option<String> {
if self.opts.char_literals {
char::from_u32(cp).map(|c| format!("{:?}", c))
} else if cp == !0 {
// Used to represent missing entries in some cases (specifically
// --flat-table), and writing it as `!0` makes the whole table much
// easier to read while maintaining identical semantics, even if
// `--flat-table` is not in use.
Some("!0".to_string())
} else {
Some(cp.to_string())
}
Expand Down