-
-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathgeneral_category.rs
136 lines (123 loc) · 4.72 KB
/
general_category.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
use std::collections::{BTreeMap, BTreeSet};
use ucd_parse::{self, UnicodeData, UnicodeDataExpander};
use crate::args::ArgMatches;
use crate::error::Result;
use crate::util::{print_property_values, PropertyValues};
pub fn command(args: ArgMatches<'_>) -> Result<()> {
let dir = args.ucd_dir()?;
let propvals = PropertyValues::from_ucd_dir(&dir)?;
let filter = args.filter(|name| propvals.canonical("gc", name))?;
let unexpanded = ucd_parse::parse(&dir)?;
// If we were tasked with listing the available categories, then do that
// and quit.
if args.is_present("list-categories") {
return print_property_values(&propvals, "General_Category");
}
let mut bycat = expand_into_categories(unexpanded, &propvals)?;
// As another special case, collect all "related" groups of categories.
// But don't do this when printing an enumeration, because in an
// enumeration each codepoint should belong to exactly one category, which
// is not true if we include related categories.
if !args.is_present("enum") && !args.is_present("rust-enum") {
for (name, set) in related(&propvals, &bycat) {
if filter.contains(&name) {
bycat.insert(name, set);
}
}
}
// Finally, filter out any sets according to what the user asked for.
let bycat = bycat
.into_iter()
.filter(|&(ref name, _)| filter.contains(name))
.collect();
let mut wtr = args.writer("general_category")?;
if args.is_present("enum") {
wtr.ranges_to_enum(args.name(), &bycat)?;
} else if args.is_present("rust-enum") {
let variants = bycat.keys().map(String::as_str).collect::<Vec<_>>();
wtr.ranges_to_rust_enum(args.name(), &variants, &bycat)?;
} else {
wtr.names(bycat.keys().filter(|n| filter.contains(n)))?;
for (name, set) in bycat {
wtr.ranges(&name, &set)?;
}
}
Ok(())
}
/// Expand a list of UnicodeData rows and group by category.
///
/// Each expanded row's codepoint is placed in the set keyed by the canonical
/// name of the row's general category, as reported by
/// `propvals.canonical("gc", ..)`.
///
/// As a special case, the returned map also carries an entry under the
/// canonical name of `"unassigned"`. It holds every value in `0..=0x10FFFF`
/// that did not appear as the codepoint of any expanded row, and replaces
/// any set previously collected under that name.
///
/// # Errors
///
/// Returns an error if a row's general category, or the name
/// `"unassigned"`, cannot be canonicalized by `propvals`.
pub fn expand_into_categories(
    unexpanded: Vec<UnicodeData>,
    propvals: &PropertyValues,
) -> Result<BTreeMap<String, BTreeSet<u32>>> {
    // Collect each general category into an ordered set. Rows are consumed
    // straight from the expander rather than first being buffered into one
    // big list.
    let mut bycat: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
    let mut assigned: BTreeSet<u32> = BTreeSet::new();
    for row in UnicodeDataExpander::new(unexpanded) {
        let cp = row.codepoint.value();
        assigned.insert(cp);
        let gc = propvals.canonical("gc", &row.general_category)?.to_string();
        bycat.entry(gc).or_default().insert(cp);
    }

    // As a special case, collect all unassigned codepoints. The set is built
    // in one pass and inserted once, instead of looking the entry up again
    // for every codepoint.
    let unassigned_name = propvals.canonical("gc", "unassigned")?.to_string();
    let unassigned: BTreeSet<u32> =
        (0..=0x10FFFF).filter(|cp| !assigned.contains(cp)).collect();
    bycat.insert(unassigned_name, unassigned);
    Ok(bycat)
}
/// Related returns a set of sets of codepoints corresponding to the "related"
/// groups of categories defined by Table 12 in UAX#44 S5.7.1.
///
/// The given `cats` should correspond to the normal set of general categories,
/// keyed by canonical name.
///
/// Every group gets an entry in the returned map, holding the union of its
/// component categories' codepoints. A component category with no entry in
/// `cats` contributes nothing to the union, rather than causing a panic.
fn related(
    propvals: &PropertyValues,
    cats: &BTreeMap<String, BTreeSet<u32>>,
) -> BTreeMap<String, BTreeSet<u32>> {
    let mut sets: BTreeMap<String, BTreeSet<u32>> = BTreeMap::new();
    for (name, components) in related_categories(propvals) {
        let set = sets.entry(name).or_default();
        // A component without any codepoints is simply absent from `cats`;
        // for a union that is equivalent to an empty set.
        for members in components.iter().filter_map(|c| cats.get(c)) {
            set.extend(members.iter().cloned());
        }
    }
    sets
}
/// Return all groups of "related" general categories.
///
/// Each element pairs the canonical name of a group with the canonical names
/// of the categories that make it up. Names are canonicalized through
/// `propvals.canonical("gc", ..)`; a name that fails to canonicalize causes a
/// panic.
fn related_categories(
    propvals: &PropertyValues,
) -> Vec<(String, Vec<String>)> {
    // Group name followed by the abbreviations of its member categories.
    const GROUPS: &[(&str, &[&str])] = &[
        ("Cased_Letter", &["lu", "ll", "lt"]),
        ("Letter", &["lu", "ll", "lt", "lm", "lo"]),
        ("Mark", &["mn", "mc", "me"]),
        ("Number", &["nd", "nl", "no"]),
        ("Punctuation", &["pc", "pd", "ps", "pe", "pi", "pf", "po"]),
        ("Symbol", &["sm", "sc", "sk", "so"]),
        ("Separator", &["zs", "zl", "zp"]),
        ("Other", &["cc", "cf", "cs", "co", "cn"]),
    ];

    // canonicalize a gencat property value
    let canon = |raw: &str| -> String {
        propvals.canonical("gc", raw).unwrap().to_string()
    };

    let mut groups = Vec::with_capacity(GROUPS.len());
    for &(group, members) in GROUPS {
        let group_name = canon(group);
        let member_names: Vec<String> =
            members.iter().map(|&member| canon(member)).collect();
        groups.push((group_name, member_names));
    }
    groups
}