Skip to content

Commit

Permalink
Serialize/load trie data as JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
Hopding committed Jan 15, 2018
1 parent 3f3a2bb commit 13160bd
Show file tree
Hide file tree
Showing 4 changed files with 578 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
data.json
data.trie
trie.json
.DS_Store
node_modules/
*.js
18 changes: 11 additions & 7 deletions generate.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ UnicodeTrieBuilder = require 'unicode-trie/builder'

# Base-2 logarithm: the native Math.log2 when the runtime provides one,
# otherwise computed from the natural logarithm.
log2 = Math.log2 or (value) -> Math.log(value) / Math.LN2

# Returns log2(n) + 1 truncated to a 32-bit integer by `| 0`; for a positive
# integer n this is the number of bits in its binary representation.
bits = (n) ->
  exponent = log2 n
  (exponent + 1) | 0

Expand All @@ -18,7 +18,7 @@ scriptCount = 0
eawCount = 0

for codePoint in codePoints when codePoint?
categories[codePoint.category] ?= categoryCount++
categories[codePoint.category] ?= categoryCount++
combiningClasses[codePoint.combiningClassName] ?= combiningClassCount++
scripts[codePoint.script] ?= scriptCount++
eaws[codePoint.eastAsianWidth] ?= eawCount++
Expand Down Expand Up @@ -58,27 +58,31 @@ numericValue = (numeric) ->
while (mant % 60) is 0
mant /= 60
++exp

((mant + 0xbf) << 2) + (exp - 1)
else
0

# Build the trie: for every defined code point, pack its property indices
# into one integer and store it under the code point's numeric value.
trie = new UnicodeTrieBuilder
for codePoint in codePoints when codePoint?
  category = categories[codePoint.category]
  # A property missing from its lookup table falls back to index 0.
  combiningClass = combiningClasses[codePoint.combiningClassName] or 0
  script = scripts[codePoint.script] or 0
  eaw = eaws[codePoint.eastAsianWidth] or 0

  # Each index is shifted left by its *Shift constant (defined elsewhere in
  # this file); the encoded numeric value is OR-ed in unshifted.
  val = (category << categoryShift) |
        (combiningClass << combiningShift) |
        (script << scriptShift) |
        (eaw << eawShift) |
        numericValue(codePoint.numeric)

  trie.set codePoint.code, val

# Write the trie's buffer form to disk unchanged.
fs.writeFileSync 'data.trie', trie.toBuffer()

# The trie is also written as JSON. This is a less compact encoding, but a
# JSON file can be loaded with a plain `require`, which allows
# unicode-properties to work in the browser.
fs.writeFileSync 'trie.json', JSON.stringify trie.toBuffer()

fs.writeFileSync 'data.json', JSON.stringify
categories: Object.keys(categories)
combiningClasses: Object.keys(combiningClasses)
Expand Down
46 changes: 25 additions & 21 deletions index.coffee
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
UnicodeTrie = require 'unicode-trie'
data = require './data.json'
fs = require 'fs'
# The trie is loaded from trie.json with a plain `require` instead of being
# read from data.trie through `fs`, so this module can also run in a browser.
# trie.json holds a Buffer serialized in Node; its `data` array is wrapped in
# a Uint8Array before being handed to UnicodeTrie.
trieBuffer = require './trie.json'
trieData = new Uint8Array trieBuffer.data
trie = new UnicodeTrie trieData

# Base-2 logarithm. Prefers the built-in Math.log2 and falls back to
# ln(x) / ln(2) on runtimes that lack it.
log2 = Math.log2 or (x) -> Math.log(x) / Math.LN2

# Returns log2(n) + 1 truncated to a 32-bit integer by `| 0`; for a positive
# integer n this is the number of bits in its binary representation.
bits = (n) ->
  magnitude = log2(n) + 1
  magnitude | 0

Expand All @@ -31,23 +35,23 @@ NUMBER_MASK = (1 << NUMBER_BITS) - 1
# Reads the packed trie value for codePoint, extracts the category field
# with CATEGORY_SHIFT / CATEGORY_MASK, and returns the matching entry of
# data.categories.
exports.getCategory = (codePoint) ->
  packed = trie.get codePoint
  index = (packed >> CATEGORY_SHIFT) & CATEGORY_MASK
  data.categories[index]

# Reads the packed trie value for codePoint, extracts the combining-class
# field with COMBINING_SHIFT / COMBINING_MASK, and returns the matching entry
# of data.combiningClasses.
exports.getCombiningClass = (codePoint) ->
  packed = trie.get codePoint
  index = (packed >> COMBINING_SHIFT) & COMBINING_MASK
  data.combiningClasses[index]

# Reads the packed trie value for codePoint, extracts the script field with
# SCRIPT_SHIFT / SCRIPT_MASK, and returns the matching entry of data.scripts.
exports.getScript = (codePoint) ->
  packed = trie.get codePoint
  index = (packed >> SCRIPT_SHIFT) & SCRIPT_MASK
  data.scripts[index]

# Reads the packed trie value for codePoint, extracts the East Asian width
# field with EAW_SHIFT / EAW_MASK, and returns the matching entry of data.eaw.
exports.getEastAsianWidth = (codePoint) ->
  packed = trie.get codePoint
  index = (packed >> EAW_SHIFT) & EAW_MASK
  data.eaw[index]

exports.getNumericValue = (codePoint) ->
val = trie.get codePoint
num = val & NUMBER_MASK

if num is 0
return null
else if num <= 50
Expand All @@ -61,46 +65,46 @@ exports.getNumericValue = (codePoint) ->
# base 10
val = (num >> 5) - 14
exp = (num & 0x1f) + 2

while exp > 0
val *= 10
exp--

return val
else
# base 60
val = (num >> 2) - 0xbf
exp = (num & 3) + 1

while exp > 0
val *= 60
exp--

return val

# True when the code point's general category is one of
# Lu, Ll, Lt, Lm, Lo or Nl.
exports.isAlphabetic = (codePoint) ->
  category = exports.getCategory codePoint
  category in ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl']

# True when the code point's general category is Nd.
exports.isDigit = (codePoint) ->
  category = exports.getCategory codePoint
  category is 'Nd'

# True when the code point's general category is one of
# Pc, Pd, Pe, Pf, Pi, Po or Ps.
exports.isPunctuation = (codePoint) ->
  category = exports.getCategory codePoint
  category in ['Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps']

# True when the code point's general category is Ll.
exports.isLowerCase = (codePoint) ->
  category = exports.getCategory codePoint
  category is 'Ll'

# True when the code point's general category is Lu.
exports.isUpperCase = (codePoint) ->
  category = exports.getCategory codePoint
  category is 'Lu'

# True when the code point's general category is Lt.
exports.isTitleCase = (codePoint) ->
  category = exports.getCategory codePoint
  category is 'Lt'

# True when the code point's general category is one of Zs, Zl or Zp.
exports.isWhiteSpace = (codePoint) ->
  category = exports.getCategory codePoint
  category in ['Zs', 'Zl', 'Zp']

# True when the code point's general category is one of
# Nd, No, Nl, Lu, Ll, Lt, Lm, Lo, Me or Mc.
exports.isBaseForm = (codePoint) ->
  category = exports.getCategory codePoint
  category in ['Nd', 'No', 'Nl', 'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Me', 'Mc']

# True when the code point's general category is one of Mn, Me or Mc.
exports.isMark = (codePoint) ->
  category = exports.getCategory codePoint
  category in ['Mn', 'Me', 'Mc']
Loading

0 comments on commit 13160bd

Please sign in to comment.