Skip to content

Commit

Permalink
new: Add CLDR and locale based shortcodes.
Browse files Browse the repository at this point in the history
  • Loading branch information
milesj committed Aug 12, 2020
1 parent 6ca4938 commit ff41c68
Show file tree
Hide file tree
Showing 39 changed files with 214,458 additions and 91 deletions.
6,692 changes: 6,692 additions & 0 deletions packages/data/da/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/de/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/en-gb/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/en/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/es-mx/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/es/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/et/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/fi/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/fr/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/hu/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/it/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ja/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ja/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ko/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ko/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/lt/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ms/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/nb/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/nl/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/pl/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/pt/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ru/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/ru/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/sv/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/th/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/th/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/uk/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/uk/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/zh-hant/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/zh-hant/shortcodes/cldr.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/zh/shortcodes/cldr-native.json

Large diffs are not rendered by default.

6,692 changes: 6,692 additions & 0 deletions packages/data/zh/shortcodes/cldr.json

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion packages/generator/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,11 @@
"chalk": "^4.1.0",
"cheerio": "^1.0.0-rc.3",
"fs-extra": "^9.0.1",
"kuroshiro": "^1.1.2",
"kuroshiro-analyzer-kuromoji": "^1.1.0",
"node-fetch": "^2.6.0",
"regexgen": "^1.3.0"
"regexgen": "^1.3.0",
"transliteration": "^2.1.11"
},
"devDependencies": {
"emojibase": "^4.1.1"
Expand Down
5 changes: 0 additions & 5 deletions packages/generator/src/generators/generateData.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,6 @@ function createEmoji(
skin.annotation =
(annotations[stripHexcode(skin.hexcode)] || {}).annotation || emoji.annotation || '';

// TODO
// skin.shortcodes = (emoji.shortcodes || []).map(
// (code) => `${code}_tone${Array.isArray(skin.tone) ? skin.tone.join('-') : skin.tone}`,
// );

// Remove any tags
delete skin.tags;

Expand Down
220 changes: 136 additions & 84 deletions packages/generator/src/generators/generateShortcodes.ts
Original file line number Diff line number Diff line change
@@ -1,93 +1,145 @@
import path from 'path';
import { Emoji, TEXT } from 'emojibase';
import writeFile from '../helpers/writeFile';

/**
 * Header comment that is embedded verbatim at the top of the generated
 * shortcodes resource. It documents the naming rules contributors must follow
 * and the policy for adding, renaming, or removing shortcodes.
 */
const GUIDELINES = `/**
* Official Emojibase shortcodes list.
*
* NAMING GUIDELINES
*
* - Gender neutral emoji must be prefixed with "person_",
* while female emoji use "woman_", and male "man_".
* Plural forms use "people_", "women_", and "men_".
* In rare occasions, the gender can be suffixed,
* like "bald_man" or "blonde_woman".
*
* - Animals depicted from the side use the animal name,
* while animals depicted with a head, or a face,
* must use the animal name suffixed with "_face".
*
* - Japanese specific emoji must be prefixed with "ja_".
*
* - Specifiers, like color or size, must be used as a
* prefix. For example, "small_", or "red_".
*
* - Use a more descriptive term over the annotation if
* applicable. For example, "storm" over the annotation
* "cloud with lightning and rain".
*
* - Use emotions when describing smiley faces. For example,
* "happy" over the annotation "smiling face with open
* mouth & smiling eyes".
* https://www.dailywritingtips.com/100-words-for-facial-expressions/
*
* ADDING SHORTCODES
*
* Please submit a PR with the addition so that it
* may be discussed.
*
* RENAMING/REMOVING SHORTCODES
*
* Shortcodes are meant to be permanent, and should never
* change (excluding typos), as to not destroy historical
* usage of the shortcode. If a more descriptive term
* is wanted, or the Unicode standard has changed meaning
* or naming, we should persist the original shortcode.
* We can do this by shifting the old shortcode to the end
* of the array, while placing the new shortcode at the
* beginning. This allows for backwards compatible changes.
*/`;

export default async function generateShortcodes(): Promise<string> {
// eslint-disable-next-line
const data: Required<Emoji>[] = require(path.join(process.cwd(), 'packages/data/en/data.json'));
const output: string[] = [
'/* eslint-disable sort-keys */',
'',
GUIDELINES,
'',
'export default {',
];
let lastVersion = 0;

// Sort by version -> order
data.sort((a, b) => (a.version === b.version ? a.order - b.order : a.version - b.version));

// Add each emoji to the list
data.forEach((emoji) => {
if (emoji.version !== lastVersion) {
if (lastVersion !== 0) {
output.push('');
}
/* eslint-disable @typescript-eslint/no-unsafe-assignment, unicorn/better-regex */

import { SUPPORTED_LOCALES } from 'emojibase';
import Kuroshiro from 'kuroshiro';
import KuromojiAnalyzer from 'kuroshiro-analyzer-kuromoji';
import { transliterate } from 'transliteration';
import buildEmojiData from '../builders/buildEmojiData';
import buildAnnotationData from '../builders/buildAnnotationData';
import writeDataset from '../helpers/writeDataset';
import filterData from '../helpers/filterData';
import log from '../helpers/log';
import { ShortcodeMap, EmojiModification } from '../types';

/**
 * Hand-picked replacements for generated slugs, keyed by the slug that
 * `slugify` would otherwise return.
 */
const CUSTOM_SHORTCODES: Record<string, string> = {
  e_mail: 'email',
  t_rex: 'trex',
};

output.push(` // VERSION ${emoji.version}`);
/**
 * Locales whose annotations are already written in Latin script.
 *
 * The remaining supported locales (ja, ko, ru, th, uk, zh, zh-hant) are
 * non-Latin and additionally receive a native-script shortcode dataset.
 */
const LATIN_LOCALES = new Set<string>([
  'da', 'de', 'en', 'en-gb', 'es', 'es-mx',
  'et', 'fi', 'fr', 'hu', 'it', 'lt',
  'ms', 'nb', 'nl', 'pl', 'pt', 'sv',
]);

lastVersion = emoji.version;
// Shared Kuroshiro instance used by `slugify` to romanize Japanese text.
// `generateShortcodes` initializes it with a Kuromoji analyzer before any slug is generated.
const kuroshiro = new Kuroshiro();

/**
 * Convert an annotation into a shortcode slug.
 *
 * The value is trimmed, optionally converted to Latin characters, lowercased,
 * and normalized: runs of whitespace, hyphens, backticks, and slashes collapse
 * into a single underscore, punctuation is stripped, and leading/trailing
 * underscores are dropped. If the result has an entry in `CUSTOM_SHORTCODES`,
 * that override is returned instead.
 *
 * @param value - Raw annotation text.
 * @param locale - Locale of the annotation. `ja` is romanized through Kuroshiro;
 *   every other locale goes through `transliterate`.
 * @param transform - When true, convert the text to Latin characters first.
 * @returns The slug, or its custom override when one is defined.
 */
async function slugify(value: string, locale: string, transform: boolean = false): Promise<string> {
  let slug = value.trim();

  if (transform) {
    // Japanese: https://github.com/dzcpy/transliteration/issues/226
    if (locale === 'ja') {
      slug = await kuroshiro.convert(slug, {
        mode: 'spaced',
        romajiSystem: 'passport',
        to: 'romaji',
      });
    } else {
      slug = transliterate(slug);
    }
  }

  slug = slug
    // Locale-independent lowercasing, so the generated data does not vary
    // with the default locale of the machine running the generator
    .toLowerCase()
    // Collapse runs of separators into a single underscore
    .replace(/(\s|-|`|\/|\\)+/g, '_')
    // Remove special chars
    .replace(/([!"&'()+,.:;<>?ʼ])/g, '')
    // Remove multiple underscores
    .replace(/_{2,}/g, '_')
    // Remove leading underscores
    .replace(/^_+/, '')
    // Remove trailing underscores
    .replace(/_+$/, '');

  // Only honor own keys: a plain index lookup would also find inherited
  // members (e.g. "constructor") and return a non-string value.
  const custom = Object.prototype.hasOwnProperty.call(CUSTOM_SHORTCODES, slug)
    ? CUSTOM_SHORTCODES[slug]
    : undefined;

  return custom || slug;
}

/**
 * Derive the shortcode of a skin tone modification from its base shortcode.
 *
 * The tone is appended after an underscore; when the modification carries
 * several tones they are joined with a hyphen.
 */
function appendToneIndex(shortcode: string, mod: EmojiModification): string {
  const { tone } = mod;
  const suffix = Array.isArray(tone) ? tone.join('-') : tone;

  return `${shortcode}_${suffix}`;
}

/**
 * Generate CLDR-based shortcode datasets for every locale in `SUPPORTED_LOCALES`.
 *
 * For each locale, the annotation of every emoji is slugified into a single
 * shortcode and written to `<locale>/shortcodes/cldr.json`. Locales that are
 * not in `LATIN_LOCALES` additionally get `<locale>/shortcodes/cldr-native.json`,
 * built from the same annotations without transliteration. Skin tone
 * modifications reuse the parent emoji's shortcodes with the tone appended.
 */
export default async function generateShortcodes(): Promise<void> {
log.title('data', 'Generating shortcode datasets');

const data = await buildEmojiData();
const emojis = Object.values(filterData(data));

const unicode = emoji.type === TEXT ? emoji.text : emoji.emoji;
const shortcodes = emoji.shortcodes.map((sc) => `'${sc}'`);
// Setup transliterations
// The shared Kuroshiro instance must be initialized before `slugify` can romanize Japanese.
await kuroshiro.init(new KuromojiAnalyzer()); // Japanese

output.push(` // ${unicode} ${emoji.annotation || emoji.name}`);
output.push(` '${emoji.hexcode}': [${shortcodes.join(', ')}],`);
});
// Generate CLDR shortcodes for each locale
// Locales are processed concurrently; each callback writes its own dataset files.
await Promise.all(
SUPPORTED_LOCALES.map(async (locale: string) => {
const isLatinChars = LATIN_LOCALES.has(locale);
const annotations = await buildAnnotationData(locale);
const cldr: ShortcodeMap = {};
const cldrNonLatin: ShortcodeMap = {};
// Set once at least one shortcode has been produced, so empty datasets are never written.
let hasLatin = false;
let hasNonLatin = false;

output.push('};');
// NOTE(review): `emojis` comes from Object.values(), i.e. a plain array, so `for await`
// looks unnecessary here; a regular `for...of` with the inner awaits should behave the same. Confirm.
// eslint-disable-next-line no-restricted-syntax
for await (const emoji of emojis) {
const row = annotations[emoji.hexcode];

// No annotation for this emoji in this locale, so there is nothing to derive a shortcode from.
if (!row || !row.annotation) {
// eslint-disable-next-line no-continue
continue;
}

// Transliterated (Latin character) shortcode, generated for every locale.
cldr[emoji.hexcode] = [await slugify(row.annotation, locale, true)];
hasLatin = true;

// Native-script shortcode, only for locales outside of LATIN_LOCALES.
if (!isLatinChars) {
cldrNonLatin[emoji.hexcode] = [await slugify(row.annotation, locale)];
hasNonLatin = true;
}

// Skin tones
if (emoji.modifications) {
// eslint-disable-next-line no-loop-func
Object.values(emoji.modifications).forEach((mod) => {
if (hasLatin) {
cldr[mod.hexcode] = cldr[emoji.hexcode].map((code) => appendToneIndex(code, mod));
}

if (hasNonLatin) {
cldrNonLatin[mod.hexcode] = cldrNonLatin[emoji.hexcode].map((code) =>
appendToneIndex(code, mod),
);
}
});
}
}

// Collect the dataset writes for this locale so they can be awaited together.
const promises: Promise<unknown>[] = [];

if (hasLatin) {
promises.push(writeDataset(`${locale}/shortcodes/cldr.json`, cldr));
}

if (hasNonLatin) {
promises.push(writeDataset(`${locale}/shortcodes/cldr-native.json`, cldrNonLatin));
}

// Write it!
return writeFile(
path.join(process.cwd(), 'packages/generator/src/resources'),
'shortcodes.ts',
output.join('\n'),
return Promise.all(promises);
}),
);
}
93 changes: 93 additions & 0 deletions packages/generator/src/generators/generateShortcodesOLD.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import path from 'path';
import { Emoji, TEXT } from 'emojibase';
import writeFile from '../helpers/writeFile';

/**
 * Header comment that `generateShortcodes` embeds verbatim at the top of the
 * generated `shortcodes.ts` resource. It documents the naming rules
 * contributors must follow and the policy for adding, renaming, or removing
 * shortcodes.
 */
const GUIDELINES = `/**
* Official Emojibase shortcodes list.
*
* NAMING GUIDELINES
*
* - Gender neutral emoji must be prefixed with "person_",
* while female emoji use "woman_", and male "man_".
* Plural forms use "people_", "women_", and "men_".
* In rare occasions, the gender can be suffixed,
* like "bald_man" or "blonde_woman".
*
* - Animals depicted from the side use the animal name,
* while animals depicted with a head, or a face,
* must use the animal name suffixed with "_face".
*
* - Japanese specific emoji must be prefixed with "ja_".
*
* - Specifiers, like color or size, must be used as a
* prefix. For example, "small_", or "red_".
*
* - Use a more descriptive term over the annotation if
* applicable. For example, "storm" over the annotation
* "cloud with lightning and rain".
*
* - Use emotions when describing smiley faces. For example,
* "happy" over the annotation "smiling face with open
* mouth & smiling eyes".
* https://www.dailywritingtips.com/100-words-for-facial-expressions/
*
* ADDING SHORTCODES
*
* Please submit a PR with the addition so that it
* may be discussed.
*
* RENAMING/REMOVING SHORTCODES
*
* Shortcodes are meant to be permanent, and should never
* change (excluding typos), as to not destroy historical
* usage of the shortcode. If a more descriptive term
* is wanted, or the Unicode standard has changed meaning
* or naming, we should persist the original shortcode.
* We can do this by shifting the old shortcode to the end
* of the array, while placing the new shortcode at the
* beginning. This allows for backwards compatible changes.
*/`;

/**
 * Regenerate `packages/generator/src/resources/shortcodes.ts` from the English dataset.
 *
 * Emojis are listed by version and then by order, with a `VERSION` marker
 * (preceded by a blank line, except for the first) whenever the version
 * changes. Each emoji contributes a comment line with its character and
 * annotation (or name), followed by a `hexcode: [shortcodes]` entry.
 *
 * @returns Whatever `writeFile` resolves with for the written resource.
 */
export default async function generateShortcodes(): Promise<string> {
  const datasetPath = path.join(process.cwd(), 'packages/data/en/data.json');
  // eslint-disable-next-line
  const emojis: Required<Emoji>[] = require(datasetPath);

  // Version first, order second. This sorts the loaded array in place.
  emojis.sort((left, right) => {
    if (left.version === right.version) {
      return left.order - right.order;
    }

    return left.version - right.version;
  });

  const lines: string[] = ['/* eslint-disable sort-keys */', '', GUIDELINES, '', 'export default {'];
  let previousVersion = 0;

  // eslint-disable-next-line no-restricted-syntax
  for (const emoji of emojis) {
    // Start a new version section, separated from the previous one by a blank line
    if (emoji.version !== previousVersion) {
      if (previousVersion !== 0) {
        lines.push('');
      }

      lines.push(` // VERSION ${emoji.version}`);
      previousVersion = emoji.version;
    }

    const glyph = emoji.type === TEXT ? emoji.text : emoji.emoji;
    const quotedCodes = emoji.shortcodes.map((code) => `'${code}'`).join(', ');

    lines.push(
      ` // ${glyph} ${emoji.annotation || emoji.name}`,
      ` '${emoji.hexcode}': [${quotedCodes}],`,
    );
  }

  lines.push('};');

  const resourcesDir = path.join(process.cwd(), 'packages/generator/src/resources');

  return writeFile(resourcesDir, 'shortcodes.ts', lines.join('\n'));
}
2 changes: 1 addition & 1 deletion packages/regex/tests/regex.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import CODEPOINT_EMOJI_PATTERN from '../codepoint/emoji';
import CODEPOINT_EMOJI_LOOSE_PATTERN from '../codepoint/emoji-loose';
import CODEPOINT_TEXT_PATTERN from '../codepoint/text';
import CODEPOINT_TEXT_LOOSE_PATTERN from '../codepoint/text-loose';
import SHORTCODE_PATTERN from '../shortcode';
// import SHORTCODE_PATTERN from '../shortcode';
import EMOTICON_PATTERN from '../emoticon';

const PATTERNS = {
Expand Down
15 changes: 15 additions & 0 deletions types/kuroshiro.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
 * Minimal typings for the `kuroshiro` Japanese conversion library, covering
 * only the surface used by the generator.
 */
declare module 'kuroshiro' {
  class Kuroshiro {
    constructor();

    /** Initialize the instance with an analyzer; must complete before `convert` is called. */
    init(analyzer: unknown): Promise<void>;

    /**
     * Convert Japanese text to the target syllabary.
     *
     * `options` may be omitted entirely, in which case the library falls back
     * to its own defaults for every setting.
     */
    convert(
      value: string,
      options?: {
        /** Output layout of the converted text. */
        mode?: 'normal' | 'spaced' | 'okurigana' | 'furigana';
        /** Target syllabary. */
        to?: 'hiragana' | 'katakana' | 'romaji';
        /** Romanization system, used when `to` is `romaji`. */
        romajiSystem?: 'nippon' | 'passport' | 'hepburn';
        /** Opening delimiter wrapped around readings in okurigana/furigana modes. */
        delimiter_start?: string;
        /** Closing delimiter wrapped around readings in okurigana/furigana modes. */
        delimiter_end?: string;
      },
    ): Promise<string>;
  }

  export default Kuroshiro;
}

// Shorthand ambient declaration: no typings are provided here, so everything
// imported from this module is typed as `any`.
declare module 'kuroshiro-analyzer-kuromoji';
Loading

0 comments on commit ff41c68

Please sign in to comment.