new: Add CLDR and locale based shortcodes.

milesj · Aug 12, 2020 · ff41c68 · ff41c68
1 parent 6ca4938
commit ff41c68
Show file tree

Hide file tree

Showing 39 changed files with 214,458 additions and 91 deletions.
diff --git a/packages/data/da/shortcodes/cldr.json b/packages/data/da/shortcodes/cldr.json
diff --git a/packages/data/de/shortcodes/cldr.json b/packages/data/de/shortcodes/cldr.json
diff --git a/packages/data/en-gb/shortcodes/cldr.json b/packages/data/en-gb/shortcodes/cldr.json
diff --git a/packages/data/en/shortcodes/cldr.json b/packages/data/en/shortcodes/cldr.json
diff --git a/packages/data/es-mx/shortcodes/cldr.json b/packages/data/es-mx/shortcodes/cldr.json
diff --git a/packages/data/es/shortcodes/cldr.json b/packages/data/es/shortcodes/cldr.json
diff --git a/packages/data/et/shortcodes/cldr.json b/packages/data/et/shortcodes/cldr.json
diff --git a/packages/data/fi/shortcodes/cldr.json b/packages/data/fi/shortcodes/cldr.json
diff --git a/packages/data/fr/shortcodes/cldr.json b/packages/data/fr/shortcodes/cldr.json
diff --git a/packages/data/hu/shortcodes/cldr.json b/packages/data/hu/shortcodes/cldr.json
diff --git a/packages/data/it/shortcodes/cldr.json b/packages/data/it/shortcodes/cldr.json
diff --git a/packages/data/ja/shortcodes/cldr-native.json b/packages/data/ja/shortcodes/cldr-native.json
diff --git a/packages/data/ja/shortcodes/cldr.json b/packages/data/ja/shortcodes/cldr.json
diff --git a/packages/data/ko/shortcodes/cldr-native.json b/packages/data/ko/shortcodes/cldr-native.json
diff --git a/packages/data/ko/shortcodes/cldr.json b/packages/data/ko/shortcodes/cldr.json
diff --git a/packages/data/lt/shortcodes/cldr.json b/packages/data/lt/shortcodes/cldr.json
diff --git a/packages/data/ms/shortcodes/cldr.json b/packages/data/ms/shortcodes/cldr.json
diff --git a/packages/data/nb/shortcodes/cldr.json b/packages/data/nb/shortcodes/cldr.json
diff --git a/packages/data/nl/shortcodes/cldr.json b/packages/data/nl/shortcodes/cldr.json
diff --git a/packages/data/pl/shortcodes/cldr.json b/packages/data/pl/shortcodes/cldr.json
diff --git a/packages/data/pt/shortcodes/cldr.json b/packages/data/pt/shortcodes/cldr.json
diff --git a/packages/data/ru/shortcodes/cldr-native.json b/packages/data/ru/shortcodes/cldr-native.json
diff --git a/packages/data/ru/shortcodes/cldr.json b/packages/data/ru/shortcodes/cldr.json
diff --git a/packages/data/sv/shortcodes/cldr.json b/packages/data/sv/shortcodes/cldr.json
diff --git a/packages/data/th/shortcodes/cldr-native.json b/packages/data/th/shortcodes/cldr-native.json
diff --git a/packages/data/th/shortcodes/cldr.json b/packages/data/th/shortcodes/cldr.json
diff --git a/packages/data/uk/shortcodes/cldr-native.json b/packages/data/uk/shortcodes/cldr-native.json
diff --git a/packages/data/uk/shortcodes/cldr.json b/packages/data/uk/shortcodes/cldr.json
diff --git a/packages/data/zh-hant/shortcodes/cldr-native.json b/packages/data/zh-hant/shortcodes/cldr-native.json
diff --git a/packages/data/zh-hant/shortcodes/cldr.json b/packages/data/zh-hant/shortcodes/cldr.json
diff --git a/packages/data/zh/shortcodes/cldr-native.json b/packages/data/zh/shortcodes/cldr-native.json
diff --git a/packages/data/zh/shortcodes/cldr.json b/packages/data/zh/shortcodes/cldr.json
diff --git a/packages/generator/package.json b/packages/generator/package.json
@@ -13,8 +13,11 @@
     "chalk": "^4.1.0",
     "cheerio": "^1.0.0-rc.3",
     "fs-extra": "^9.0.1",
+    "kuroshiro": "^1.1.2",
+    "kuroshiro-analyzer-kuromoji": "^1.1.0",
     "node-fetch": "^2.6.0",
-    "regexgen": "^1.3.0"
+    "regexgen": "^1.3.0",
+    "transliteration": "^2.1.11"
   },
   "devDependencies": {
     "emojibase": "^4.1.1"

diff --git a/packages/generator/src/generators/generateData.ts b/packages/generator/src/generators/generateData.ts
@@ -94,11 +94,6 @@ function createEmoji(
       skin.annotation =
         (annotations[stripHexcode(skin.hexcode)] || {}).annotation || emoji.annotation || '';
 
-      // TODO
-      // skin.shortcodes = (emoji.shortcodes || []).map(
-      //   (code) => `${code}_tone${Array.isArray(skin.tone) ? skin.tone.join('-') : skin.tone}`,
-      // );
-
       // Remove any tags
       delete skin.tags;
 

diff --git a/packages/generator/src/generators/generateShortcodes.ts b/packages/generator/src/generators/generateShortcodes.ts
@@ -1,93 +1,145 @@
-import path from 'path';
-import { Emoji, TEXT } from 'emojibase';
-import writeFile from '../helpers/writeFile';
-
-const GUIDELINES = `/**
- * Official Emojibase shortcodes list.
- *
- * NAMING GUIDELINES
- *
- *  - Gender neutral emoji must be prefixed with "person_",
- *    while female emoji use "woman_", and male "man_".
- *    Plural forms use "people_", "women_", and "men_".
- *    In rare occasions, the gender can be suffixed,
- *    like "bald_man" or "blonde_woman".
- *
- *  - Animals depicted from the side use the animal name,
- *    while animals depicted with a head, or a face,
- *    must use the animal name suffixed with "_face".
- *
- *  - Japenese specific emoji must be prefixed with "ja_".
- *
- *  - Specifiers, like color or size, must be used as a
- *    prefix. For example, "small_", or "red_".
- *
- *  - Use a more descriptive term over the annotation if
- *    applicable. For example, "storm" over the annotation
- *    "cloud with lightning and rain".
- *
- *  - Use emotions when describing smiley faces. For example,
- *    "happy" over the annotation "smiling face with open
- *    mouth & smiling eyes".
- *    https://www.dailywritingtips.com/100-words-for-facial-expressions/
- *
- * ADDING SHORTCODES
- *
- * Please submit a PR with the addition so that it
- * may be discussed.
- *
- * RENAMING/REMOVING SHORTCODES
- *
- * Shortcodes are meant to be permanent, and should never
- * change (excluding typos), as to not destroy historical
- * usage of the shortcode. If a more descriptive term
- * is wanted, or the Unicode standard has changed meaning
- * or naming, we should persist the original shortcode.
- * We can do this by shifting the old shortcode to the end
- * of the array, while placing the new shortcode at the
- * beginning. This allows for backwards compatible changes.
- */`;
-
-export default async function generateShortcodes(): Promise<string> {
-  // eslint-disable-next-line
-  const data: Required<Emoji>[] = require(path.join(process.cwd(), 'packages/data/en/data.json'));
-  const output: string[] = [
-    '/* eslint-disable sort-keys */',
-    '',
-    GUIDELINES,
-    '',
-    'export default {',
-  ];
-  let lastVersion = 0;
-
-  // Sort by version -> order
-  data.sort((a, b) => (a.version === b.version ? a.order - b.order : a.version - b.version));
-
-  // Add each emoji to the list
-  data.forEach((emoji) => {
-    if (emoji.version !== lastVersion) {
-      if (lastVersion !== 0) {
-        output.push('');
-      }
+/* eslint-disable @typescript-eslint/no-unsafe-assignment, unicorn/better-regex */
+
+import { SUPPORTED_LOCALES } from 'emojibase';
+import Kuroshiro from 'kuroshiro';
+import KuromojiAnalyzer from 'kuroshiro-analyzer-kuromoji';
+import { transliterate } from 'transliteration';
+import buildEmojiData from '../builders/buildEmojiData';
+import buildAnnotationData from '../builders/buildAnnotationData';
+import writeDataset from '../helpers/writeDataset';
+import filterData from '../helpers/filterData';
+import log from '../helpers/log';
+import { ShortcodeMap, EmojiModification } from '../types';
+
+// Manual overrides applied at the very end of `slugify`: when the generated slug equals
+// a key below, the mapped value is returned in its place.
+const CUSTOM_SHORTCODES: { [key: string]: string } = {
+  e_mail: 'email',
+  t_rex: 'trex',
+};
 
-      output.push(`  // VERSION ${emoji.version}`);
+// Locales treated as Latin-script. `generateShortcodes` checks membership here: a locale
+// that is NOT in this set additionally gets a `cldr-native.json` dataset built from the
+// untransliterated annotations.
+// Non-latin: ja, ko, ru, th, uk, zh, zh-hant
+const LATIN_LOCALES = new Set([
+  'da',
+  'de',
+  'en',
+  'en-gb',
+  'es',
+  'es-mx',
+  'et',
+  'fi',
+  'fr',
+  'hu',
+  'it',
+  'lt',
+  'ms',
+  'nb',
+  'nl',
+  'pl',
+  'pt',
+  'sv',
+]);
 
-      lastVersion = emoji.version;
+// Shared Kuroshiro instance. `generateShortcodes` initialises it with a Kuromoji analyzer,
+// and `slugify` uses it to convert `ja` annotations to romaji.
+const kuroshiro = new Kuroshiro();
+
+/**
+ * Convert an annotation into a shortcode slug.
+ *
+ * The value is trimmed, optionally transliterated, lower-cased, has runs of separators
+ * replaced with an underscore, has punctuation stripped, and has leading/trailing
+ * underscores removed. If the resulting slug is a key of `CUSTOM_SHORTCODES`, the mapped
+ * override is returned instead.
+ *
+ * @param value - Annotation text to slugify.
+ * @param locale - Locale of the annotation. Only `ja` is special-cased (romaji via kuroshiro);
+ *   every other locale goes through `transliterate`.
+ * @param transform - When true, transliterate the text before slugifying. Defaults to false.
+ * @returns The slug, or its custom override when one is defined.
+ */
+async function slugify(value: string, locale: string, transform: boolean = false): Promise<string> {
+  let slug = value.trim();
+
+  if (transform) {
+    // Japanese: https://github.com/dzcpy/transliteration/issues/226
+    if (locale === 'ja') {
+      slug = await kuroshiro.convert(slug, {
+        mode: 'spaced',
+        romajiSystem: 'passport',
+        to: 'romaji',
+      });
+    } else {
+      slug = transliterate(slug);
     }
+  }
+
+  slug = slug
+    // Locale-independent lower-casing: `toLocaleLowerCase()` without an argument follows the
+    // host's default locale, which would make the generated datasets machine-dependent.
+    .toLowerCase()
+    // Replace runs of separators (whitespace, hyphen, backtick, slashes) with an underscore
+    .replace(/(\s|-|`|\/|\\)+/g, '_')
+    // Remove special chars
+    .replace(/([!"&'()+,.:;<>?ʼ’“”])/g, '')
+    // Collapse repeated underscores
+    .replace(/_{2,}/g, '_')
+    // Remove leading underscores
+    .replace(/^_+/, '')
+    // Remove trailing underscores
+    .replace(/_+$/, '');
+
+  // Only honour overrides defined directly on CUSTOM_SHORTCODES. A bare `obj[slug]` lookup
+  // also resolves inherited members, so a slug such as "constructor" would yield a function.
+  const override = Object.prototype.hasOwnProperty.call(CUSTOM_SHORTCODES, slug)
+    ? CUSTOM_SHORTCODES[slug]
+    : undefined;
+
+  return override || slug;
+}
+
+/**
+ * Build the shortcode for a skin tone modification by suffixing the base shortcode.
+ *
+ * @param shortcode - Shortcode of the base emoji.
+ * @param mod - Modification whose `tone` is appended; an array of tones is joined with `-`.
+ * @returns `<shortcode>_<tone>`, or `<shortcode>_<tone>-<tone>` when `tone` is an array.
+ */
+function appendToneIndex(shortcode: string, mod: EmojiModification): string {
+  return `${shortcode}_${Array.isArray(mod.tone) ? mod.tone.join('-') : mod.tone}`;
+}
+
+/**
+ * Generate CLDR-based shortcode datasets for each locale in `SUPPORTED_LOCALES`.
+ *
+ * For every emoji that has an annotation in the locale, the annotation is slugified into a
+ * single shortcode and stored under the emoji's hexcode. The transliterated form is written
+ * to `<locale>/shortcodes/cldr.json`; locales that are not in `LATIN_LOCALES` additionally
+ * get `<locale>/shortcodes/cldr-native.json`, built from the untransliterated annotation.
+ * Skin tone modifications reuse the base emoji's shortcodes with the tone appended.
+ * Locales are processed concurrently through `Promise.all`.
+ */
+export default async function generateShortcodes(): Promise<void> {
+  log.title('data', 'Generating shortcode datasets');
+
+  const data = await buildEmojiData();
+  const emojis = Object.values(filterData(data));
 
-    const unicode = emoji.type === TEXT ? emoji.text : emoji.emoji;
-    const shortcodes = emoji.shortcodes.map((sc) => `'${sc}'`);
+  // Setup transliterations
+  await kuroshiro.init(new KuromojiAnalyzer()); // Japanese
 
-    output.push(`  // ${unicode} ${emoji.annotation || emoji.name}`);
-    output.push(`  '${emoji.hexcode}': [${shortcodes.join(', ')}],`);
-  });
+  // Generate CLDR shortcodes for each locale
+  await Promise.all(
+    SUPPORTED_LOCALES.map(async (locale: string) => {
+      const isLatinChars = LATIN_LOCALES.has(locale);
+      const annotations = await buildAnnotationData(locale);
+      const cldr: ShortcodeMap = {};
+      const cldrNonLatin: ShortcodeMap = {};
+      let hasLatin = false;
+      let hasNonLatin = false;
 
-  output.push('};');
+      // NOTE(review): `emojis` is a plain array (from `Object.values`), so `for await` acts
+      // like `for...of` with an extra await per element; the loop is sequential only because
+      // of the `await slugify(...)` calls in its body.
+      // eslint-disable-next-line no-restricted-syntax
+      for await (const emoji of emojis) {
+        const row = annotations[emoji.hexcode];
+
+        // Skip emoji that have no annotation in this locale
+        if (!row || !row.annotation) {
+          // eslint-disable-next-line no-continue
+          continue;
+        }
+
+        // Transliterated shortcode (third argument enables transliteration)
+        cldr[emoji.hexcode] = [await slugify(row.annotation, locale, true)];
+        hasLatin = true;
+
+        // Native-script shortcode: same annotation, slugified without transliteration
+        if (!isLatinChars) {
+          cldrNonLatin[emoji.hexcode] = [await slugify(row.annotation, locale)];
+          hasNonLatin = true;
+        }
+
+        // Skin tones
+        if (emoji.modifications) {
+          // Both flags are sticky, but at this point the base entry for this emoji has
+          // already been stored in each map whose flag is set, so the lookups below are safe.
+          // eslint-disable-next-line no-loop-func
+          Object.values(emoji.modifications).forEach((mod) => {
+            if (hasLatin) {
+              cldr[mod.hexcode] = cldr[emoji.hexcode].map((code) => appendToneIndex(code, mod));
+            }
+
+            if (hasNonLatin) {
+              cldrNonLatin[mod.hexcode] = cldrNonLatin[emoji.hexcode].map((code) =>
+                appendToneIndex(code, mod),
+              );
+            }
+          });
+        }
+      }
+
+      // Only write a dataset when at least one shortcode was generated for it
+      const promises: Promise<unknown>[] = [];
+
+      if (hasLatin) {
+        promises.push(writeDataset(`${locale}/shortcodes/cldr.json`, cldr));
+      }
+
+      if (hasNonLatin) {
+        promises.push(writeDataset(`${locale}/shortcodes/cldr-native.json`, cldrNonLatin));
+      }
 
-  // Write it!
-  return writeFile(
-    path.join(process.cwd(), 'packages/generator/src/resources'),
-    'shortcodes.ts',
-    output.join('\n'),
+      return Promise.all(promises);
+    }),
   );
 }
diff --git a/packages/generator/src/generators/generateShortcodesOLD.ts b/packages/generator/src/generators/generateShortcodesOLD.ts
@@ -0,0 +1,93 @@
+import path from 'path';
+import { Emoji, TEXT } from 'emojibase';
+import writeFile from '../helpers/writeFile';
+
+const GUIDELINES = `/**
+ * Official Emojibase shortcodes list.
+ *
+ * NAMING GUIDELINES
+ *
+ *  - Gender neutral emoji must be prefixed with "person_",
+ *    while female emoji use "woman_", and male "man_".
+ *    Plural forms use "people_", "women_", and "men_".
+ *    In rare occasions, the gender can be suffixed,
+ *    like "bald_man" or "blonde_woman".
+ *
+ *  - Animals depicted from the side use the animal name,
+ *    while animals depicted with a head, or a face,
+ *    must use the animal name suffixed with "_face".
+ *
+ *  - Japanese specific emoji must be prefixed with "ja_".
+ *
+ *  - Specifiers, like color or size, must be used as a
+ *    prefix. For example, "small_", or "red_".
+ *
+ *  - Use a more descriptive term over the annotation if
+ *    applicable. For example, "storm" over the annotation
+ *    "cloud with lightning and rain".
+ *
+ *  - Use emotions when describing smiley faces. For example,
+ *    "happy" over the annotation "smiling face with open
+ *    mouth & smiling eyes".
+ *    https://www.dailywritingtips.com/100-words-for-facial-expressions/
+ *
+ * ADDING SHORTCODES
+ *
+ * Please submit a PR with the addition so that it
+ * may be discussed.
+ *
+ * RENAMING/REMOVING SHORTCODES
+ *
+ * Shortcodes are meant to be permanent, and should never
+ * change (excluding typos), as to not destroy historical
+ * usage of the shortcode. If a more descriptive term
+ * is wanted, or the Unicode standard has changed meaning
+ * or naming, we should persist the original shortcode.
+ * We can do this by shifting the old shortcode to the end
+ * of the array, while placing the new shortcode at the
+ * beginning. This allows for backwards compatible changes.
+ */`;
+
+/**
+ * Build the source text of `shortcodes.ts` from the English dataset and write it out.
+ *
+ * Loads `packages/data/en/data.json` (resolved from the current working directory), sorts
+ * the emoji by version and then by order, and emits one `'hexcode': [shortcodes]` entry per
+ * emoji. Entries are grouped under `// VERSION n` comments and preceded by the naming
+ * guidelines.
+ *
+ * @returns The value produced by `writeFile` for `shortcodes.ts` in
+ *   `packages/generator/src/resources`.
+ */
+export default async function generateShortcodes(): Promise<string> {
+  // eslint-disable-next-line
+  const data: Required<Emoji>[] = require(path.join(process.cwd(), 'packages/data/en/data.json'));
+  const output: string[] = [
+    '/* eslint-disable sort-keys */',
+    '',
+    GUIDELINES,
+    '',
+    'export default {',
+  ];
+  // Version of the previously emitted emoji; 0 means nothing has been emitted yet
+  let lastVersion = 0;
+
+  // Sort by version -> order (sorts the loaded array in place)
+  data.sort((a, b) => (a.version === b.version ? a.order - b.order : a.version - b.version));
+
+  // Add each emoji to the list
+  data.forEach((emoji) => {
+    // Start a new version group, separated from the previous group by a blank line
+    if (emoji.version !== lastVersion) {
+      if (lastVersion !== 0) {
+        output.push('');
+      }
+
+      output.push(`  // VERSION ${emoji.version}`);
+
+      lastVersion = emoji.version;
+    }
+
+    // Text-type emoji are shown by their text presentation, all others by their emoji one
+    const unicode = emoji.type === TEXT ? emoji.text : emoji.emoji;
+    const shortcodes = emoji.shortcodes.map((sc) => `'${sc}'`);
+
+    output.push(`  // ${unicode} ${emoji.annotation || emoji.name}`);
+    output.push(`  '${emoji.hexcode}': [${shortcodes.join(', ')}],`);
+  });
+
+  output.push('};');
+
+  // Write it!
+  return writeFile(
+    path.join(process.cwd(), 'packages/generator/src/resources'),
+    'shortcodes.ts',
+    output.join('\n'),
+  );
+}
diff --git a/packages/regex/tests/regex.test.ts b/packages/regex/tests/regex.test.ts
@@ -15,7 +15,7 @@ import CODEPOINT_EMOJI_PATTERN from '../codepoint/emoji';
 import CODEPOINT_EMOJI_LOOSE_PATTERN from '../codepoint/emoji-loose';
 import CODEPOINT_TEXT_PATTERN from '../codepoint/text';
 import CODEPOINT_TEXT_LOOSE_PATTERN from '../codepoint/text-loose';
-import SHORTCODE_PATTERN from '../shortcode';
+// import SHORTCODE_PATTERN from '../shortcode';
 import EMOTICON_PATTERN from '../emoticon';
 
 const PATTERNS = {

diff --git a/types/kuroshiro.d.ts b/types/kuroshiro.d.ts
@@ -0,0 +1,15 @@
+// Hand-written ambient typings for the `kuroshiro` package. Only the constructor, `init`
+// and `convert` are declared here.
+declare module 'kuroshiro' {
+  class Kuroshiro {
+    constructor();
+    // Registers the analyzer; typed as `unknown` because no analyzer type is declared here.
+    // NOTE(review): presumably must resolve before `convert` is called; confirm in kuroshiro docs.
+    init(analyzer: unknown): Promise<void>;
+    // Converts `value` according to `options`; every option is optional.
+    convert(value: string, options: {
+      mode?: 'normal' | 'spaced' | 'okurigana' | 'furigana',
+      to?: 'hiragana' | 'katakana' | 'romaji',
+      romajiSystem?: 'nippon' | 'passport' | 'hepburn'
+    }): Promise<string>;
+  }
+
+  export default Kuroshiro;
+}
+
+// Shorthand ambient declaration: no typings are provided, so everything imported from this
+// module is typed as `any`.
+declare module 'kuroshiro-analyzer-kuromoji';