Skip to content

Commit

Permalink
fix: zwj emojis #2 (#16)
Browse files Browse the repository at this point in the history
  • Loading branch information
niklasvh authored Aug 2, 2021
1 parent 5a70a8f commit a314ea3
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 22 deletions.
21 changes: 14 additions & 7 deletions scripts/generate_line_break_tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@ data.split('\n')
const inputs = input.split(/\s+/g);
const codePoints: string[] = [];
const breaks: string[] = [];
inputs.forEach((input) => {
if ([BREAK_ALLOWED, BREAK_MANDATORY, BREAK_NOT_ALLOWED].indexOf(input) !== -1) {
breaks.push(input);
inputs.forEach((value) => {
if ([BREAK_ALLOWED, BREAK_MANDATORY, BREAK_NOT_ALLOWED].indexOf(value) !== -1) {
breaks.push(value);
} else {
codePoints.push(`0x${input}`);
codePoints.push(`0x${value}`);
}
});
tests.push(`it('${comment}', () => test([${codePoints.join(', ')}], ${JSON.stringify(breaks)}));`);
Expand All @@ -28,13 +28,20 @@ data.split('\n')

const template = `// Generated tests from LineBreakTest.txt, do NOT modify
'use strict';
import {equal} from 'assert';
import {lineBreakAtIndex, codePointsToCharacterClasses, BREAK_MANDATORY, BREAK_ALLOWED} from '../src/LineBreak';
import {strictEqual} from 'assert';
import {lineBreakAtIndex, codePointsToCharacterClasses, BREAK_MANDATORY, BREAK_ALLOWED, classes} from '../src/LineBreak';
const reverseClasses: {[key: number]: string} = Object.keys(classes).reduce((acc: {[key: number]: string}, key: string) => {
acc[classes[key]] = key;
return acc;
}, {});
const test = (codePoints: number[], breaks: string[]) => {
const [indices, types] = codePointsToCharacterClasses(codePoints);
breaks.forEach((c: string, i: number) => {
const b = lineBreakAtIndex(codePoints, i).replace(BREAK_MANDATORY, BREAK_ALLOWED);
equal(b, c, \`\${b} at \${i}, expected \${c} with \${codePointsToCharacterClasses(codePoints)}\`);
strictEqual(b, c, \`\${b} at \${i}, expected \${c} with indices \${indices} and types \${types.map((type) => reverseClasses[type])}\`);
});
};
Expand Down
31 changes: 19 additions & 12 deletions src/LineBreak.ts
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ export const codePointsToCharacterClasses = (
lineBreak: string = 'strict'
): [number[], number[], boolean[]] => {
const types: number[] = [];
const indicies: number[] = [];
const indices: number[] = [];
const categories: boolean[] = [];
codePoints.forEach((codePoint, index) => {
let classType = UnicodeTrie.get(codePoint);
Expand All @@ -136,30 +136,30 @@ export const codePointsToCharacterClasses = (
if (['normal', 'auto', 'loose'].indexOf(lineBreak) !== -1) {
// ‐ U+2010, – U+2013, 〜 U+301C, ゠ U+30A0
if ([0x2010, 0x2013, 0x301c, 0x30a0].indexOf(codePoint) !== -1) {
indicies.push(index);
indices.push(index);
return types.push(CB);
}
}

if (classType === CM || classType === ZWJ) {
// LB10 Treat any remaining combining mark or ZWJ as AL.
if (index === 0) {
indicies.push(index);
indices.push(index);
return types.push(AL);
}

// LB9 Do not break a combining character sequence; treat it as if it has the line breaking class of
// the base character in all of the following rules. Treat ZWJ as if it were CM.
const prev = types[index - 1];
if (LINE_BREAKS.indexOf(prev) === -1) {
indicies.push(indicies[index - 1]);
indices.push(indices[index - 1]);
return types.push(prev);
}
indicies.push(index);
indices.push(index);
return types.push(AL);
}

indicies.push(index);
indices.push(index);

if (classType === CJ) {
return types.push(lineBreak === 'strict' ? NS : ID);
Expand Down Expand Up @@ -187,7 +187,7 @@ export const codePointsToCharacterClasses = (
types.push(classType);
});

return [indicies, types, categories];
return [indices, types, categories];
};

const isAdjacentWithSpaceIgnored = (
Expand Down Expand Up @@ -257,13 +257,15 @@ const previousNonSpaceClassType = (currentIndex: number, classTypes: number[]):
return 0;
};

/**
 * Union of the types of the three break marker constants
 * (BREAK_NOT_ALLOWED, BREAK_ALLOWED, BREAK_MANDATORY).
 * Used as the return type of `_lineBreakAtIndex` and `lineBreakAtIndex`.
 */
export type BREAK_OPPORTUNITIES = typeof BREAK_NOT_ALLOWED | typeof BREAK_ALLOWED | typeof BREAK_MANDATORY;

const _lineBreakAtIndex = (
codePoints: number[],
classTypes: number[],
indicies: number[],
index: number,
forbiddenBreaks?: boolean[]
) => {
): BREAK_OPPORTUNITIES => {
if (indicies[index] === 0) {
return BREAK_NOT_ALLOWED;
}
Expand Down Expand Up @@ -305,11 +307,16 @@ const _lineBreakAtIndex = (
return BREAK_ALLOWED;
}

// LB8a Do not break between a zero width joiner and an ideograph, emoji base or emoji modifier.
// LB8a Do not break after a zero width joiner.
if (UnicodeTrie.get(codePoints[currentIndex]) === ZWJ) {
return BREAK_NOT_ALLOWED;
}

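// Keep ZWJ emoji sequences together: do not break between an emoji base or emoji modifier and a following zero width joiner.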
if ((current === EB || current === EM) && UnicodeTrie.get(codePoints[afterIndex]) === ZWJ) {
return BREAK_NOT_ALLOWED;
}

// LB11 Do not break before or after Word joiner and related characters.
if (current === WJ || next === WJ) {
return BREAK_NOT_ALLOWED;
Expand Down Expand Up @@ -512,7 +519,7 @@ const _lineBreakAtIndex = (
return BREAK_ALLOWED;
};

export const lineBreakAtIndex = (codePoints: number[], index: number) => {
export const lineBreakAtIndex = (codePoints: number[], index: number): BREAK_OPPORTUNITIES => {
// LB2 Never break at the start of text.
if (index === 0) {
return BREAK_NOT_ALLOWED;
Expand All @@ -523,9 +530,9 @@ export const lineBreakAtIndex = (codePoints: number[], index: number) => {
return BREAK_MANDATORY;
}

const [indicies, classTypes] = codePointsToCharacterClasses(codePoints);
const [indices, classTypes] = codePointsToCharacterClasses(codePoints);

return _lineBreakAtIndex(codePoints, classTypes, indicies, index);
return _lineBreakAtIndex(codePoints, classTypes, indices, index);
};

export type LINE_BREAK = 'auto' | 'normal' | 'strict';
Expand Down
15 changes: 15 additions & 0 deletions tests/LineBreaker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,21 @@ describe('LineBreaker', () => {
deepEqual(words, ['Lorem ', 'ipsum ', 'lol.']);
});

it('should handle zwj emojis', () => {
    // Input mixes plain words with a ZWJ family emoji and a skin-tone-modified ZWJ emoji.
    const breaker = LineBreaker('Text with zwj emojis 👨‍👩‍👧‍👦 and modifiers 🤷🏾‍♂️.');

    // Drain the breaker, collecting the text of every non-empty segment it yields.
    const words = [];
    for (let step = breaker.next(); !step.done; step = breaker.next()) {
        if (step.value) {
            words.push(step.value.slice());
        }
    }

    // Each emoji sequence must come out as a single, unbroken segment.
    deepEqual(words, ['Text ', 'with ', 'zwj ', 'emojis ', '👨‍👩‍👧‍👦 ', 'and ', 'modifiers ', '🤷🏾‍♂️.']);
});

it('Works with options', () => {
const breaker = LineBreaker('次の単語グレートブリテンおよび北アイルランド連合王国で本当に大きな言葉', {wordBreak: 'keep-all'});

Expand Down
13 changes: 10 additions & 3 deletions tests/linebreak.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
// Generated tests from LineBreakTest.txt, do NOT modify
'use strict';
import {equal} from 'assert';
import {lineBreakAtIndex, codePointsToCharacterClasses, BREAK_MANDATORY, BREAK_ALLOWED} from '../src/LineBreak';
import {strictEqual} from 'assert';
import {lineBreakAtIndex, codePointsToCharacterClasses, BREAK_MANDATORY, BREAK_ALLOWED, classes} from '../src/LineBreak';

const reverseClasses: {[key: number]: string} = Object.keys(classes).reduce((acc: {[key: number]: string}, key: string) => {
acc[classes[key]] = key;
return acc;
}, {});

const test = (codePoints: number[], breaks: string[]) => {
const [indices, types] = codePointsToCharacterClasses(codePoints);

breaks.forEach((c: string, i: number) => {
const b = lineBreakAtIndex(codePoints, i).replace(BREAK_MANDATORY, BREAK_ALLOWED);
equal(b, c, `${b} at ${i}, expected ${c} with ${codePointsToCharacterClasses(codePoints)}`);
strictEqual(b, c, `${b} at ${i}, expected ${c} with indices ${indices} and types ${types.map((type) => reverseClasses[type])}`);
});
};

Expand Down

0 comments on commit a314ea3

Please sign in to comment.