diff --git a/.changeset/quiet-walls-refuse.md b/.changeset/quiet-walls-refuse.md new file mode 100644 index 0000000..c9a3753 --- /dev/null +++ b/.changeset/quiet-walls-refuse.md @@ -0,0 +1,5 @@ +--- +'uniorg-parse': patch +--- + +Fix parsing of Unicode characters in headline tags. The regex for parsing tags previously used the `\w` (word) character class, which matches only ASCII word characters and therefore did not recognize tags containing non-ASCII characters. It now uses Unicode's Letter and Number character properties instead. diff --git a/packages/uniorg-parse/src/__snapshots__/parser.spec.ts.snap b/packages/uniorg-parse/src/__snapshots__/parser.spec.ts.snap index dc70848..14fec9c 100644 --- a/packages/uniorg-parse/src/__snapshots__/parser.spec.ts.snap +++ b/packages/uniorg-parse/src/__snapshots__/parser.spec.ts.snap @@ -2053,6 +2053,31 @@ children: value: "Something" `; +exports[`org/parser headline statistics-cookie non-ascii characters in headline tags 1`] = ` +type: "org-data" +contentsBegin: 0 +contentsEnd: 21 +children: + - type: "section" + contentsBegin: 0 + contentsEnd: 21 + children: + - type: "headline" + level: 1 + todoKeyword: null + priority: null + commented: false + rawValue: "headline" + tags: + - "hello" + - "你好" + contentsBegin: 2 + contentsEnd: 10 + children: + - type: "text" + value: "headline" +`; + exports[`org/parser headline statistics-cookie statistics cookie with long trailing space 1`] = ` type: "org-data" contentsBegin: 0 diff --git a/packages/uniorg-parse/src/parser.spec.ts b/packages/uniorg-parse/src/parser.spec.ts index d340596..2958f46 100644 --- a/packages/uniorg-parse/src/parser.spec.ts +++ b/packages/uniorg-parse/src/parser.spec.ts @@ -124,6 +124,11 @@ describe('org/parser', () => { `* TODO [#A] COMMENT headline /italic/ title :some:tags: [1/3]` ); + itParses( + 'non-ascii characters in headline tags', + `* headline :hello:你好:` + ); + itParses('statistics cookie without trailing space', `* [/]hello`); itParses('statistics cookie with long trailing space', `* [/] hello`); diff --git 
a/packages/uniorg-parse/src/parser.ts b/packages/uniorg-parse/src/parser.ts index 15dc264..6492fa9 100644 --- a/packages/uniorg-parse/src/parser.ts +++ b/packages/uniorg-parse/src/parser.ts @@ -725,7 +725,9 @@ class Parser { const titleStart = this.r.offset(); - const tagsM = this.r.lookingAt(/^(.*?)[ \t]+:([\w@#%:]+):[ \t]*$/m); + const tagsM = this.r.lookingAt( + /^(.*?)[ \t]+:([\p{L}\p{N}_@#%:]+):[ \t]*$/mu + ); const tags = tagsM?.[2].split(':') ?? []; const titleEnd = tagsM ? titleStart + tagsM.index + tagsM[1].length