Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inflection-64 Convert dictionary-parser to consume Wikidata #65

Merged
merged 4 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions inflection/gradle/versions.gradle
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
/*
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2020-2024 Apple Inc. All rights reserved.
*/
// This is a sorted map of every dependency we want to use throughout the whole Inflection repo
ext.dependencyVersions = [
commons_text: '1.10.0',
icu4j: '75.1',
commons_compress: '1.27.1',
commons_text: '1.13.0',
icu4j: '76.1',
jackson: '2.18.2',
jsr305: '3.0.2',
junit_jupiter: '5.10.2',
junit_platform: '1.10.2',
log4j: '2.22.1',
xerces: '2.12.2',
junit_jupiter: '5.11.4',
junit_platform: '1.11.4',
log4j: '2.24.3',
]
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<?xml version='1.0' encoding='utf-8'?>
<!--
Copyright 2025 Unicode Incorporated and others. All rights reserved.
Copyright 2016-2024 Apple Inc. All rights reserved.
-->
<!DOCTYPE languages SYSTEM "grammar.dtd">
Expand All @@ -23,7 +24,7 @@
<grammeme name="interjection"/>
<!-- grammeme name="letter"/ -->
<grammeme name="noun"/>
<!-- grammeme name="number"/ -->
<grammeme name="numeral"/>
<!-- grammeme name="participle" -->
<grammeme name="particle"/>
<!-- grammeme name="postposition"/ -->
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/*
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2016-2024 Apple Inc. All rights reserved.
*/
#include "catch2/catch_test_macros.hpp"
Expand Down Expand Up @@ -111,7 +112,7 @@ TEST_CASE("LanguageGrammarFeaturesTest#testLanguageGrammarFeatures") /* throws(E
const auto& nbLangGramFeatures = ::inflection::lang::features::LanguageGrammarFeatures::getLanguageGrammarFeatures(::inflection::util::ULocale("nb"));
auto nbCategories = nbLangGramFeatures.getCategories();
REQUIRE(nbCategories.size() == 11);
REQUIRE(categorySize(nbCategories, u"pos") == 10);
REQUIRE(categorySize(nbCategories, u"pos") == 11);
REQUIRE(categorySize(nbCategories, u"number") == 2);

auto nbFeatures = nbLangGramFeatures.getFeatures();
Expand All @@ -134,7 +135,7 @@ TEST_CASE("LanguageGrammarFeaturesTest#testLanguageGrammarFeatures") /* throws(E
const auto& rootLangGramFeatures = ::inflection::lang::features::LanguageGrammarFeatures::getLanguageGrammarFeatures(::inflection::util::LocaleUtils::ROOT());
auto rootCategories = rootLangGramFeatures.getCategories();
REQUIRE(rootCategories.size() == 2);
REQUIRE(categorySize(rootCategories, u"pos") == 10);
REQUIRE(categorySize(rootCategories, u"pos") == 11);
REQUIRE(categorySize(rootCategories, u"gender") == -1);
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#
# Copyright 2025 Unicode Incorporated and others. All rights reserved.
# Copyright 2020-2024 Apple Inc. All rights reserved.
#
JAR=build/libs/dictionary-parser-all.jar
if [ ! -e "$JAR" ]
then
../../gradlew clean fatJar >&2
fi
java -Dfile.encoding=UTF-8 -cp $JAR com.apple.lexicon.ParseLexicon $*
java -Dfile.encoding=UTF-8 -cp $JAR org.unicode.wikidata.ParseWikidata $*
24 changes: 10 additions & 14 deletions inflection/tools/dictionary-parser/README.md
Original file line number Diff line number Diff line change
@@ -1,27 +1,23 @@
<!--
Copyright 2025 Unicode Incorporated and others. All rights reserved.
Copyright 2021-2024 Apple Inc. All rights reserved.
-->
# Dictionary Parsers

These tools generate files that describe the grammatical properties of words from various sources.

## Usage for Wiktionary
## Usage for Wikidata

1) Download a copy of Wiktionary data from http://dumps.wikimedia.org/ (e.g. http://dumps.wikimedia.org/enwiktionary/20130825/enwiktionary-20130825-pages-articles.xml.bz2)
2) Decompress dump
3) Run `./ParseWiktionary ~/Downloads/enwiktionary-20130825-pages-articles.xml > ../../resources/com/apple/inflection/dictionary/dictionary_en.lst`

## Usage for Apple's Lexical Resources (ALR)

1) Get the latest lexicon for your language.
grhoten marked this conversation as resolved.
Show resolved Hide resolved
1) Download a copy of Wikidata from https://dumps.wikimedia.org/wikidatawiki/entities/ (e.g. https://dumps.wikimedia.org/wikidatawiki/entities/20250115/wikidata-20250115-lexemes.json.bz2)
2) Check what options were used for your language. They are recorded at the end of the generated dictionary_XX.lst; look for "generated with options".
- Run `grep 'generated with options' ../../resources/com/apple/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
- Run `grep 'generated with options' ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst | cut -d':' -f2`
- If the above command prints nothing, no additional options were used to generate the file, or it was generated with a different tool.
- To see what options are available run `./ParseLexicon`
- To see what options are available run `./ParseWikidata`
- At minimum use the `--locale` option to specify the ISO-639 code for the language to extract.
3) Run
```
./ParseLexicon <THE_OPTIONS_FROM_STEP_2> \
--inflections ../../resources/com/apple/inflection/dictionary/inflectional_XX.xml \
<THE_LEXICON_FILE> \
> ../../resources/com/apple/inflection/dictionary/dictionary_XX.lst
./ParseWikidata <THE_OPTIONS_FROM_STEP_2> \
--inflections ../../resources/org/unicode/inflection/dictionary/inflectional_XX.xml \
--dictionary ../../resources/org/unicode/inflection/dictionary/dictionary_XX.lst \
<wikidata-NNNNNNNN-lexemes.json.bz2>
```
10 changes: 6 additions & 4 deletions inflection/tools/dictionary-parser/build.gradle
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/*
* Copyright 2025 Unicode Incorporated and others. All rights reserved.
* Copyright 2020-2024 Apple Inc. All rights reserved.
*/
group = 'com.apple.inflection.tools'
Expand All @@ -7,15 +8,16 @@ description = 'Converts lexical dictionaries into a form that can be consumed by
dependencies {
implementation group: 'com.ibm.icu', name: 'icu4j', version: dependencyVersions.icu4j
implementation(group: 'org.apache.commons', name: 'commons-text', version: dependencyVersions.commons_text)
runtimeOnly(group: 'xerces', name: 'xercesImpl', version: dependencyVersions.xerces) {
exclude(group: 'xml-apis')
}
implementation(group: 'org.apache.commons', name: 'commons-compress', version: dependencyVersions.commons_compress)
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-core', version: dependencyVersions.jackson)
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-annotations', version: dependencyVersions.jackson)
implementation(group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: dependencyVersions.jackson)
}

tasks.register('fatJar', Jar) {
manifest {
attributes 'Implementation-Version': project.version,
'Main-Class': 'com.apple.wiktionary.ParseWiktionary'
'Main-Class': 'org.unicode.wikidata.ParseWikidata'
}
archiveFileName = project.name + '-all.jar'
duplicatesStrategy = 'include'
Expand Down
Loading