Skip to content

Commit

Permalink
Merge branch 'release/v7.17.1'
Browse files Browse the repository at this point in the history
- Fixed loading coccoc native lib thread safety issue
- Added CI
  • Loading branch information
duydo committed May 26, 2022
2 parents 19adce2 + c370112 commit 0985dd7
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 15 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CI workflow: builds the coccoc-tokenizer native library with its Java
# bindings, then runs the plugin's Maven tests on each JDK in the matrix.
name: Test

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # 'temurin' replaces 'adopt': AdoptOpenJDK moved to Eclipse Temurin and
        # the 'adopt' distribution no longer receives updates.
        entry:
          - { version: 11, distribution: 'temurin' }
          - { version: 17, distribution: 'temurin' }
    steps:
      - name: Checkout analysis-vietnamese
        uses: actions/checkout@v3
      - name: Check Out coccoc-tokenizer
        uses: actions/checkout@v3
        with:
          repository: coccoc/coccoc-tokenizer
          path: coccoc-tokenizer
      - name: Build coccoc-tokenizer
        working-directory: ./coccoc-tokenizer
        run: |
          mkdir build && cd build
          cmake -DBUILD_JAVA=1 ..
          sudo make install
      - name: Set up JDK ${{ matrix.entry.version }}
        uses: actions/setup-java@v3
        with:
          java-version: ${{ matrix.entry.version }}
          distribution: ${{ matrix.entry.distribution }}
          cache: maven
      - name: Build and Test
        # The native library is installed under /usr/local/lib by the build step
        # above; expose it to the JVM's System.loadLibrary lookup.
        run: |
          export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
          mvn --batch-mode test
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Vietnamese Analysis Plugin for Elasticsearch

[![Test](https://github.com/duydo/elasticsearch-analysis-vietnamese/actions/workflows/test.yml/badge.svg)](https://github.com/duydo/elasticsearch-analysis-vietnamese/actions/workflows/test.yml)

Vietnamese Analysis plugin integrates Vietnamese language analysis into Elasticsearch. It uses the [C++ tokenizer for Vietnamese](https://github.com/coccoc/coccoc-tokenizer) library developed by
the CocCoc team for their Search Engine and Ads systems.

Expand Down Expand Up @@ -114,7 +116,6 @@ If you want to use the plugin with prior versions of Elasticsearch, you can buil

| Vietnamese Analysis Plugin | Elasticsearch |
| -------------------------- | ------------- |
| chưa hỗ trợ | 8.x |
| master | 7.16~7.17 |
| 7.12.1 | 7.12.1~7.15.x |
| 7.3.1 | 7.3.1 |
Expand Down
8 changes: 4 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.build.java.version>1.8</project.build.java.version>
<elasticsearch.version>${project.version}</elasticsearch.version>
<log4j.version>2.17.0</log4j.version>
<log4j.version>2.17.2</log4j.version>
</properties>
<dependencies>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>30.1.1-jre</version>
<version>31.1-jre</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
Expand Down Expand Up @@ -123,9 +123,9 @@
<!-- we skip surefire to work with randomized testing above -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.9</version>
<version>2.20</version>
<configuration>
<skipTests>true</skipTests>
<skipTests>false</skipTests>
</configuration>
</plugin>
<plugin>
Expand Down
35 changes: 26 additions & 9 deletions src/main/java/com/coccoc/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,6 @@ public class Tokenizer {
System.loadLibrary(TOKENIZER_SHARED_LIB_NAME);
}

public static final String SPACE = " ";
public static final String UNDERSCORE = "_";
public static final String COMMA = ",";
public static final String DOT = ".";


public enum TokenizeOption {
NORMAL(0),
Expand All @@ -37,12 +32,34 @@ public int value() {
}
}

public Tokenizer(String dictPath) {
public static final String SPACE = " ";
public static final String UNDERSCORE = "_";
public static final String COMMA = ",";
public static final String DOT = ".";


private static String dictPath = null;

/**
 * Holder for the single shared {@link Tokenizer}.
 *
 * <p>The instance is created when this nested class is initialized, i.e. the
 * first time {@code INSTANCE} is read. It is built from whatever value the
 * enclosing class's static {@code dictPath} field holds at that moment.
 */
private static final class Loader {
    private static final Tokenizer INSTANCE;

    static {
        INSTANCE = get();
    }

    /** Not instantiable; this class exists only to hold {@code INSTANCE}. */
    private Loader() {
    }

    /** Builds the tokenizer from the enclosing class's current {@code dictPath}. */
    private static Tokenizer get() {
        return new Tokenizer(dictPath);
    }
}

/**
 * Returns the shared {@link Tokenizer}, creating it on the first call.
 *
 * <p>The tokenizer is created exactly once, using the {@code dictPath} passed
 * by the first caller. Later calls return that same instance and their
 * {@code dictPath} argument is ignored.
 *
 * <p>The method is synchronized so that recording the path and triggering the
 * creation of the instance happen as one step. Without this, a concurrent
 * caller could overwrite the path between another thread's write and that
 * thread's initialization of {@code Loader}, and the instance would be built
 * with a path its creator never supplied.
 *
 * @param dictPath path handed to the tokenizer's initialization; only the
 *                 first non-null value ever recorded is used
 * @return the shared tokenizer instance
 */
public static synchronized Tokenizer getInstance(String dictPath) {
    // Keep the first recorded path so the field always matches the path the
    // shared instance was (or will be) initialized with.
    if (Tokenizer.dictPath == null) {
        Tokenizer.dictPath = dictPath;
    }
    return Loader.INSTANCE;
}

/**
 * Creates a tokenizer by passing {@code dictPath} to {@code initialize}.
 *
 * <p>The constructor is private; within the visible code, instances are
 * obtained through {@code getInstance}.
 *
 * @param dictPath value forwarded unchanged to {@code initialize}
 *                 (presumably the dictionary location; confirm against the native library)
 * @throws RuntimeException if {@code initialize} returns a negative status
 */
private Tokenizer(String dictPath) {
int status = initialize(dictPath);
// A negative status is treated as an initialization failure.
if (0 > status) {
// NOTE(review): the two throw statements below look like the removed
// (multi-line) and added (single-line) forms of the same statement as
// rendered by this diff view; confirm against the actual file.
throw new RuntimeException(
String.format("Cannot initialize Tokenizer: %s", dictPath)
);
throw new RuntimeException(String.format("Cannot initialize Tokenizer: %s", dictPath));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ final class VietnameseTokenizerImpl {
option = TokenizeOption.NORMAL;
}
tokenizer = AccessController.doPrivileged(
(PrivilegedAction<Tokenizer>) () -> new Tokenizer(config.dictPath)
(PrivilegedAction<Tokenizer>) () -> Tokenizer.getInstance(config.dictPath)
);
pending = new CopyOnWriteArrayList<>();
}
Expand Down

0 comments on commit 0985dd7

Please sign in to comment.