diff --git a/.gitignore b/.gitignore index 9fbb3c4..f822971 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,13 @@ *.class \.* target/ +out/ # Package Files # *.jar *.war *.ear -jieba-analysis.iml +*.iml +*.ipr +*.iws diff --git a/build.gradle b/build.gradle new file mode 100644 index 0000000..524ca0c --- /dev/null +++ b/build.gradle @@ -0,0 +1,27 @@ +apply plugin: 'java' +apply plugin: 'maven' +apply plugin: 'idea' + +group = 'com.huaban' +version = '1.0.3-SNAPSHOT' + +description = """结巴分词工具(jieba for java)""" + +sourceCompatibility = 1.7 +targetCompatibility = 1.7 +tasks.withType(JavaCompile) { + options.encoding = 'UTF-8' +} + +configurations.all { +} + +repositories { + maven {url "http://maven.aliyun.com/nexus/content/groups/public"} + maven { url "https://oss.sonatype.org/content/repositories/snapshots" } + maven { url "http://repo.maven.apache.org/maven2" } +} +dependencies { + compile group: 'org.apache.commons', name: 'commons-lang3', version:'3.3.1' + testCompile group: 'junit', name: 'junit', version:'4.8' +} diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..c545d94 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Sat Jul 29 16:48:54 CST 2017 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-3.2-bin.zip diff --git a/gradlew b/gradlew new file mode 100644 index 0000000..4ef3a87 --- /dev/null +++ b/gradlew @@ -0,0 +1,171 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn ( ) { + echo "$*" +} + +die ( ) { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +for s in "${@}" ; do + s=\"$s\" + APP_ARGS=$APP_ARGS" "$s +done + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- "$DEFAULT_JVM_OPTS" "$JAVA_OPTS" "$GRADLE_OPTS" "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..f955316 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 0000000..dcbd3dc --- /dev/null +++ b/settings.gradle @@ -0,0 +1 @@ +rootProject.name = 'jieba-analysis' diff --git a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java index 8cb1e53..b0aed58 100644 --- a/src/main/java/com/huaban/analysis/jieba/WordDictionary.java +++ b/src/main/java/com/huaban/analysis/jieba/WordDictionary.java @@ -9,28 +9,38 @@ import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Locale; -import java.util.Map; +import java.util.*; import java.util.Map.Entry; -import java.util.Set; public class WordDictionary { - private static WordDictionary singleton; + private static volatile WordDictionary singleton; private static final String MAIN_DICT = "/dict.txt"; private static String USER_DICT_SUFFIX = ".dict"; - public final Map freqs = new HashMap(); + public final Map freqs = new HashMap(); public final Set loadedPath = new HashSet(); private Double minFreq = Double.MAX_VALUE; private Double total = 0.0; private DictSegment _dict; + private boolean useDefaultDict = true; + private static final String CONFIG_NAME = "jieba.defaultDict"; private WordDictionary() { - this.loadDict(); + loadConfig(); + + if (this.useDefaultDict) { + this.loadDict(); + } + } + + private void loadConfig() { + String configString = System.getenv(CONFIG_NAME); + if (configString == null) { + configString = System.getProperty(CONFIG_NAME, "true"); + } + this.useDefaultDict = Boolean.valueOf(configString); } @@ -100,16 +110,12 @@ public void loadDict() { continue; String word = tokens[0]; - double freq = Double.valueOf(tokens[1]); + int freq = Integer.valueOf(tokens[1]); total += freq; word = addWord(word); - freqs.put(word, freq); - } - // normalize - for (Entry entry : freqs.entrySet()) { - entry.setValue((Math.log(entry.getValue() / total))); - minFreq = Math.min(entry.getValue(), minFreq); + freqs.put(word, new ComputedFreq(freq)); } + normalizeFreqs(); System.out.println(String.format(Locale.getDefault(), "main dict load finished, time elapsed %d ms", System.currentTimeMillis() - s)); } @@ -127,6 +133,14 @@ public void loadDict() { } } + private void normalizeFreqs() { + // normalize + for (Entry entry : freqs.entrySet()) { + entry.getValue().compFreq = Math.log((double) entry.getValue().freq / total); + minFreq = Math.min(entry.getValue().compFreq, minFreq); + } + } + private String addWord(String word) { if (null != word && !"".equals(word.trim())) { @@ -144,9 +158,10 @@ public void loadUserDict(Path userDict) { } - public void loadUserDict(Path userDict, Charset charset) { + public void loadUserDict(Path userDict, Charset charset) { + BufferedReader br = null; try { - BufferedReader br = Files.newBufferedReader(userDict, charset); + br = Files.newBufferedReader(userDict, charset); long s = System.currentTimeMillis(); int count = 0; while (br.ready()) { @@ -160,18 +175,33 @@ public void loadUserDict(Path userDict, Charset charset) { String word = tokens[0]; - double freq = 3.0d; - if (tokens.length == 2) - freq = Double.valueOf(tokens[1]); - word = addWord(word); - freqs.put(word, Math.log(freq / total)); + int freq = 3; + if (tokens.length >= 2) + freq = Integer.valueOf(tokens[1]); + word = addWord(word); + ComputedFreq computedFreq = freqs.get(word); + if (computedFreq == null) { + freqs.put(word, new ComputedFreq(freq)); + } else { + computedFreq.freq += freq; + } + total += freq; count++; } + normalizeFreqs(); System.out.println(String.format(Locale.getDefault(), "user dict %s load finished, tot words:%d, time elapsed:%dms", userDict.toString(), count, System.currentTimeMillis() - s)); - br.close(); } catch (IOException e) { System.err.println(String.format(Locale.getDefault(), "%s: load user dict failure!", userDict.toString())); + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + System.err.println(String.format(Locale.getDefault(), + "%s: load user dict failure! %s", userDict.toString(), e.getMessage())); + } + } } } @@ -185,11 +215,23 @@ public boolean containsWord(String word) { return freqs.containsKey(word); } + public boolean isUseDefaultDict() { + return useDefaultDict; + } public Double getFreq(String key) { if (containsWord(key)) - return freqs.get(key); + return freqs.get(key).compFreq; else return minFreq; } + + private static class ComputedFreq { + private double compFreq; + private int freq; + + public ComputedFreq(int freq) { + this.freq = freq; + } + } } diff --git a/src/test/java/com/huaban/analysis/jieba/WordDictionaryTest.java b/src/test/java/com/huaban/analysis/jieba/WordDictionaryTest.java new file mode 100644 index 0000000..762895c --- /dev/null +++ b/src/test/java/com/huaban/analysis/jieba/WordDictionaryTest.java @@ -0,0 +1,59 @@ +/** + * (C) Copyright 2017 alex qian + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +package com.huaban.analysis.jieba; + +import org.junit.Assert; +import org.junit.Test; + +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Paths; + +/** + * + * WordDictionaryTest + * + * @author alex.Q + * @date 2017/7/29 + */ +public class WordDictionaryTest { + + @Test + public void test_loadDict() throws URISyntaxException { + WordDictionary wordDict = WordDictionary.getInstance(); + double d = wordDict.getFreq("司机"); + System.out.println(d); + URL url = ClassLoader.getSystemResource("test_user.dict"); + wordDict.loadUserDict(Paths.get(url.toURI())); + d = wordDict.getFreq("司机"); + System.out.println(d); + } + + @Test + public void test_useDefaultDict() { + WordDictionary wordDict = WordDictionary.getInstance(); + Assert.assertTrue(wordDict.isUseDefaultDict()); + } + + @Test + public void test_useDefaultDict2() { + System.setProperty("jieba.defaultDict", "false"); + WordDictionary wordDict = WordDictionary.getInstance(); + Assert.assertTrue(wordDict.isUseDefaultDict() == false); + } + +} diff --git a/src/test/resources/test_user.dict b/src/test/resources/test_user.dict new file mode 100644 index 0000000..4e9743d --- /dev/null +++ b/src/test/resources/test_user.dict @@ -0,0 +1,2 @@ +老司机 10 n +司机 1000 n \ No newline at end of file