diff --git a/docs/synonyms.md b/docs/synonyms.md new file mode 100644 index 000000000..0b7b805df --- /dev/null +++ b/docs/synonyms.md @@ -0,0 +1,83 @@ +# Using Synonyms and Classification Terms + +Photon has built-in support for using custom query-time synonyms and +special phrases for searching a place by its type. This document explains +how to configure this feature. + +## Configuration + +Synonyms and classification terms are configured with a JSON file which can +be added to a Photon server instance using the command line parameter +`-synonym-file`. Synonyms are a run-time feature. Handing in a synonym list +at import time has no effect. The list of synonyms in use can simply be +changed by restarting the Photon server with a different synonym list (or +with none at all, if you want to completely disable the feature again). + +Here is a simple example of a synonym configuration file: + +``` +{ + "search_synonyms": [ + "first,1st", + "second,2nd" + ], + "classification_terms": [ + { + "key": "aeroway", + "value": "aerodrome", + "terms": ["airport", "airfield"] + }, + { + "key": "railway", + "value": "station", + "terms": ["station"] + } + ] +} +``` + +The file has two main sections: `search_synonyms` allows for simple synonym +replacements in the query. `classification_terms` defines descriptive terms +for an OSM key/value pair. + +## Synonyms + +The `search_synonyms` section must contain a list of synonym replacements. +Each entry contains a comma-separated list of terms that may be replaced with each +other in the query. Only single-word terms are allowed. That means the terms +must contain neither spaces nor hyphens or the like.[^1] + +[^1]: This is a restriction of ElasticSearch 5. Synonym replacement does not + create correct term positions when multi-word synonyms are involved. + +## Classification Terms + +The second section `classification_terms` defines a list of OSM key/value +pairs with their descriptive terms. `place` and `building` may not be used as +keys.
Neither will `highway=residential` nor `highway=unclassified` work. +There may be multiple entries for the same key/value pair (for example, +if you have extra entries for each supported language). + +The classification terms can help improve search when the type of an object +is used in the query but does not appear in the name. For example, with the +configuration given above a query of "Berlin Station" will find a railway +station which in OpenStreetMap has the name "Berlin" and also one with +the name "Berlin Hauptbahnhof". + +Classification terms do not enable searching for objects of a certain type. +"Station London" will not get you all railway stations in London but a +railway station _named_ London. + +## Usage Advice + +Use synonyms and classification terms sparingly and only if you can be +reasonably sure that they will target the intended part of the address. +Short or frequent terms can have unexpected side-effects and worsen the +search results. For example, it might sound like a good idea to use synonyms +to handle the abbreviation from 'Saint' to 'St'. The problem here is that +'St' is also used as an abbreviation for 'Street'. So all searches that +involve a 'Street' will suddenly also search for places containing 'Saint'. + +Do not create synonyms for terms that are used as classification terms. +Photon will not complain but again there might be unintended side effects. 
+ diff --git a/es/index_settings.json b/es/index_settings.json index 33748ea35..60fea3a52 100644 --- a/es/index_settings.json +++ b/es/index_settings.json @@ -61,6 +61,12 @@ "lowercase", "preserving_word_delimiter"], "tokenizer": "standard" + }, + "search_classification": { + "filter": [ + "lowercase" + ], + "tokenizer": "whitespace" } }, "tokenizer": { diff --git a/es/mappings.json b/es/mappings.json index 396933c7f..441ca1be1 100644 --- a/es/mappings.json +++ b/es/mappings.json @@ -76,6 +76,15 @@ "importance": { "type": "float" }, + "classification": { + "type": "text", + "index": "true", + "analyzer": "keyword", + "search_analyzer": "search_classification", + "copy_to": [ + "collector.default" + ] + }, "name": { "properties": { "alt": { diff --git a/src/main/java/de/komoot/photon/App.java b/src/main/java/de/komoot/photon/App.java index 3ba382da4..33868a6c3 100644 --- a/src/main/java/de/komoot/photon/App.java +++ b/src/main/java/de/komoot/photon/App.java @@ -65,7 +65,7 @@ public static void main(String[] rawArgs) throws Exception { // Working on an existing installation. // Update the index settings in case there are any changes. 
- esServer.updateIndexSettings(); + esServer.updateIndexSettings(args.getSynonymFile()); esClient.admin().cluster().prepareHealth().setWaitForYellowStatus().get(); if (args.isNominatimUpdate()) { diff --git a/src/main/java/de/komoot/photon/CommandLineArgs.java b/src/main/java/de/komoot/photon/CommandLineArgs.java index d7a7bb6f5..2c7eb51be 100644 --- a/src/main/java/de/komoot/photon/CommandLineArgs.java +++ b/src/main/java/de/komoot/photon/CommandLineArgs.java @@ -35,6 +35,9 @@ public class CommandLineArgs { @Parameter(names = "-extra-tags", description = "additional tags to save for each place") private String extraTags = ""; + @Parameter(names = "-synonym-file", description = "file with synonym and classification terms") + private String synonymFile = null; + @Parameter(names = "-json", description = "import nominatim database and dump it to a json like files in (useful for developing)") private String jsonDump = null; diff --git a/src/main/java/de/komoot/photon/Constants.java b/src/main/java/de/komoot/photon/Constants.java index e87cb79f4..8bcc02a93 100644 --- a/src/main/java/de/komoot/photon/Constants.java +++ b/src/main/java/de/komoot/photon/Constants.java @@ -31,4 +31,5 @@ public class Constants { public static final String OSM_KEY = "osm_key"; public static final String OSM_VALUE = "osm_value"; public static final String OBJECT_TYPE = "object_type"; + public static final String CLASSIFICATION = "classification"; } diff --git a/src/main/java/de/komoot/photon/Utils.java b/src/main/java/de/komoot/photon/Utils.java index f099cb0eb..7625492a4 100644 --- a/src/main/java/de/komoot/photon/Utils.java +++ b/src/main/java/de/komoot/photon/Utils.java @@ -33,6 +33,11 @@ public static XContentBuilder convert(PhotonDoc doc, String[] languages, String[ .field(Constants.OBJECT_TYPE, atype == null ? 
"locality" : atype.getName()) .field(Constants.IMPORTANCE, doc.getImportance()); + String classification = buildClassificationString(doc.getTagKey(), doc.getTagValue()); + if (classification != null) { + builder.field(Constants.CLASSIFICATION, classification); + } + if (doc.getCentroid() != null) { builder.startObject("coordinate") .field("lat", doc.getCentroid().getY()) @@ -200,4 +205,41 @@ public static String stripNonDigits( } return sb.toString(); } + + /** + * Build the term that represents an OSM key/value pair in the classification field. + * + * Lower-casing uses Locale.ROOT so that the term does not depend on the default + * locale of the host (a Turkish locale would map 'I' to a dotless i). + * + * @param key OSM key of the place. May be null. + * @param value OSM value of the place. May be null. + * + * @return "tpfld" + value + "clsfld" + key, lower-cased and without underscores. + * Null when key or value is missing, when the pair is excluded from + * classification (keys place and building, highway=residential, + * highway=unclassified) or when the value contains characters other than + * ASCII letters, digits and underscores. + */ + public static String buildClassificationString(String key, String value) { + if (key == null || value == null || "place".equals(key) || "building".equals(key)) { + return null; + } + + if ("highway".equals(key) + && ("unclassified".equals(value) || "residential".equals(value))) { + return null; + } + + for (char c : value.toCharArray()) { + if (!(c == '_' + || ((c >= 'a') && (c <= 'z')) + || ((c >= 'A') && (c <= 'Z')) + || ((c >= '0') && (c <= '9')))) { + return null; + } + } + + return "tpfld" + value.replace("_", "").toLowerCase(java.util.Locale.ROOT) + "clsfld" + key.replace("_", "").toLowerCase(java.util.Locale.ROOT); + } } diff --git a/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java b/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java index 6cab85113..aa11d251e 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java +++ b/src/main/java/de/komoot/photon/elasticsearch/DatabaseProperties.java @@ -26,7 +26,7 @@ public class DatabaseProperties { * changes in an incompatible way. If it is alredy at the next released * version, increase the dev version.
*/ - private static final String DATABASE_VERSION = "0.3.4-0"; + private static final String DATABASE_VERSION = "0.3.6-0"; public static final String PROPERTY_DOCUMENT_ID = "DATABASE_PROPERTIES"; private static final String BASE_FIELD = "document_properties"; diff --git a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java index 84f1a65f8..2135fe963 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java +++ b/src/main/java/de/komoot/photon/elasticsearch/IndexSettings.java @@ -1,11 +1,16 @@ package de.komoot.photon.elasticsearch; +import de.komoot.photon.Utils; import org.elasticsearch.client.Client; import org.elasticsearch.common.xcontent.XContentType; +import org.json.JSONArray; import org.json.JSONObject; import org.json.JSONTokener; +import java.io.FileReader; +import java.io.IOException; import java.io.InputStream; +import java.util.*; /** * Encapsulates the ES index settings for the photon index. Adds functions to @@ -41,6 +46,126 @@ public IndexSettings setShards(Integer numShards) { return this; } + + /** + * Add query-time synonyms and classification terms from a file. + * + * The file must contain a single JSON object. Its optional array "search_synonyms" + * lists comma-separated groups of interchangeable terms. Its optional array + * "classification_terms" lists objects with the fields "key", "value" and "terms". + * + * @param synonymFile File containing the synonyms. When null, nothing is changed. + * + * @return This object for chaining. + * + * @throws IOException if the file cannot be opened.
+ */ + public IndexSettings setSynonymFile(String synonymFile) throws IOException { + if (synonymFile == null) { + return this; + } + + JSONObject synonymConfig; + // Close the reader once parsing is done so that the file handle is not leaked. + try (FileReader reader = new FileReader(synonymFile)) { + synonymConfig = new JSONObject(new JSONTokener(reader)); + } + + setSearchTimeSynonyms(synonymConfig.optJSONArray("search_synonyms")); + setClassificationTerms(synonymConfig.optJSONArray("classification_terms")); + + return this; + } + + /** + * Install query-time synonyms in the search analyzers. + * + * @param synonyms Synonym rules for an ES synonym filter. May be null or empty, + * in which case nothing is changed. + * + * @return This object for chaining. + */ + public IndexSettings setSearchTimeSynonyms(JSONArray synonyms) { + if (synonyms != null) { + insertSynonymFilter("extra_synonyms", synonyms); + } + + return this; + } + + /** + * Install classification terms: every term becomes a query-time synonym that + * expands to the classification strings of the OSM key/value pairs it describes. + * + * @param terms List of objects with the string fields "key" and "value" and the + * array "terms". May be null, in which case nothing is changed. + * + * @return This object for chaining. + */ + public IndexSettings setClassificationTerms(JSONArray terms) { + if (terms == null) { + return this; + } + + // Collect for each term in the list the possible classification expansions. + Map<String, Set<String>> collector = new HashMap<>(); + for (int i = 0; i < terms.length(); i++) { + JSONObject descr = terms.getJSONObject(i); + + // Already lower-cased by the helper; null for pairs excluded from classification. + String classString = Utils.buildClassificationString(descr.getString("key"), descr.getString("value")); + + if (classString != null) { + JSONArray jsonTerms = descr.getJSONArray("terms"); + for (int j = 0; j < jsonTerms.length(); j++) { + String term = jsonTerms.getString(j).toLowerCase(Locale.ROOT).trim(); + if (term.indexOf(' ') >= 0) { + throw new RuntimeException("Syntax error in synonym file: only single word classification terms allowed."); + } + + if (term.length() > 1) { + collector.computeIfAbsent(term, k -> new HashSet<>()).add(classString); + } + } + } + } + + // Create the final list of synonyms. A term can expand to any classificator or not at all.
+ JSONArray synonyms = new JSONArray(); + collector.forEach((term, classificators) -> + synonyms.put(term + " => " + term + "," + String.join(",", classificators))); + + // Without usable terms no filter is defined, so the analyzer must not reference it. + if (!synonyms.isEmpty()) { + insertSynonymFilter("classification_synonyms", synonyms); + insertJsonArrayAfter("/analysis/analyzer/search_classification", "filter", "lowercase", "classification_synonyms"); + } + + return this; + } + + /** + * Define a synonym filter and add it to the search_ngram and search_raw analyzers. + * Does nothing when the list of synonyms is empty. + * + * @param filterName Name under which the filter is defined. + * @param synonyms Synonym rules for the filter. + */ + private void insertSynonymFilter(String filterName, JSONArray synonyms) { + if (!synonyms.isEmpty()) { + // Create a filter for the synonyms. + JSONObject filters = (JSONObject) settings.optQuery("/analysis/filter"); + if (filters == null) { + throw new RuntimeException("Analyser update: cannot find filter definition"); + } + filters.put(filterName, new JSONObject().put("type", "synonym").put("synonyms", synonyms)); + + // add synonym filter to the search analyzers + insertJsonArrayAfter("/analysis/analyzer/search_ngram", "filter", "lowercase", filterName); + insertJsonArrayAfter("/analysis/analyzer/search_raw", "filter", "lowercase", filterName); + } + } + /** * Create a new index using the current index settings. * @@ -65,4 +190,38 @@ public void updateIndex(Client client, String indexName) { client.admin().indices().prepareUpdateSettings(PhotonIndex.NAME).setSettings(settings.toString(), XContentType.JSON).execute().actionGet(); client.admin().indices().prepareOpen(PhotonIndex.NAME).execute().actionGet(); } + + /** + * Insert the given value into the array after the string given by positionString. + * If the position string is not found, throws a runtime error. + * + * @param jsonPointer Path description of the object containing the array. + * @param field Name of the array field inside that object. + * @param positionString Marker string after which to insert. + * @param value Value to insert. + */ + private void insertJsonArrayAfter(String jsonPointer, String field, String positionString, String value) { + JSONObject parent = (JSONObject) settings.optQuery(jsonPointer); + JSONArray array = parent == null ?
null : parent.optJSONArray(field); + if (array == null) { + throw new RuntimeException("Analyser update: cannot find JSON array at " + jsonPointer + "/" + field); + } + + // We can't just insert items, so build a new array instead. + JSONArray newArray = new JSONArray(); + boolean done = false; + for (int i = 0; i < array.length(); i++) { + newArray.put(array.get(i)); + if (!done && positionString.equals(array.getString(i))) { + newArray.put(value); + done = true; + } + } + + if (!done) { + throw new RuntimeException("Analyser update: cannot find position string " + positionString); + } + + parent.put(field, newArray); + } } diff --git a/src/main/java/de/komoot/photon/elasticsearch/Server.java b/src/main/java/de/komoot/photon/elasticsearch/Server.java index 9cdf45474..d281f4ab4 100644 --- a/src/main/java/de/komoot/photon/elasticsearch/Server.java +++ b/src/main/java/de/komoot/photon/elasticsearch/Server.java @@ -1,13 +1,11 @@ package de.komoot.photon.elasticsearch; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.SystemUtils; import org.elasticsearch.client.Client; import org.elasticsearch.client.transport.TransportClient; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.transport.InetSocketTransportAddress; -import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.IndexNotFoundException; import org.elasticsearch.node.InternalSettingsPreparer; import org.elasticsearch.node.Node; @@ -15,16 +13,12 @@ import org.elasticsearch.plugins.Plugin; import org.elasticsearch.transport.Netty4Plugin; import org.elasticsearch.transport.client.PreBuiltTransportClient; -import org.json.JSONArray; -import org.json.JSONObject; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.net.InetSocketAddress; import java.net.URISyntaxException; import java.net.URL; -import java.nio.charset.Charset; import java.nio.file.Files; import
java.nio.file.StandardCopyOption; import java.util.Arrays; @@ -178,14 +172,14 @@ public DatabaseProperties recreateIndex(String[] languages) throws IOException { return dbProperties; } - public void updateIndexSettings() { + public void updateIndexSettings(String synonymFile) throws IOException { // Load the settings from the database to make sure it is at the right // version. If the version is wrong, we should not be messing with the // index. DatabaseProperties dbProperties = new DatabaseProperties(); dbProperties.loadFromDatabase(getClient()); - loadIndexSettings().updateIndex(getClient(), PhotonIndex.NAME); + loadIndexSettings().setSynonymFile(synonymFile).updateIndex(getClient(), PhotonIndex.NAME); // Sanity check: legacy databases don't save the languages, so there is no way to update // the mappings consistently. diff --git a/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java b/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java index 629d3072d..d3fb3bf78 100644 --- a/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java +++ b/src/main/java/de/komoot/photon/query/PhotonQueryBuilder.java @@ -117,6 +117,7 @@ private PhotonQueryBuilder(String query, String language, List languages query4QueryBuilder.must(QueryBuilders.boolQuery() .should(nameNgramQuery) .should(QueryBuilders.matchQuery("housenumber", query).analyzer("standard")) + .should(QueryBuilders.matchQuery("classification", query).boost(0.1f)) .minimumShouldMatch("1")); } @@ -128,8 +129,9 @@ private PhotonQueryBuilder(String query, String language, List languages // Weigh the resulting score by importance. Use a linear scale function that ensures that the weight // never drops to 0 and cancels out the ES score. 
finalQueryWithoutTagFilterBuilder = QueryBuilders.functionScoreQuery(query4QueryBuilder, new FilterFunctionBuilder[]{ - new FilterFunctionBuilder(ScoreFunctionBuilders.linearDecayFunction("importance", "1.0", "0.6")) - }); + new FilterFunctionBuilder(ScoreFunctionBuilders.linearDecayFunction("importance", "1.0", "0.6")), + new FilterFunctionBuilder(QueryBuilders.matchQuery("classification", query), ScoreFunctionBuilders.weightFactorFunction(0.1f)) + }).scoreMode(ScoreMode.SUM); // Filter for later: records that have a housenumber and no name must only appear when the housenumber matches. queryBuilderForTopLevelFilter = QueryBuilders.boolQuery() diff --git a/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java new file mode 100644 index 000000000..609082aee --- /dev/null +++ b/src/test/java/de/komoot/photon/query/QueryByClassificationTest.java @@ -0,0 +1,157 @@ +package de.komoot.photon.query; + +import com.google.common.collect.ImmutableMap; +import de.komoot.photon.*; +import de.komoot.photon.elasticsearch.IndexSettings; +import de.komoot.photon.elasticsearch.PhotonIndex; +import org.elasticsearch.action.get.GetResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.search.SearchType; +import org.elasticsearch.index.query.QueryBuilder; +import org.json.JSONArray; +import org.json.JSONObject; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Collections; + +import static org.junit.Assert.*; + + +/** + * Tests for searching places by classification string and by configured classification terms. + */ +public class QueryByClassificationTest extends ESBaseTester { + private int testDocId = 10000; + + @Before + public void setup() throws IOException { + setUpES(); + } + + private PhotonDoc createDoc(String key, String value, String name) { + ImmutableMap<String, String> nameMap = ImmutableMap.of("name", name); + + ++testDocId; + return new PhotonDoc(testDocId, "W", testDocId, key, value).names(nameMap); + } + +
private SearchResponse search(String query) { + QueryBuilder builder = PhotonQueryBuilder.builder(query, "en", Collections.singletonList("en"), false).buildQuery(); + return getClient().prepareSearch("photon") + .setSearchType(SearchType.QUERY_THEN_FETCH) + .setQuery(builder) + .execute() + .actionGet(); + } + + @Test + public void testQueryByClassificationString() { + Importer instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant", "curliflower")); + instance.finish(); + refresh(); + + String classTerm = Utils.buildClassificationString("amenity", "restaurant"); + + assertNotNull(classTerm); + + GetResponse response = getById(testDocId); + String classification = (String) response.getSource().get(Constants.CLASSIFICATION); + assertEquals(classTerm, classification); + + SearchResponse result = search(classTerm + " curli"); + + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } + + @Test + public void testQueryByClassificationSynonym() { + Importer instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant", "curliflower")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "amenity") + .put("value", "restaurant") + .put("terms", new JSONArray().put("pub").put("kneipe")) + ); + new IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("pub curli"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + + + result = search("curliflower kneipe"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } + + + @Test + public void testSynonymDoNotInterfereWithWords() { + Importer
instance = makeImporter(); + instance.add(createDoc("amenity", "restaurant", "airport")); + instance.add(createDoc("aeroway", "terminal", "Houston")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "aeroway") + .put("value", "terminal") + .put("terms", new JSONArray().put("airport")) + ); + new IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("airport"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); + + + result = search("airport houston"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } + + @Test + public void testSameSynonymForDifferentTags() { + Importer instance = makeImporter(); + instance.add(createDoc("railway", "halt", "Newtown")); + instance.add(createDoc("railway", "station", "King's Cross")); + instance.finish(); + refresh(); + + JSONArray terms = new JSONArray() + .put(new JSONObject() + .put("key", "railway") + .put("value", "station") + .put("terms", new JSONArray().put("Station")) + ).put(new JSONObject() + .put("key", "railway") + .put("value", "halt") + .put("terms", new JSONArray().put("Station").put("Stop")) + ); + new IndexSettings().setClassificationTerms(terms).updateIndex(getClient(), PhotonIndex.NAME); + getClient().admin().cluster().prepareHealth().setWaitForYellowStatus().get(); + + SearchResponse result = search("Station newtown"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); + + result = search("newtown stop"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId - 1), result.getHits().getHits()[0].getId()); +
+ result = search("king's cross Station"); + assertTrue(result.getHits().getTotalHits() > 0); + assertEquals(Integer.toString(testDocId), result.getHits().getHits()[0].getId()); + } +}