Skip to content

Commit

Permalink
Merge pull request #379 from pelias/remove-complicated-housenumber-filter
Browse files Browse the repository at this point in the history

Remove complicated housenumber filter
  • Loading branch information
orangejulius authored Nov 8, 2019
2 parents 83e0cf2 + e7adf7e commit 2afa1e3
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 102 deletions.
16 changes: 8 additions & 8 deletions integration/analyzer_peliasIndexOneEdgeGram.js
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ module.exports.tests.functional = function(test, common){
]);

assertAnalysis('address', '101 mapzen place', [
'0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place'
'0:1', '0:10', '0:101', '1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen', '2:p', '2:pl', '2:pla', '2:plac', '2:place'
]);

suite.run( t.end );
Expand Down Expand Up @@ -189,23 +189,23 @@ module.exports.tests.address = function(test, common){
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up

assertAnalysis( 'address', '101 mapzen place', [
'0:101',
'0:1', '0:10', '0:101',
'1:m', '1:ma', '1:map', '1:mapz', '1:mapze', '1:mapzen',
'2:p', '2:pl', '2:pla', '2:plac', '2:place'
]);

assertAnalysis( 'address', '30 w 26 st', [
'0:30',
'0:3', '0:30',
'1:w', '1:we', '1:wes', '1:west',
'2:26',
'2:2', '2:26',
'3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street'
]);

assertAnalysis( 'address', '4B 921 83 st', [
'0:4b',
'2:921', // @todo: this token position is incorrect
'3:83',
'4:s', '4:st', '4:str', '4:stre', '4:stree', '4:street'
'0:4', '0:4b',
'1:9', '1:92', '1:921',
'2:8', '2:83',
'3:s', '3:st', '3:str', '3:stre', '3:stree', '3:street'
]);

suite.run( t.end );
Expand Down
47 changes: 0 additions & 47 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,7 @@ function generate(){
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]
Expand Down Expand Up @@ -216,47 +210,6 @@ function generate(){
"pattern": " +",
"replacement": " "
},

// START OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS
// @see: https://github.com/pelias/schema/pull/133
// note: we use \x02 (start-of-text) and \x03 (end-of-text) characters to mark word borders
"surround_single_characters_with_word_markers":{
"description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)",
"type": "pattern_replace",
"pattern": "^(.{1})$",
"replacement": "\x02$1\x03"
},
"house_number_word_delimiter": {
"description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]",
"type": "word_delimiter",
"split_on_numerics": "true",
"preserve_original": "true"
},
"remove_single_characters": {
"description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number",
"type": "length",
"min": 2
},
"surround_house_numbers_with_word_markers": {
"description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming",
"type": "pattern_replace",
"pattern": "^([0-9]+[a-z]?)$",
"replacement": "\x02$1\x03"
},
"eliminate_tokens_starting_with_word_marker": {
"description": "remove tokens starting but not ending with markers, saves whole house numbers wrapped in markers",
"type": "pattern_replace",
"pattern": "^\x02(.*[^\x03])?$",
"replacement": ""
},
"remove_encapsulating_word_markers": {
"description": "extract the stuff between the markers, extract 14 from \x0214\x03 since we're done the prefix n-gramming step",
"type": "pattern_replace",
"pattern": "^\x02(.*)\x03$",
"replacement": "$1"
}
// END OF COMPLICATED FILTERS TO ANALYZE HOUSE NUMBERS

// more generated below
},
"char_filter": {
Expand Down
41 changes: 0 additions & 41 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,7 @@
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]
Expand Down Expand Up @@ -220,41 +214,6 @@
"pattern": " +",
"replacement": " "
},
"surround_single_characters_with_word_markers": {
"description": "wraps single characters with markers, needed to protect valid single characters and not those extracted from house numbers (14a creates an 'a' token)",
"type": "pattern_replace",
"pattern": "^(.{1})$",
"replacement": "\u0002$1\u0003"
},
"house_number_word_delimiter": {
"description": "splits on letter-to-number transition and vice versa, splits 14a -> [14, 14a, a]",
"type": "word_delimiter",
"split_on_numerics": "true",
"preserve_original": "true"
},
"remove_single_characters": {
"description": "removes single characters created from house_number_word_delimiter, removes the letter portion of a house number",
"type": "length",
"min": 2
},
"surround_house_numbers_with_word_markers": {
"description": "surrounds house numbers with markers, needed to protect whole house numbers from elimination step after prefix n-gramming",
"type": "pattern_replace",
"pattern": "^([0-9]+[a-z]?)$",
"replacement": "\u0002$1\u0003"
},
"eliminate_tokens_starting_with_word_marker": {
"description": "remove tokens starting but not ending with markers, saves whole house numbers wrapped in markers",
"type": "pattern_replace",
"pattern": "^\u0002(.*[^\u0003])?$",
"replacement": ""
},
"remove_encapsulating_word_markers": {
"description": "extract the stuff between the markers, extract 14 from \u000214\u0003 since we're done the prefix n-gramming step",
"type": "pattern_replace",
"pattern": "^\u0002(.*)\u0003$",
"replacement": "$1"
},
"ampersand": {
"type": "synonym",
"synonyms": [
Expand Down
6 changes: 0 additions & 6 deletions test/settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,7 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
"ampersand",
"remove_ordinals",
"removeAllZeroNumericPrefix",
"surround_single_characters_with_word_markers",
"house_number_word_delimiter",
"remove_single_characters",
"surround_house_numbers_with_word_markers",
"peliasOneEdgeGramFilter",
"eliminate_tokens_starting_with_word_marker",
"remove_encapsulating_word_markers",
"unique_only_same_position",
"notnull"
]);
Expand Down

0 comments on commit 2afa1e3

Please sign in to comment.