Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add synonyms linter #447

Merged
merged 2 commits into from
Jun 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions settings.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,23 @@ const fs = require('fs');
const path = require('path');
const peliasConfig = require('pelias-config');
const punctuation = require('./punctuation');
const synonymFile = require('./synonyms/parser');
const synonymParser = require('./synonyms/parser');
const synonymLinter = require('./synonyms/linter');

// load synonyms from disk
const synonyms = fs.readdirSync(path.join(__dirname, 'synonyms'))
.sort()
.filter( f => f.match(/\.txt$/) )
.reduce(( acc, cur ) => {
acc[cur.replace('.txt','')] = synonymFile(
acc[cur.replace('.txt', '')] = synonymParser(
path.join(__dirname, 'synonyms', cur)
);
return acc;
}, {});

// emit synonym warnings
synonymLinter(synonyms);

require('./configValidation').validate(peliasConfig.generate());

function generate(){
Expand Down
20 changes: 10 additions & 10 deletions synonyms/custom_name.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ greater,grtr,gtr
greens,grns
groves,grvs
heights,hghts,hgts,hieghts,ht,hts,hgths
international,intl,int'l
international,intl
lake,lk
lakes,lks
little,ltl,lttl,littl,litl
Expand All @@ -60,7 +60,7 @@ mount,mt,mnt
mountain,mtn
mountains,mtns
municipal,mun,mpal
national,natl,nat'l
national,natl
neck,nck
orchard,orch
paradise,pde,pdse
Expand Down Expand Up @@ -115,7 +115,7 @@ wiese,ws

# Spanish
abril,abr,abl
agosto,ag,agto,ag.to,agt
agosto,ag,agto,agt
altura,alt
alturas,alts
arboleda,arb
Expand All @@ -132,9 +132,9 @@ corral,crral
corralillo,crrlo
diseminado,disem
enero,en,eno,ene,en o
diciembre,dic,dicbre,dic.bre,dice,dic.e,dbre,d.bre,10bre,10.bre,10 bre,xbre,x.bre,x bre
febrero,febo,feb.o,febro,feb.ro,febr,feb
gobierno,gob,gobno,gob.no
diciembre,dic,dicbre,dice,dbre,10bre,10 bre,xbre,x bre
febrero,febo,febro,febr,feb
gobierno,gob,gobno
grande,gr
guerra,ga
independencia,indep
Expand All @@ -154,8 +154,8 @@ militar,milr
monte,mt,mte,mnte
montes,mts,mtes,mntes,mnts
nacional,nal,nacl
noviembre,nbre,n.bre,nvre,n.vre,nove,nov.e,novre,nov.re,novbre,nov.bre,9bre,9.bre,9 bre
octubre,oct,octbre,oct.bre,octe,oct.e,8bre,8.bre,8 bre
noviembre,nbre,nvre,nove,novre,novbre,9bre,9 bre
octubre,oct,octbre,octe,8bre,8 bre
portillo,ptilo,ptllo
prado,prdo
primeros,pros
Expand All @@ -167,8 +167,8 @@ republica,rep
revolucion,rev
ribera,ribr
río,rio
septiembre,setbre,set.bre,sepe,sep.e,sepbre,sep.bre,7bre,7 re,7re,7.re,7 bre,7.bre,sep,set
septiembre,setbre,sepe,sepbre,7bre,7 re,7re,7 bre,sep,set
sierra,srra
valle,vlle
volcan,vlcn
voluntarios,voluntos
voluntarios,voluntos
76 changes: 76 additions & 0 deletions synonyms/linter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
const _ = require('lodash');
const logger = require('pelias-logger').get('schema-synonyms');
const punctuation = require('../punctuation');

/**
* The synonyms linter attempts to warn the user when making
* common mistakes with synonyms.
*
* Warnings:
* - Punctuation: Synonyms should not contain characters in the punctuation blacklist
* - Letter Casing: Synonyms should be lowercase
* - Sanity Checks: At least one synonym should exist, duplicates should be removed
* - Multi Word: Multi-word synonyms can generate unexpected token positions
*/

/**
 * Lint every synonym line of every synonyms file and log a warning for each
 * problem found. Nothing is returned; problems are only reported via the logger.
 *
 * @param {Object} synonyms - map of synonyms file name to the array of
 *   synonym lines (strings) belonging to that file.
 */
function linter(synonyms) {
  _.each(synonyms, (lines, filename) => {
    logger.debug(`[lint] ${filename}`);

    lines.forEach((line, idx) => {
      // the number reported is the 1-based position of the line within `lines`
      const logprefix = `[${filename} line ${idx + 1}]`;
      logger.debug(`[line] ${line}`);

      // split the line on either delimiter (',' or '=>') and trim each token
      const rawTokens = line.split(/,|=>/g).map(t => t.trim());

      // strip blacklisted punctuation from synonyms
      // the 'punctuation.blacklist' contains a list of characters which are
      // stripped from the tokens before indexing.
      const tokens = rawTokens.map(rawToken => {
        let token = rawToken;
        punctuation.blacklist.forEach(char => {
          const replacement = token.split(char).join('');
          // a shorter string means at least one occurrence of `char` was removed
          if (replacement.length !== token.length) {
            logger.warn(`${logprefix} punctuation removed: ${token} --> ${replacement}`);
          }
          token = replacement;
        });
        return token;
      });

      // casing is checked against the raw line, the sanity check against
      // the punctuation-stripped tokens
      letterCasing(line, logprefix, tokens);
      tokensSanityCheck(line, logprefix, tokens);
      // the multi-word check is currently disabled
      // multiWordCheck(line, logprefix, tokens);
    });
  });
}

/**
 * Letter-casing check: warn when the raw synonym line is not already
 * entirely lowercase.
 *
 * @param {string} line - the raw synonym line being linted.
 * @param {string} logprefix - text placed at the start of the warning.
 */
function letterCasing(line, logprefix){
  const lowered = line.toLowerCase();

  // nothing to report when lowercasing leaves the line unchanged
  if (lowered === line) {
    return;
  }

  logger.warn(`${logprefix} should be lowercase:`, line);
}

/**
 * Sanity checks on the tokens of one synonym line: warn when the line has
 * no usable tokens, only a single usable token, or repeated tokens.
 *
 * Empty-string tokens (produced by a blank line, a trailing delimiter, or a
 * token that held nothing but stripped punctuation) do not count as usable,
 * so a line such as 'a,' is reported as having only one token.
 *
 * @param {string} line - the raw synonym line, echoed in the warning.
 * @param {string} logprefix - text placed at the start of each warning.
 * @param {string[]} tokens - the tokens parsed from the line.
 * @returns {*} the result of logger.warn for the 'no tokens' and
 *   'only one token' cases, otherwise undefined.
 */
function tokensSanityCheck(line, logprefix, tokens) {
  const usableCount = tokens.filter(token => token !== '').length;

  if (usableCount === 0) {
    return logger.warn(`${logprefix} no tokens:`, line);
  }
  if (usableCount === 1) {
    return logger.warn(`${logprefix} only one token:`, line);
  }

  // count occurrences; a Map keeps tokens in order of first appearance
  const counts = new Map();
  tokens.forEach(token => counts.set(token, (counts.get(token) || 0) + 1));

  // list each repeated token once, however many times it repeats
  const dupes = [...counts]
    .filter(([, count]) => count > 1)
    .map(([token]) => token);

  if (dupes.length) {
    logger.warn(`${logprefix} duplicate tokens:`, dupes);
  }
}

/**
 * Multi-word check: warn about every token that contains whitespace.
 *
 * Takes the same (line, logprefix, tokens) arguments as the other checks.
 * The older two-argument form multiWordCheck(line, tokens) is still
 * accepted; in that case the warning carries no prefix.
 *
 * @param {string} line - the raw synonym line (not used by this check).
 * @param {string|string[]} logprefix - text placed at the start of each
 *   warning, or the tokens array when called in the two-argument form.
 * @param {string[]} [tokens] - the tokens parsed from the line.
 */
function multiWordCheck(line, logprefix, tokens) {
  // detect the legacy call shape, where the second argument is the token list
  const isLegacyCall = tokens === undefined && Array.isArray(logprefix);
  const tokenList = isLegacyCall ? logprefix : tokens;
  const prefix = (!isLegacyCall && logprefix) ? `${logprefix} ` : '';

  // nothing to check when no token list was supplied
  if (!Array.isArray(tokenList)) {
    return;
  }

  tokenList.forEach(token => {
    if (/\s/.test(token)) {
      logger.warn(`${prefix}multi word synonyms may cause issues with phrase queries:`, token);
    }
  });
}

// the linter function is this module's only export
module.exports = linter;
20 changes: 10 additions & 10 deletions test/fixtures/expected.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@
"greens,grns",
"groves,grvs",
"heights,hghts,hgts,hieghts,ht,hts,hgths",
"international,intl,int'l",
"international,intl",
"lake,lk",
"lakes,lks",
"little,ltl,lttl,littl,litl",
Expand All @@ -246,7 +246,7 @@
"mountain,mtn",
"mountains,mtns",
"municipal,mun,mpal",
"national,natl,nat'l",
"national,natl",
"neck,nck",
"orchard,orch",
"paradise,pde,pdse",
Expand Down Expand Up @@ -295,7 +295,7 @@
"vordere,vd,vord",
"wiese,ws",
"abril,abr,abl",
"agosto,ag,agto,ag.to,agt",
"agosto,ag,agto,agt",
"altura,alt",
"alturas,alts",
"arboleda,arb",
Expand All @@ -312,9 +312,9 @@
"corralillo,crrlo",
"diseminado,disem",
"enero,en,eno,ene,en o",
"diciembre,dic,dicbre,dic.bre,dice,dic.e,dbre,d.bre,10bre,10.bre,10 bre,xbre,x.bre,x bre",
"febrero,febo,feb.o,febro,feb.ro,febr,feb",
"gobierno,gob,gobno,gob.no",
"diciembre,dic,dicbre,dice,dbre,10bre,10 bre,xbre,x bre",
"febrero,febo,febro,febr,feb",
"gobierno,gob,gobno",
"grande,gr",
"guerra,ga",
"independencia,indep",
Expand All @@ -334,8 +334,8 @@
"monte,mt,mte,mnte",
"montes,mts,mtes,mntes,mnts",
"nacional,nal,nacl",
"noviembre,nbre,n.bre,nvre,n.vre,nove,nov.e,novre,nov.re,novbre,nov.bre,9bre,9.bre,9 bre",
"octubre,oct,octbre,oct.bre,octe,oct.e,8bre,8.bre,8 bre",
"noviembre,nbre,nvre,nove,novre,novbre,9bre,9 bre",
"octubre,oct,octbre,octe,8bre,8 bre",
"portillo,ptilo,ptllo",
"prado,prdo",
"primeros,pros",
Expand All @@ -347,7 +347,7 @@
"revolucion,rev",
"ribera,ribr",
"río,rio",
"septiembre,setbre,set.bre,sepe,sep.e,sepbre,sep.bre,7bre,7 re,7re,7.re,7 bre,7.bre,sep,set",
"septiembre,setbre,sepe,sepbre,7bre,7 re,7re,7 bre,sep,set",
"sierra,srra",
"valle,vlle",
"volcan,vlcn",
Expand Down Expand Up @@ -1226,4 +1226,4 @@
},
"dynamic": "strict"
}
}
}