Skip to content

Commit

Permalink
search: enhance elasticsearch mappings
Browse files Browse the repository at this point in the history
For a better user experience, a series of enhancements has been applied in this PR.

* Creates a custom docker image for elasticsearch with the ICU filter plugin.
* Improves mappings for all records types.
* Changes the way to sort records to put the best scores first.
* Changes the way to sort records to put the most recent ones first when no query is specified.
* Sets a default analyzer in elasticsearch template.
* Adds a default search factory applied to all records types.
* Adds a property that explains how Elasticsearch computed the score for a hit, if the `debug` parameter is set.
* Creates a custom query parser for documents, to avoid searching in full text by default.
* Closes #139.

Co-Authored-by: Sébastien Délèze <[email protected]>
  • Loading branch information
Sébastien Délèze committed Sep 7, 2020
1 parent 6831f07 commit 86bb6fb
Show file tree
Hide file tree
Showing 23 changed files with 716 additions and 141 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ include babel.ini
include docker/haproxy/Dockerfile
include docker/nginx/Dockerfile
include docker/postgres/Dockerfile
include docker/elasticsearch/Dockerfile
include pytest.ini
include scripts/bootstrap
include scripts/console
Expand Down
3 changes: 2 additions & 1 deletion docker-services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ services:
- "15672:15672"
- "5672:5672"
es:
image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.6.0
build: ./docker/elasticsearch/
image: elasticsearch-icu
restart: "always"
environment:
- bootstrap.memory_lock=true
Expand Down
2 changes: 2 additions & 0 deletions docker/elasticsearch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
FROM docker.elastic.co/elasticsearch/elasticsearch-oss:6.6.2
RUN bin/elasticsearch-plugin install analysis-icu
31 changes: 16 additions & 15 deletions sonar/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from datetime import timedelta

from invenio_oauthclient.contrib import orcid
from invenio_records_rest.facets import range_filter, terms_filter
from invenio_records_rest.facets import range_filter

from sonar.modules.deposits.api import DepositRecord, DepositSearch
from sonar.modules.deposits.permissions import DepositPermission
Expand All @@ -40,6 +40,7 @@
from sonar.modules.organisations.permissions import OrganisationPermission
from sonar.modules.permissions import record_permission_factory, \
wiki_edit_permission
from sonar.modules.query import and_term_filter
from sonar.modules.users.api import UserRecord, UserSearch
from sonar.modules.users.permissions import UserPermission
from sonar.modules.utils import get_current_language
Expand Down Expand Up @@ -448,19 +449,19 @@ def _(x):
))),
filters={
'organisation':
terms_filter('organisation.pid'),
and_term_filter('organisation.pid'),
'language':
terms_filter('language.value'),
and_term_filter('language.value'),
'subject':
terms_filter('facet_subjects'),
and_term_filter('facet_subjects'),
'specific_collection':
terms_filter('specificCollections'),
and_term_filter('specificCollections'),
'document_type':
terms_filter('documentType'),
and_term_filter('documentType'),
'controlled_affiliation':
terms_filter('contribution.controlledAffiliation.raw'),
and_term_filter('contribution.controlledAffiliation.raw'),
'author':
terms_filter('contribution.agent.preferred_name.raw'),
and_term_filter('contribution.agent.preferred_name.raw'),
'year':
range_filter('provisionActivity.startDate',
format='yyyy',
Expand All @@ -471,25 +472,25 @@ def _(x):
user=dict(terms=dict(field='user.full_name.keyword')),
contributor=dict(terms=dict(field='facet_contributors'))),
filters={
_('pid'): terms_filter('pid'),
_('status'): terms_filter('status'),
_('user'): terms_filter('user.full_name.keyword'),
_('contributor'): terms_filter('facet_contributors'),
_('pid'): and_term_filter('pid'),
_('status'): and_term_filter('status'),
_('user'): and_term_filter('user.full_name.keyword'),
_('contributor'): and_term_filter('facet_contributors'),
})
}
"""REST search facets."""

RECORDS_REST_SORT_OPTIONS = dict(documents=dict(
bestmatch=dict(
title=_('Best match'),
fields=['_score'],
default_order='desc',
fields=['-_score'],
default_order='asc',
order=2,
),
mostrecent=dict(
title=_('Most recent'),
fields=['-_created'],
default_order='asc',
default_order='desc',
order=1,
),
))
Expand Down
5 changes: 3 additions & 2 deletions sonar/es_templates/v6/record.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
}
},
"analyzer": {
"global_lowercase_asciifolding": {
"default": {
"type": "custom",
"tokenizer": "char_group_tokenizer",
"filter": [
"lowercase",
"asciifolding"
"icu_folding",
"german_normalization"
]
}
}
Expand Down
212 changes: 205 additions & 7 deletions sonar/modules/deposits/mappings/v6/deposits/deposit-v1.0.0.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,239 @@
"numeric_detection": false,
"properties": {
"$schema": {
"type": "text",
"index": false
"type": "keyword"
},
"pid": {
"type": "keyword"
},
"_bucket": {
"type": "keyword"
},
"_files": {
"type": "object",
"properties": {
"bucket": {
"type": "keyword"
},
"file_id": {
"type": "keyword"
},
"version_id": {
"type": "keyword"
},
"key": {
"type": "keyword"
},
"checksum": {
"type": "keyword"
},
"size": {
"type": "integer"
},
"label": {
"type": "text"
},
"category": {
"type": "keyword"
},
"type": {
"type": "keyword"
},
"embargo": {
"type": "boolean"
},
"embargoDate": {
"type": "date",
"format": "yyyy-MM-dd"
},
"exceptInOrganisation": {
"type": "boolean"
}
}
},
"user": {
"type": "object"
"type": "object",
"properties": {
"$ref": {
"type": "keyword"
}
}
},
"status": {
"type": "keyword"
},
"step": {
"type": "keyword"
},
"logs": {
"type": "object",
"properties": {
"user": {
"type": "object",
"properties": {
"$ref": {
"type": "keyword"
}
}
},
"action": {
"type": "keyword"
},
"date": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss"
},
"comment": {
"type": "text"
}
}
},
"metadata": {
"type": "object"
"type": "object",
"properties": {
"documentType": {
"type": "keyword"
},
"title": {
"type": "text"
},
"subtitle": {
"type": "text"
},
"otherLanguageTitle": {
"type": "object",
"properties": {
"title": {
"type": "text"
},
"language": {
"type": "keyword"
}
}
},
"language": {
"type": "keyword"
},
"documentDate": {
"type": "date",
"format": "yyyy-MM-dd||yyyy"
},
"publication": {
"type": "object",
"properties": {
"publishedIn": {
"type": "text"
},
"year": {
"type": "text"
},
"volume": {
"type": "text"
},
"number": {
"type": "text"
},
"pages": {
"type": "text"
},
"editors": {
"type": "text"
},
"publisher": {
"type": "text"
}
}
},
"otherElectronicVersions": {
"type": "object",
"properties": {
"publicNote": {
"type": "text"
},
"url": {
"type": "keyword"
}
}
},
"specificCollections": {
"type": "text"
},
"classification": {
"type": "keyword"
},
"abstracts": {
"type": "object",
"properties": {
"language": {
"type": "keyword"
},
"abstract": {
"type": "text"
}
}
},
"subjects": {
"type": "object",
"properties": {
"language": {
"type": "keyword"
},
"subjects": {
"type": "keyword"
}
}
},
"dissertation": {
"type": "object",
"properties": {
"degree": {
"type": "text"
},
"jury_note": {
"type": "text"
},
"grantingInstitution": {
"type": "text"
},
"date": {
"type": "date",
"format": "yyyy-MM-dd||yyyy"
}
}
}
}
},
"contributors": {
"type": "object",
"properties": {
"name": {
"type": "text",
"copy_to": "facet_contributors"
},
"affiliation": {
"type": "keyword"
},
"role": {
"type": "keyword"
},
"orcid": {
"type": "keyword"
}
}
},
"facet_contributors": {
"type": "keyword"
},
"projects": {
"type": "object"
},
"diffusion": {
"type": "object"
},
"document": {
"type": "object",
"properties": {
"$ref": {
"type": "keyword"
}
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions sonar/modules/deposits/marshmallow/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,4 @@ class DepositSchemaV1(StrictKeysMixin):
created = fields.Str(dump_only=True)
updated = fields.Str(dump_only=True)
links = fields.Dict(dump_only=True)
explanation = fields.Raw(dump_only=True)
4 changes: 2 additions & 2 deletions sonar/modules/deposits/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
"""Query for deposits."""

from flask import current_app
from invenio_records_rest.query import es_search_factory

from sonar.modules.organisations.api import current_organisation
from sonar.modules.query import default_search_factory
from sonar.modules.users.api import current_user_record


Expand All @@ -31,7 +31,7 @@ def search_factory(self, search, query_parser=None):
:param query_parser: Url arguments.
:returns: Tuple with search instance and URL arguments.
"""
search, urlkwargs = es_search_factory(self, search)
search, urlkwargs = default_search_factory(self, search)

if current_app.config.get('SONAR_APP_DISABLE_PERMISSION_CHECKS'):
return (search, urlkwargs)
Expand Down
Loading

0 comments on commit 86bb6fb

Please sign in to comment.