Skip to content

Commit

Permalink
Merge pull request #310 from ncats/name_validation_speed
Browse files Browse the repository at this point in the history
Provide option in namesvalidator to speed up searches and validation
  • Loading branch information
ChemMitch authored Jan 26, 2024
2 parents 4c98bd9 + b700509 commit bcc831b
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ default Substance findBySubstanceReference(SubstanceReference substanceReference
Optional<SubstanceSummary> findSummaryByUuid(UUID uuid);

List<SubstanceSummary> findByNames_NameIgnoreCase(String name);

// Use an explicit JPQL query so that Hibernate does not insert a call to UPPER(),
// which slows down processing on some RDBMSs. The query compares n.name to the
// argument with a plain "=", so the query itself applies no case folding.
// NOTE(review): despite "IgnoreCase" in the method name, whether matching is actually
// case-insensitive presumably depends on the database collation of the name column -- confirm.
// NOTE(review): NamesValidator's "IMPLICIT" mode calls findByNames_Name, not this method -- confirm
// which of the two is intended to be used.
@Query("select s from Substance s join s.names n where n.name = ?1")
List<SubstanceSummary> findByNames_NameIgnoreCaseImplicit(String name);

// Name lookup whose method name carries no IgnoreCase keyword; NamesValidator calls it
// when its caseSearchType equals "IMPLICIT" (compared ignoring case).
// NOTE(review): presumably a derived query that compares names.name to the argument as-is,
// without an UPPER() wrapper -- confirm against the generated SQL.
List<SubstanceSummary> findByNames_Name(String name);

List<SubstanceSummary> findByNames_StdNameIgnoreCase(String stdName);

List<SubstanceSummary> findByCodes_CodeIgnoreCase(String code);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ public class NamesValidator extends AbstractValidatorPlugin<Substance> {
private SubstanceRepository substanceRepository;
// Currently, this is false at FDA; it may be confusing if used together with TagsValidator.
boolean extractLocators = false;
private boolean duplicateNameIsError = false;

private String caseSearchType = "Explicit";

// Keep consistent with NamesUtilities
// This and other replacers should be handled later in a new NameStandardizer class similar to HTMLNameStandardizer
Expand Down Expand Up @@ -105,7 +108,7 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)
boolean preferred = false;
int display = 0;
Iterator<Name> nameIterator = s.names.iterator();
while(nameIterator.hasNext()){
while (nameIterator.hasNext()) {
Name n = nameIterator.next();
if (n == null) {
GinasProcessingMessage mes = GinasProcessingMessage
Expand All @@ -120,7 +123,7 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)
continue;

}
if(n.getName() ==null){
if (n.getName() == null) {
callback.addMessage(GinasProcessingMessage.ERROR_MESSAGE("name can not be null"));
continue;
}
Expand All @@ -140,13 +143,13 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)
// shown which name(s) have been changed in the warning.
TagUtilities.BracketExtraction be = TagUtilities.getBracketExtraction(n.getName());
List<String> locators = be.getTagTerms();
if(!locators.isEmpty()){
if (!locators.isEmpty()) {
GinasProcessingMessage mes = GinasProcessingMessage
.WARNING_MESSAGE(
"Names of form \"<NAME> [<TEXT>]\" are transformed to locators. The following locators will be added:%s",
locators.toString())
locators.toString())
.appliableChange(true);
callback.addMessage(mes, ()->{
callback.addMessage(mes, () -> {
for (String loc : locators) {
// Name is changed to just the namePart!
n.name = be.getNamePart();
Expand Down Expand Up @@ -182,7 +185,7 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)
for (Replacer r : replacers.get()) {
//check for Null
String name = n.getName();
if(name!=null && r.matches(name)) {
if (name != null && r.matches(name)) {
GinasProcessingMessage mes = GinasProcessingMessage
.WARNING_MESSAGE(
r.getMessage(name))
Expand All @@ -191,17 +194,17 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)

}
}
if(n.getAccess().isEmpty()){
if (n.getAccess().isEmpty()) {
boolean hasPublicReference = n.getReferences().stream()
.map(r->r.getValue())
.map(r->s.getReferenceByUUID(r))
.map(r -> r.getValue())
.map(r -> s.getReferenceByUUID(r))
.filter(Objects::nonNull)
.filter(r->r.isPublic())
.filter(r->r.isPublicDomain())
.filter(r -> r.isPublic())
.filter(r -> r.isPublicDomain())
.findAny()
.isPresent();

if(!hasPublicReference){
if (!hasPublicReference) {
GinasProcessingMessage mes = GinasProcessingMessage
.ERROR_MESSAGE("The name :\"%s\" needs an unprotected reference marked \"Public Domain\" in order to be made public.",
n.getName());
Expand Down Expand Up @@ -238,61 +241,71 @@ public void validate(Substance s, Substance objold, ValidatorCallback callback)

Map<String, Set<String>> nameSetByLanguage = new HashMap<>();

Optional<Name> oldDisplayName= (objold!=null && objold.names !=null) ? objold.names.stream().filter(n->n!=null && n.displayName).findFirst() : Optional.empty();
LogUtil.trace(()->String.format("oldDisplayName: present: %b; value: %s", oldDisplayName.isPresent(),
Optional<Name> oldDisplayName = (objold != null && objold.names != null) ? objold.names.stream().filter(n -> n != null && n.displayName).findFirst() : Optional.empty();
LogUtil.trace(() -> String.format("oldDisplayName: present: %b; value: %s", oldDisplayName.isPresent(),
oldDisplayName.isPresent() ? oldDisplayName.get().getName() : ""));

for (Name n : s.names) {
if( n==null || n.getName() == null) {
if (n == null || n.getName() == null) {
//skip over null names
continue;
}
String name = n.getName();
Iterator<Keyword> iter = n.languages.iterator();
String uppercasedName = name.toUpperCase();

while(iter.hasNext()){
while (iter.hasNext()) {
String language = iter.next().getValue();
// System.out.println("language for " + n + " = " + language);
Set<String> names = nameSetByLanguage.computeIfAbsent(language, k->new HashSet<>());
if(!names.add(uppercasedName)){
Set<String> names = nameSetByLanguage.computeIfAbsent(language, k -> new HashSet<>());
if (!names.add(uppercasedName)) {
GinasProcessingMessage mes;
mes = GinasProcessingMessage
.WARNING_MESSAGE("Name '%s' is a duplicate name in the record.", name)
.markPossibleDuplicate();
.WARNING_MESSAGE("Name '%s' is a duplicate name in the record.", name)
.markPossibleDuplicate();
callback.addMessage(mes);
}

}
//nameSet.add(n.getName());
try {
List<SubstanceRepository.SubstanceSummary> sr = substanceRepository.findByNames_NameIgnoreCase(n.name);
List<SubstanceRepository.SubstanceSummary> sr =
(!this.caseSearchType.equalsIgnoreCase("IMPLICIT"))
? substanceRepository.findByNames_NameIgnoreCase(n.name)
: substanceRepository.findByNames_Name(n.name);
if (sr != null && !sr.isEmpty()) {
SubstanceRepository.SubstanceSummary s2 = sr.iterator().next();
if (!s2.getUuid().equals(s.getOrGenerateUUID())) {
GinasProcessingMessage mes = GinasProcessingMessage
.WARNING_MESSAGE("Name '%s' collides (possible duplicate) with existing name for substance:", n.name)
//TODO katzelda Feb 2021: add link back
. addLink(ValidationUtils.createSubstanceLink(s2.toSubstanceReference()))
;
//TODO katzelda Feb 2021: add link back
.addLink(ValidationUtils.createSubstanceLink(s2.toSubstanceReference()));
callback.addMessage(mes);
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
if(oldDisplayName.isPresent() && n.displayName && !oldDisplayName.get().getName().equalsIgnoreCase(n.getName())
&& (s.changeReason==null || !s.changeReason.equalsIgnoreCase(CHANGE_REASON_DISPLAYNAME_CHANGED))) {
if (oldDisplayName.isPresent() && n.displayName && !oldDisplayName.get().getName().equalsIgnoreCase(n.getName())
&& (s.changeReason == null || !s.changeReason.equalsIgnoreCase(CHANGE_REASON_DISPLAYNAME_CHANGED))) {
GinasProcessingMessage mes = GinasProcessingMessage
.WARNING_MESSAGE(
"Preferred Name has been changed from '%s' to '%s'. Please confirm that this change is intentional by submitting.",
oldDisplayName.get().getName(), n.getName());
callback.addMessage(mes);
}
}
}

/**
 * Returns the configured case search type.
 *
 * <p>{@code validate} compares this value to {@code "IMPLICIT"} ignoring case: on a match it
 * looks up existing names with {@code findByNames_Name}; for any other value it uses
 * {@code findByNames_NameIgnoreCase}.
 *
 * @return the current case search type string
 */
public String getCaseSearchType() {
return caseSearchType;
}

/**
 * Sets the case search type used by {@code validate} when looking up existing names.
 *
 * <p>{@code validate} compares the value to {@code "IMPLICIT"} ignoring case: on a match it
 * uses {@code findByNames_Name}; for any other value it uses
 * {@code findByNames_NameIgnoreCase}.
 *
 * @param caseSearchType the search type; {@code null} is replaced by {@code "Explicit"},
 *                       the same value the field is initialized with
 */
public void setCaseSearchType(String caseSearchType) {
    // validate calls this.caseSearchType.equalsIgnoreCase(...) inside a try block whose catch
    // only prints the stack trace; a null here would silently disable the name-collision
    // check for every name, so fall back to the default instead of storing null.
    this.caseSearchType = (caseSearchType == null) ? "Explicit" : caseSearchType;
}


/**
 * Sets the {@code replaceSingleLinefeedPrecededByCertainCharactersWithBlank} flag on this validator.
 *
 * <p>NOTE(review): how the flag is consumed is not visible in this excerpt; presumably it
 * toggles one of the name replacers applied during validation -- confirm.
 *
 * @param replaceSingleLinefeedPrecededByCertainCharactersWithBlank the new flag value
 */
public void setReplaceSingleLinefeedPrecededByCertainCharactersWithBlank(boolean replaceSingleLinefeedPrecededByCertainCharactersWithBlank) {
this.replaceSingleLinefeedPrecededByCertainCharactersWithBlank = replaceSingleLinefeedPrecededByCertainCharactersWithBlank;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,10 @@ gsrs.validators.substances = [
"validatorClass" = "ix.ginas.utils.validation.validators.NamesValidator",
"newObjClass" = "ix.ginas.models.v1.Substance",
"type" = "PRIMARY"
"configClass" = "SubstanceValidatorConfig"
"configClass" = "SubstanceValidatorConfig",
"parameters"= {
"caseSearchType": "Explicit"
}
},

{
Expand Down

0 comments on commit bcc831b

Please sign in to comment.