Skip to content

Commit

Permalink
add --species-group-cut flag
Browse files Browse the repository at this point in the history
  • Loading branch information
dimus committed Oct 12, 2023
1 parent a17f4c3 commit 111cc40
Show file tree
Hide file tree
Showing 15 changed files with 152 additions and 73 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## Unreleased

## [v1.9.0] - 2023-10-12 Thu

- Add: restore backward compatibility by creating a new flag
`--species-group-cut`.

## [v1.8.0] - 2023-10-11 Wed

Expand Down
19 changes: 9 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,7 @@ of names. Three versions of canonical forms are included:
| - | *Spiraea alba var. alba* Du Roi | Best for disambiguation, but has many lexical variants |
| Full | *Spiraea alba var. alba* | Presentation, infraspecies disambiguation |
| Simple | *Spiraea alba alba* | Name matching, presentation |
| Stem | *Spiraea alb* | Best for matching fem./masc. inconsistencies |

Note that the stemmed version loses the infraspecific epithet in cases where
it is the same as the specific epithet. If they are not identical, both will be
present (e.g. `Bus alba albus` will produce the stemmed canonical
`Bus alb alb`, because `alba` and `albus` are not identical). This helps to
match nominotypical infraspecies (ICN)/species groups (ICZN).
| Stem | *Spiraea alb alb* | Best for matching fem./masc. inconsistencies |

The ``canonicalName -> full`` is good for presentation, as it keeps more
details.
Expand Down Expand Up @@ -406,15 +400,17 @@ performance.
``--port -p``
: set a port to run web-interface and [RESTful API][OpenAPI].

`` --web-logs``
: requires `--port`. Enables output of logs for web-services.

`` --nsqd-tcp``
: requires `--port`. Allows redirecting web-service log output to an [NSQ]
messaging server's TCP-based endpoint. It is handy for aggregating logs
from GNparser web-services running inside Docker containers or
in Kubernetes pods.
``--species-group-cut``
: Changes the stemmed canonical form of autonyms and species group names
(e.g. `Aus bus bus`). It cuts the infraspecific epithet, leaving only the genus
and the specific epithet. All other data stays the same. This feature might be
useful to match names like `Aus bus` and `Aus bus bus`.
``--stream -s``
: ``GNparser`` can be used from any language using pipe-in/pipe-out of the
command line application. This approach requires sending 1 name at a time
Expand All @@ -427,6 +423,9 @@ achieve that.
``--version -V``
: shows the version number of ``GNparser``.
`` --web-logs``
: requires `--port`. Enables output of logs for web-services.
To parse one name:
```bash
Expand Down
12 changes: 12 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@ type Config struct {
// WithWebLogs flag enables logs when running web-service. This flag is
// ignored if `Port` value is not set.
WithWebLogs bool

// WithSpeciesGroupCut flag means that stemmed version of autonyms (ICN) and
// species group names (ICZN) will be truncated to species. It helps to
// simplify matching names like `Aus bus` and `Aus bus bus`.
WithSpeciesGroupCut bool
}

// Option is a type that has to be returned by all Option functions. Such
Expand Down Expand Up @@ -199,6 +204,13 @@ func OptWithWebLogs(b bool) Option {
}
}

// OptWithSpeciesGroupCut returns an Option that assigns b to the
// WithSpeciesGroupCut field of the Config it is applied to.
func OptWithSpeciesGroupCut(b bool) Option {
	var opt Option = func(c *Config) {
		c.WithSpeciesGroupCut = b
	}
	return opt
}

// NewConfig generates a new Config object. It can take an arbitrary number
// of `Option` functions to modify default configuration settings.
func NewConfig(opts ...Option) Config {
Expand Down
1 change: 1 addition & 0 deletions ent/parser/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type scientificNameNode struct {
ambiguousEpithet string
ambiguousModif string
warnings map[parsed.Warning]struct{}
withSpGroup bool
}

func (p *Engine) newScientificNameNode() {
Expand Down
2 changes: 1 addition & 1 deletion ent/parser/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type Parser interface {
// It contains a method to convert AST into final output.
type ScientificNameNode interface {
// ToOutput converts AST into final output object.
ToOutput(withDetails bool) parsed.Parsed
ToOutput(withDetails, withSpGr bool) parsed.Parsed
}

// nameData is the interface for converting AST to output elements.
Expand Down
26 changes: 23 additions & 3 deletions ent/parser/name.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package parser

import (
"fmt"
"strings"

"github.com/gnames/gnparser/ent/parsed"
"github.com/gnames/gnparser/ent/stemmer"
Expand Down Expand Up @@ -38,17 +39,36 @@ func (sn *scientificNameNode) Normalized() string {
// Canonical returns canonical forms of scientific name. There are
// three forms: Stemmed, the most normalized, Simple, and Full (the least
// normalized).
func (sn *scientificNameNode) Canonical() *parsed.Canonical {
func (sn *scientificNameNode) Canonical(withSpGr bool) *parsed.Canonical {
var res *parsed.Canonical
if sn.nameData == nil {
return res
}
c := sn.canonical()
return &parsed.Canonical{
Stemmed: stemmer.StemCanonical(c.Value, sn.cardinality),
res = &parsed.Canonical{
Stemmed: stemmer.StemCanonical(c.Value),
Simple: c.Value,
Full: c.ValueRanked,
}
if withSpGr && sn.cardinality == 3 {
res.Stemmed = spGrStemmed(res.Stemmed, res.Simple)
}
return res
}

// spGrStemmed shortens a stemmed trinomial to "genus epithet" when the name
// is an autonym/species group name. Both the stemmed and the simple
// canonical forms must consist of exactly three space-separated words whose
// second and third words are identical; otherwise stem is returned unchanged.
func spGrStemmed(stem, simple string) string {
	// repeatsEpithet reports whether the words form a trinomial with the
	// same specific and infraspecific epithet.
	repeatsEpithet := func(words []string) bool {
		return len(words) == 3 && words[1] == words[2]
	}

	stemWords := strings.Split(stem, " ")
	simpleWords := strings.Split(simple, " ")

	// The simple form is checked as well, so that distinct epithets which
	// merely share a stem (e.g. `alba` and `albus`) are not collapsed.
	if repeatsEpithet(stemWords) && repeatsEpithet(simpleWords) {
		return strings.Join(stemWords[:2], " ")
	}
	return stem
}

// Details returns additional details about a scientific name.
Expand Down
5 changes: 3 additions & 2 deletions ent/parser/output.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ import (

// ToOutput converts Abstract Syntax Tree of scientific name to a
// final output object.
func (sn *scientificNameNode) ToOutput(withDetails bool) parsed.Parsed {
func (sn *scientificNameNode) ToOutput(
withDetails, withSpGr bool) parsed.Parsed {
res := parsed.Parsed{
Verbatim: sn.verbatim,
Canonical: sn.Canonical(),
Canonical: sn.Canonical(withSpGr),
Virus: sn.virus,
DaggerChar: sn.daggerChar,
VerbatimID: sn.verbatimID,
Expand Down
53 changes: 47 additions & 6 deletions ent/parser/parser_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package parser_test

import (
"fmt"
"testing"

"github.com/gnames/gnparser/ent/parser"
Expand All @@ -18,7 +19,7 @@ func TestPreNParse(t *testing.T) {
}
for _, v := range testData {
sn := p.PreprocessAndParse(v.name, "test_version", true, false, false, false)
parsed := sn.ToOutput(false)
parsed := sn.ToOutput(false, false)
can := parsed.Canonical
msg := v.name
if v.can == "" {
Expand All @@ -29,7 +30,7 @@ func TestPreNParse(t *testing.T) {
}
}

// TTestToOutput tests ToOutput method of ScientificNameNode
// TestToOutput tests ToOutput method of ScientificNameNode
func TestToOutput(t *testing.T) {
p := parser.New()
testData := []struct {
Expand All @@ -39,21 +40,24 @@ func TestToOutput(t *testing.T) {
{"Pardosa moesta L.", "Pardosa moesta", "L.", false, true},
{
"Bacillus subtilis (Ehrenberg, 1835) Cohn, 1872",
"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872", false, true,
"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872",
false, true,
},
{
"Bacillus subtilis (Ehrenberg, 1835) Cohn, 1872 sec. Miller",
"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872", false, true,
"Bacillus subtilis", "(Ehrenberg 1835) Cohn 1872",
false, true,
},
{
"Aconitum napellus var. formosum (Rchb.) W. D. J. Koch (nom. ambig.)",
"Aconitum napellus formosum", "(Rchb.) W. D. J. Koch", true, true,
"Aconitum napellus formosum", "(Rchb.) W. D. J. Koch",
true, true,
},
{"something", "", "", false, false},
}
for _, v := range testData {
sn := p.PreprocessAndParse(v.name, "test_version", true, false, false, false)
out := sn.ToOutput(v.det)
out := sn.ToOutput(v.det, false)
msg := v.name
if !out.Parsed {
assert.Nil(t, out.Canonical, msg)
Expand All @@ -63,3 +67,40 @@ func TestToOutput(t *testing.T) {
assert.Equal(t, v.au, out.Authorship.Normalized, msg)
}
}

// TestSpecGroupOption checks that the stemmed canonical form of autonyms and
// species group names is cut to genus and specific epithet only when the
// species-group-cut option is passed to ToOutput, and that names with
// non-identical epithets keep all three stemmed words.
func TestSpecGroupOption(t *testing.T) {
	assert := assert.New(t)

	p := parser.New()
	testData := []struct {
		name, stemmed string
		spGrp         bool
	}{
		{"Aus alba alba", "Aus alb alb", false},
		{"Aus alba alba", "Aus alb", true},
		{"Aus alba albus", "Aus alb alb", true},
		{
			"Bacillus subtilis subtilis (Ehrenberg, 1835) Cohn, 1872",
			"Bacillus subtil subtil", false,
		},
		{
			"Bacillus subtilis subtilis (Ehrenberg, 1835) Cohn, 1872",
			"Bacillus subtil", true,
		},
		{
			"Bacillus subtila subtilis (Ehrenberg, 1835) Cohn, 1872",
			"Bacillus subtil subtil", true,
		},
	}
	for _, v := range testData {
		sn := p.PreprocessAndParse(
			v.name, "test_version",
			true, false, false, false,
		)
		out := sn.ToOutput(false, v.spGrp)

		// The same name is tested with both flag values, so the flag is part
		// of the failure message to tell the cases apart.
		msg := fmt.Sprintf("%s (spGrp: %t)", v.name, v.spGrp)

		// Canonical is nil when a name is not parsed; report that as a failed
		// case instead of panicking on the dereference below.
		if !assert.NotNil(out.Canonical, msg) {
			continue
		}
		assert.Equal(v.stemmed, out.Canonical.Stemmed, msg)
	}
}
18 changes: 1 addition & 17 deletions ent/stemmer/stemmer.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ type StemmedWord struct {
// 3. All characters in the string are ASCII with exception of the
// hybrid sign.
// 4. The string always starts with a capitalized word.
func StemCanonical(c string, card int) string {
if card == 3 {
c = normalizeSpGroup(c, card)
}
func StemCanonical(c string) string {
graftChimeraFormulaParts := strings.Split(c, " + ")
for gci, gcv := range graftChimeraFormulaParts {
hybridFormulaParts := strings.Split(gcv, " × ")
Expand Down Expand Up @@ -162,19 +159,6 @@ func StemCanonical(c string, card int) string {
return str.TransliterateDiaereses(strings.Join(graftChimeraFormulaParts, " + "))
}

// normalizeSpGroup drops the last word of a canonical form when card is 3
// and c consists of exactly three space-separated words whose second and
// third words are identical. In every other case c is returned unchanged.
func normalizeSpGroup(c string, card int) string {
	if card == 3 {
		words := strings.Split(c, " ")
		if len(words) == 3 && words[1] == words[2] {
			return strings.Join(words[:2], " ")
		}
	}
	return c
}

// Stem takes a word and, assuming the word is noun, removes its latin suffix
// if such suffix is detected.
func Stem(wrd string) StemmedWord {
Expand Down
27 changes: 13 additions & 14 deletions ent/stemmer/stemmer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,22 @@ func TestStemmer(t *testing.T) {

t.Run("StemCanonical", func(t *testing.T) {
data := []struct {
msg string
in string
out string
card int
msg string
in string
out string
}{
{"Uninomial", "Pomatomus", "Pomatomus", 1},
{"Binomial1", "Betula naturae", "Betula natur", 2},
{"Binomial2", "Betula alba", "Betula alb", 2},
{"Binomial3", "Leptochloöpsis virgata", "Leptochloopsis uirgat", 2},
{"Trinomial", "Betula alba naturae", "Betula alb natur", 3},
{"SpGroup", "Betula alba alba", "Betula alb", 3},
{"SpGroup", "Betula alba albus", "Betula alb alb", 3},
{"GraftChimeraFormula", "Crataegus + Mespilus", "Crataegus + Mespilus", 0},
{"GraftChimeraFormula2", "Cytisus purpureus + Laburnum anagyroides", "Cytisus purpure + Laburnum anagyroid", 0},
{"Uninomial", "Pomatomus", "Pomatomus"},
{"Binomial1", "Betula naturae", "Betula natur"},
{"Binomial2", "Betula alba", "Betula alb"},
{"Binomial3", "Leptochloöpsis virgata", "Leptochloopsis uirgat"},
{"Trinomial", "Betula alba naturae", "Betula alb natur"},
{"SpGroup", "Betula alba alba", "Betula alb alb"},
{"SpGroup", "Betula alba albus", "Betula alb alb"},
{"GraftChimeraFormula", "Crataegus + Mespilus", "Crataegus + Mespilus"},
{"GraftChimeraFormula2", "Cytisus purpureus + Laburnum anagyroides", "Cytisus purpure + Laburnum anagyroid"},
}
for _, v := range data {
assert.Equal(t, v.out, stemmer.StemCanonical(v.in, v.card), v.msg)
assert.Equal(t, v.out, stemmer.StemCanonical(v.in), v.msg)
}
})
}
Expand Down
5 changes: 4 additions & 1 deletion gnparser.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,10 @@ func (gnp gnparser) ParseName(s string) parsed.Parsed {
sciNameNode := gnp.parser.PreprocessAndParse(
s, ver, gnp.cfg.IgnoreHTMLTags, gnp.cfg.WithCapitalization, gnp.cfg.WithCultivars, gnp.cfg.WithPreserveDiaereses,
)
res := sciNameNode.ToOutput(gnp.cfg.WithDetails)
res := sciNameNode.ToOutput(
gnp.cfg.WithDetails,
gnp.cfg.WithSpeciesGroupCut,
)
return res
}

Expand Down
12 changes: 12 additions & 0 deletions gnparser/cmd/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,18 @@ func withPreserveDiaeresesFlag(cmd *cobra.Command) {
}
}

// spGrCutFlag reads the boolean `species-group-cut` flag from cmd. When the
// flag is set, it appends OptWithSpeciesGroupCut(true) to the package-level
// opts. If the flag cannot be read, the error is printed and the process
// exits with status 1.
func spGrCutFlag(cmd *cobra.Command) {
	cut, err := cmd.Flags().GetBool("species-group-cut")
	if err != nil {
		fmt.Println(err)
		os.Exit(1)
	}
	if !cut {
		return
	}
	opts = append(opts, gnparser.OptWithSpeciesGroupCut(true))
}

func withStreamFlag(cmd *cobra.Command) {
withDet, err := cmd.Flags().GetBool("stream")
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions gnparser/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ gnparser -j 5 -p 8080
withEnableCultivarsFlag(cmd)
withPreserveDiaeresesFlag(cmd)
batchSizeFlag(cmd)
spGrCutFlag(cmd)
port := portFlag(cmd)
cfg := gnparser.NewConfig(opts...)
batchSize = cfg.BatchSize
Expand Down Expand Up @@ -171,6 +172,8 @@ func init() {

rootCmd.Flags().BoolP("web-logs", "", false, "enable logs for the web service")

rootCmd.Flags().BoolP("species-group-cut", "", false, "cut autonym/species group names to species for stemmed version")

rootCmd.Flags().StringP("nsqd-tcp", "", "", "an addresss pointing to NSQ TCP service for logs redirection (e.g. 127.0.0.1:4150)")
}

Expand Down
Loading

0 comments on commit 111cc40

Please sign in to comment.