Skip to content

Commit

Permalink
Merge pull request #33 from lfoppiano/extended-api
Browse files Browse the repository at this point in the history
Extends API
  • Loading branch information
lfoppiano authored Oct 25, 2021
2 parents 34002a0 + 442f60a commit 573b8d9
Show file tree
Hide file tree
Showing 21 changed files with 985 additions and 167 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ and [SuperMat](https://github.com/lfoppiano/SuperMat) for superconductors materi

| Space | sentences | tokens |
|-------|---|---|
| ~21Gb | 100001987 (100M) | 3260171825 (3.2B) |
| ~21Gb | 142306511 (142M) | 3286118146 (3.2B) |

- Useful references:
- SciBERT's [cheatsheet](https://github.com/allenai/scibert/blob/master/scripts/cheatsheet.txt).
Expand Down Expand Up @@ -149,8 +149,8 @@ This final task reduce the domain-specific vocabulary to the only terms that are

Starting from a text file containing one paragraph per line, we performed the following operations:

1. Split paragraphs in sentences using [BlingFire](https://github.com/Microsoft/BlingFire), a sentence splitter.
2. Shard the obtained resulted large file, into several smaller (max 250000 sentences).
1. Split paragraphs into sentences using [BlingFire](https://github.com/Microsoft/BlingFire), a sentence splitter ([script](sentence-splitter.py)).
2. Shard the resulting large file into several smaller files (max 250000 sentences each), e.g. ``split --lines=250000 --numeric-suffixes input prefix``.
See [ref](https://github.com/google-research/bert/issues/117).
```
-rw-r--r-- 1 lfoppian0 tdm 50M Aug 12 10:29 SciCorpora+SuperMat.sentences.sharded00
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
## requires Blingfire https://github.com/Microsoft/BlingFire

import argparse
import os
from pathlib import Path

from blingfire import text_to_sentences

if __name__ == '__main__':
    # Command-line entry point: reads the --input file line by line, splits every
    # line into sentences with BlingFire's text_to_sentences and writes each
    # sentence on its own line of the --output file.
    parser = argparse.ArgumentParser(
        description="Sentence segmentation")

    parser.add_argument("--input", help="Input file", required=True)
    parser.add_argument("--output", help="Output file", required=True)

    args = parser.parse_args()

    # Dedicated local names so the built-in input() is not shadowed.
    input_file = args.input
    output_file = args.output

    if os.path.isfile(input_file):
        input_path = Path(input_file)
        output_path = Path(output_file)

        # Explicit UTF-8 so the result does not depend on the platform's default
        # encoding (assumes the corpus is UTF-8 encoded -- TODO confirm).
        with open(output_path, 'w', encoding='utf-8') as fo:
            with open(input_path, 'r', encoding='utf-8') as fi:
                for line in fi:
                    # text_to_sentences returns the sentences separated by "\n";
                    # every piece is written out, including empty ones, exactly
                    # as before.
                    for sentence in text_to_sentences(line).split("\n"):
                        fo.write(sentence + "\n")
    else:
        # The given input is not an existing regular file: show the usage.
        parser.print_help()
75 changes: 75 additions & 0 deletions src/main/java/org/grobid/core/data/ChemicalComposition.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package org.grobid.core.data;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.lang3.ArrayUtils;

import java.util.Map;
import java.util.StringJoiner;

/**
 * Data holder describing a chemical composition: a map of elements, maps of
 * element and amount variables, a formula string and an oxygen deficiency map.
 *
 * <p>The class is annotated with {@code JsonInclude.Include.NON_EMPTY}; three of
 * the members are given explicit snake_case JSON names through
 * {@code @JsonProperty}.
 */
@JsonInclude(JsonInclude.Include.NON_EMPTY)
public class ChemicalComposition {
    /** Elements of the composition (key/value meaning is defined by the producer of this object). */
    private Map<String, String> elements;

    /** Element variables, exposed in JSON as {@code elements_vars}. */
    @JsonProperty("elements_vars")
    private Map<String, Object> elementsVars;

    /** Amount variables, exposed in JSON as {@code amounts_vars}. */
    @JsonProperty("amounts_vars")
    private Map<String, Object> amountsVars;

    /** Formula as a plain string. */
    private String formula;

    /** Oxygen deficiency information, exposed in JSON as {@code oxygen_deficiency}. */
    @JsonProperty("oxygen_deficiency")
    private Map<String, String> oxygenDeficency;

    /** @return the elements map, exactly as it was set (may be {@code null}) */
    public Map<String, String> getElements() {
        return this.elements;
    }

    /** @param elements the elements map to store (kept by reference) */
    public void setElements(Map<String, String> elements) {
        this.elements = elements;
    }

    /** @return the amount variables map, exactly as it was set (may be {@code null}) */
    public Map<String, Object> getAmountsVars() {
        return this.amountsVars;
    }

    /** @param amountsVars the amount variables map to store (kept by reference) */
    public void setAmountsVars(Map<String, Object> amountsVars) {
        this.amountsVars = amountsVars;
    }

    /** @return the element variables map, exactly as it was set (may be {@code null}) */
    public Map<String, Object> getElementsVars() {
        return this.elementsVars;
    }

    /** @param elementsVars the element variables map to store (kept by reference) */
    public void setElementsVars(Map<String, Object> elementsVars) {
        this.elementsVars = elementsVars;
    }

    /** @return the formula string (may be {@code null}) */
    public String getFormula() {
        return this.formula;
    }

    /** @param formula the formula string to store */
    public void setFormula(String formula) {
        this.formula = formula;
    }

    /** @return the oxygen deficiency map, exactly as it was set (may be {@code null}) */
    public Map<String, String> getOxygenDeficency() {
        return this.oxygenDeficency;
    }

    /** @param oxygenDeficency the oxygen deficiency map to store (kept by reference) */
    public void setOxygenDeficency(Map<String, String> oxygenDeficency) {
        this.oxygenDeficency = oxygenDeficency;
    }

    /**
     * Renders all members as {@code ChemicalComposition[name=value, ...]}; the
     * formula is wrapped in single quotes and the maps are rendered through
     * {@code ArrayUtils.toString}.
     */
    @Override
    public String toString() {
        final String separator = ", ";
        final StringBuilder text = new StringBuilder(ChemicalComposition.class.getSimpleName());
        text.append("[");
        text.append("elements=").append(ArrayUtils.toString(this.elements));
        text.append(separator);
        text.append("elementsVars=").append(ArrayUtils.toString(this.elementsVars));
        text.append(separator);
        text.append("amountsVars=").append(ArrayUtils.toString(this.amountsVars));
        text.append(separator);
        text.append("formula='").append(this.formula).append("'");
        text.append(separator);
        text.append("oxygenDeficency=").append(ArrayUtils.toString(this.oxygenDeficency));
        text.append("]");
        return text.toString();
    }
}
35 changes: 35 additions & 0 deletions src/main/java/org/grobid/core/data/Formula.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package org.grobid.core.data;

import java.util.HashMap;
import java.util.Map;

/**
 * A formula given by its raw textual value together with a composition map.
 *
 * <p>The composition map is never {@code null}: when no map (or {@code null}) is
 * supplied, an empty mutable map is used instead, so callers can always iterate
 * the result of {@link #getFormulaComposition()} without a null check.
 */
public class Formula {

    private String rawValue;
    private Map<String, String> formulaComposition = new HashMap<>();

    /**
     * Creates a formula with an empty composition.
     *
     * @param rawValue the raw formula text
     */
    public Formula(String rawValue) {
        this.rawValue = rawValue;
    }

    /**
     * Creates a formula with the given composition.
     *
     * @param rawValue the raw formula text
     * @param formulaComposition the composition map, stored by reference;
     *                           {@code null} is replaced by an empty map
     */
    public Formula(String rawValue, Map<String, String> formulaComposition) {
        this.rawValue = rawValue;
        this.formulaComposition = emptyIfNull(formulaComposition);
    }

    /** @return the composition map, never {@code null} */
    public Map<String, String> getFormulaComposition() {
        return formulaComposition;
    }

    /**
     * Replaces the composition map.
     *
     * @param formulaComposition the new composition map, stored by reference;
     *                           {@code null} is replaced by an empty map
     */
    public void setFormulaComposition(Map<String, String> formulaComposition) {
        this.formulaComposition = emptyIfNull(formulaComposition);
    }

    /** @return the raw formula text (may be {@code null} if it was set so) */
    public String getRawValue() {
        return rawValue;
    }

    /** @param rawValue the raw formula text to store */
    public void setRawValue(String rawValue) {
        this.rawValue = rawValue;
    }

    /** Returns the given map unchanged, or a fresh empty mutable map when it is {@code null}. */
    private static Map<String, String> emptyIfNull(Map<String, String> composition) {
        return composition != null ? composition : new HashMap<>();
    }
}
Loading

0 comments on commit 573b8d9

Please sign in to comment.