Skip to content

Commit

Permalink
Merge pull request #1 from allenai/kylel/readme
Browse files Browse the repository at this point in the history
README, tests
  • Loading branch information
soldni authored Jun 30, 2023
2 parents 0d9fa2e + b24cec3 commit 79ad7c0
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 6 deletions.
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,50 @@
Data and tools for generating and inspecting OLMo pre-training data.


## Setup

Install Rust
```
curl https://sh.rustup.rs -sSf | sh
```

Install [CMake](https://cmake.org/install/)

* On **Mac OSX** with `brew install cmake`
* On **Linux** with `apt-get install cmake`


Install [OpenSSL](https://www.openssl.org/)

* On **Mac OSX** with `brew install openssl re2`
* On **Linux** with `apt-get install openssl`

Install [Protobuf](https://protobuf.dev/)

* On **Mac OSX** with `brew install protobuf`
* On **Linux** with `apt-get install protobuf-compiler`

Set up and activate a Python environment
```
conda create -n dolma python=3.10
conda activate dolma
```


Install [Maturin](https://www.maturin.rs/)

```
pip install maturin
maturin develop
```


Installing this repository
```
cd dolma
pip install -e .
```


## Citation

If you use this repository, please cite it as:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies = [
"smashed[remote]>=0.20.0",
"presidio_analyzer==2.2.32",
"pycld2==0.41",
"pycld3==0.22",
# "pycld3==0.22",
"fasttext>=0.9.2",
"tokenizers>=0.13.3,<1.0.0",
"omegaconf>=2.3.0",
Expand All @@ -25,6 +25,7 @@ dependencies = [
"detect-secrets==1.4.0",
"termcolor==2.3.0",
"smart-open>=6.3.0",
"nltk==3.8.1"
]
classifiers = [
    "Development Status :: 3 - Alpha",
Expand Down
16 changes: 11 additions & 5 deletions python/dolma/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

from .data_types import TextSlice


sent_tokenizer = PunktSentenceTokenizer()


Expand All @@ -37,18 +36,22 @@ def make_variable_name(name: str, remove_multiple_underscores: bool = False) ->
return name


def split_paragraphs(text: str, remove_empty: bool = True) -> List[TextSlice]:
    """
    Split a string into paragraphs. A paragraph is defined as a sequence of zero or more characters,
    followed by a newline character, or a sequence of one or more characters, followed by the end of
    the string.

    Args:
        text: the document to split.
        remove_empty: if True (the default), drop paragraphs that contain only whitespace.

    Returns:
        A list of TextSlice views into ``text``, one per paragraph.
    """
    text_slices = [
        TextSlice(doc=text, start=match.start(), end=match.end())
        for match in re.finditer(r"([^\n]*\n|[^\n]+$)", text)
    ]
    if remove_empty:
        # Keep only slices that contain at least one non-whitespace character.
        text_slices = [text_slice for text_slice in text_slices if text_slice.text.strip()]
    return text_slices



def split_sentences(text: str, remove_empty: bool = True) -> List[TextSlice]:
    """
    Split a string into sentences using the module-level Punkt sentence tokenizer.

    Args:
        text: the document to split.
        remove_empty: if True (the default), drop sentences that contain only whitespace.

    Returns:
        A list of TextSlice views into ``text``, one per sentence.
    """
    if text:
        offsets = [(start, end) for start, end in sent_tokenizer.span_tokenize(text)]
    else:
        # An empty document has no sentences; skip the tokenizer entirely.
        offsets = []

    sentences = [TextSlice(doc=text, start=start, end=end) for (start, end) in offsets]
    if remove_empty:
        # Mirror split_paragraphs: filter out whitespace-only slices. Previously
        # remove_empty=False raised NotImplementedError even though the test
        # suite (test_split_sentences) calls it and expects results.
        return [sentence for sentence in sentences if sentence.text.strip()]
    return sentences
91 changes: 91 additions & 0 deletions tests/python/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Tests for the utils module.
@kylel
"""


from unittest import TestCase

from dolma.core.data_types import DocResult, Document, Span, TextSlice
from dolma.core.utils import make_variable_name, split_paragraphs, split_sentences


class TestUtils(TestCase):
    """Tests for the paragraph/sentence splitting helpers in dolma.core.utils."""

    def _check_slice(self, source: str, piece: TextSlice) -> None:
        # Every TextSlice must view exactly the [start, end) substring of its doc.
        self.assertEqual(source[piece.start : piece.end], piece.text)

    def test_make_variable_name(self):
        # Placeholder: no assertions for make_variable_name yet.
        pass

    def test_split_paragraphs(self):
        text = "This is a paragraph.\nThis is another paragraph.\nThis is a third paragraph."
        pieces = split_paragraphs(text=text)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(len(pieces), 3)
        self.assertEqual(pieces[0].text, "This is a paragraph.\n")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another paragraph.\n")
        self._check_slice(text, pieces[1])

        # With no blank lines present, remove_empty=False yields the same texts.
        unfiltered = split_paragraphs(text=text, remove_empty=False)
        self.assertListEqual([p.text for p in pieces], [p.text for p in unfiltered])

    def test_split_paragraphs_empty(self):
        self.assertEqual(len(split_paragraphs(text="")), 0)

    def test_split_paragraphs_with_newline_and_spaces(self):
        text = "This is a sentence. \n \n This is another sentence.\n\n This is a third sentence."

        filtered = split_paragraphs(text=text)
        self.assertEqual(len(filtered), 3)
        self.assertIsInstance(filtered[0], TextSlice)
        self.assertEqual(filtered[0].text, "This is a sentence. \n")
        self._check_slice(text, filtered[0])
        self.assertEqual(filtered[1].text, " This is another sentence.\n")
        self._check_slice(text, filtered[1])

        # remove_empty=False keeps the whitespace-only lines, so two extra slices appear.
        unfiltered = split_paragraphs(text=text, remove_empty=False)
        self.assertEqual(len(unfiltered), 5)
        self.assertIsInstance(unfiltered[0], TextSlice)
        expected_heads = ["This is a sentence. \n", " \n", " This is another sentence.\n"]
        for index, expected_text in enumerate(expected_heads):
            self.assertEqual(unfiltered[index].text, expected_text)
            self._check_slice(text, unfiltered[index])

    def test_split_sentences(self):
        text = "This is a sentence. This is another sentence. This is a third sentence."

        pieces = split_sentences(text=text)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(len(pieces), 3)
        self.assertEqual(pieces[0].text, "This is a sentence.")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another sentence.")
        self._check_slice(text, pieces[1])

        unfiltered = split_sentences(text=text, remove_empty=False)
        self.assertListEqual([s.text for s in pieces], [s.text for s in unfiltered])

    def test_split_sentences_empty(self):
        self.assertEqual(len(split_sentences(text="")), 0)

    def test_split_sentences_with_newline_and_spaces(self):
        text = "This is a sentence. \n \n This is another sentence.\n\n This is a third sentence."

        pieces = split_sentences(text=text)
        self.assertEqual(len(pieces), 3)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(pieces[0].text, "This is a sentence.")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another sentence.")
        self._check_slice(text, pieces[1])

0 comments on commit 79ad7c0

Please sign in to comment.