Skip to content

Commit

Permalink
Merge pull request #1 from allenai/kylel/readme
Browse files Browse the repository at this point in the history
README, tests
  • Loading branch information
soldni authored Jun 30, 2023
2 parents 0d9fa2e + b24cec3 commit 79ad7c0
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 6 deletions.
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,50 @@
Data and tools for generating and inspecting OLMo pre-training data.


## Setup

Install Rust
```
curl https://sh.rustup.rs -sSf | sh
```

Install [CMake](https://cmake.org/install/)

* On **Mac OSX** with `brew install cmake`
* On **Linux** with `apt-get install cmake`


Install [OpenSSL](https://www.openssl.org/)

* On **Mac OSX** with `brew install openssl re2`
* On **Linux** with `apt-get install openssl`

Install [Protobuf](https://protobuf.dev/)

* On **Mac OSX** with `brew install protobuf`
* On **Linux** with `apt-get install protobuf-compiler`

Set up and activate a Python environment
```
conda create -n dolma python=3.10
conda activate dolma
```


Install [Maturin](https://www.maturin.rs/)

```
pip install maturin
maturin develop
```


Installing this repository
```
cd dolma
pip install -e .
```


## Citation

If you use this repository, please cite it as:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies = [
"smashed[remote]>=0.20.0",
"presidio_analyzer==2.2.32",
"pycld2==0.41",
"pycld3==0.22",
# "pycld3==0.22",
"fasttext>=0.9.2",
"tokenizers>=0.13.3,<1.0.0",
"omegaconf>=2.3.0",
Expand All @@ -25,6 +25,7 @@ dependencies = [
"detect-secrets==1.4.0",
"termcolor==2.3.0",
"smart-open>=6.3.0",
"nltk==3.8.1"
]
classifiers = [
    "Development Status :: 3 - Alpha",
Expand Down
16 changes: 11 additions & 5 deletions python/dolma/core/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

from .data_types import TextSlice


sent_tokenizer = PunktSentenceTokenizer()


Expand All @@ -37,18 +36,22 @@ def make_variable_name(name: str, remove_multiple_underscores: bool = False) ->
return name


def split_paragraphs(text: str, remove_empty: bool = True) -> List[TextSlice]:
    """
    Split a string into paragraphs. A paragraph is defined as a sequence of zero or more characters,
    followed by a newline character, or a sequence of one or more characters, followed by the end of
    the string.

    Args:
        text: the document to split.
        remove_empty: if True (the default), drop paragraphs that contain only whitespace.

    Returns:
        A list of TextSlice views into ``text``, one per paragraph.
    """
    text_slices = [
        TextSlice(doc=text, start=match.start(), end=match.end())
        for match in re.finditer(r"([^\n]*\n|[^\n]+$)", text)
    ]
    if remove_empty:
        # Keep only slices that contain at least one non-whitespace character.
        text_slices = [text_slice for text_slice in text_slices if text_slice.text.strip()]
    return text_slices



def split_sentences(text: str, remove_empty: bool = True) -> List[TextSlice]:
    """
    Split a string into sentences using the module-level Punkt sentence tokenizer.

    Args:
        text: the document to split.
        remove_empty: if True (the default), drop sentences that contain only whitespace.

    Returns:
        A list of TextSlice views into ``text``, one per sentence.
    """
    if text:
        offsets = [(start, end) for start, end in sent_tokenizer.span_tokenize(text)]
    else:
        # An empty document has no sentences; skip the tokenizer entirely.
        offsets = []

    sentences = [TextSlice(doc=text, start=start, end=end) for (start, end) in offsets]
    if remove_empty:
        # Mirror split_paragraphs: filter out whitespace-only slices. Previously
        # remove_empty=False raised NotImplementedError even though the test
        # suite (test_split_sentences) calls it and expects results.
        return [sentence for sentence in sentences if sentence.text.strip()]
    return sentences
91 changes: 91 additions & 0 deletions tests/python/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Tests for the utils module.
@kylel
"""


from unittest import TestCase

from dolma.core.data_types import DocResult, Document, Span, TextSlice
from dolma.core.utils import make_variable_name, split_paragraphs, split_sentences


class TestUtils(TestCase):
    """Tests for the paragraph/sentence splitting helpers in dolma.core.utils."""

    def _check_slice(self, source: str, piece: TextSlice) -> None:
        # Every TextSlice must view exactly the [start, end) substring of its doc.
        self.assertEqual(source[piece.start : piece.end], piece.text)

    def test_make_variable_name(self):
        # Placeholder: no assertions for make_variable_name yet.
        pass

    def test_split_paragraphs(self):
        text = "This is a paragraph.\nThis is another paragraph.\nThis is a third paragraph."
        pieces = split_paragraphs(text=text)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(len(pieces), 3)
        self.assertEqual(pieces[0].text, "This is a paragraph.\n")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another paragraph.\n")
        self._check_slice(text, pieces[1])

        # With no blank lines present, remove_empty=False yields the same texts.
        unfiltered = split_paragraphs(text=text, remove_empty=False)
        self.assertListEqual([p.text for p in pieces], [p.text for p in unfiltered])

    def test_split_paragraphs_empty(self):
        self.assertEqual(len(split_paragraphs(text="")), 0)

    def test_split_paragraphs_with_newline_and_spaces(self):
        text = "This is a sentence. \n \n This is another sentence.\n\n This is a third sentence."

        filtered = split_paragraphs(text=text)
        self.assertEqual(len(filtered), 3)
        self.assertIsInstance(filtered[0], TextSlice)
        self.assertEqual(filtered[0].text, "This is a sentence. \n")
        self._check_slice(text, filtered[0])
        self.assertEqual(filtered[1].text, " This is another sentence.\n")
        self._check_slice(text, filtered[1])

        # remove_empty=False keeps the whitespace-only lines, so two extra slices appear.
        unfiltered = split_paragraphs(text=text, remove_empty=False)
        self.assertEqual(len(unfiltered), 5)
        self.assertIsInstance(unfiltered[0], TextSlice)
        expected_heads = ["This is a sentence. \n", " \n", " This is another sentence.\n"]
        for index, expected_text in enumerate(expected_heads):
            self.assertEqual(unfiltered[index].text, expected_text)
            self._check_slice(text, unfiltered[index])

    def test_split_sentences(self):
        text = "This is a sentence. This is another sentence. This is a third sentence."

        pieces = split_sentences(text=text)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(len(pieces), 3)
        self.assertEqual(pieces[0].text, "This is a sentence.")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another sentence.")
        self._check_slice(text, pieces[1])

        unfiltered = split_sentences(text=text, remove_empty=False)
        self.assertListEqual([s.text for s in pieces], [s.text for s in unfiltered])

    def test_split_sentences_empty(self):
        self.assertEqual(len(split_sentences(text="")), 0)

    def test_split_sentences_with_newline_and_spaces(self):
        text = "This is a sentence. \n \n This is another sentence.\n\n This is a third sentence."

        pieces = split_sentences(text=text)
        self.assertEqual(len(pieces), 3)
        self.assertIsInstance(pieces[0], TextSlice)
        self.assertEqual(pieces[0].text, "This is a sentence.")
        self._check_slice(text, pieces[0])
        self.assertEqual(pieces[1].text, "This is another sentence.")
        self._check_slice(text, pieces[1])

0 comments on commit 79ad7c0

Please sign in to comment.