Merge pull request #140 from allenai/armanc/scitldr

Adding summarization datasets, scitldr and xsum
allenai · Oct 5, 2023 · cd70f0e · cd70f0e
2 parents cf23bf0 + 176f751
commit cd70f0e
Show file tree

Hide file tree

Showing 12 changed files with 753 additions and 7 deletions.
diff --git a/catwalk/dependencies/lm_eval/datasets/scitldr/__init__.py b/catwalk/dependencies/lm_eval/datasets/scitldr/__init__.py
diff --git a/catwalk/dependencies/lm_eval/datasets/scitldr/dataset_infos.json b/catwalk/dependencies/lm_eval/datasets/scitldr/dataset_infos.json
@@ -0,0 +1 @@
+{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": "b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, "download_size": 25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n  title={{TLDR}: Extreme Summarization of Scientific Documents},\n  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n  journal={arXiv:2004.15011},\n  year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": "e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}
diff --git a/catwalk/dependencies/lm_eval/datasets/scitldr/scitldr.py b/catwalk/dependencies/lm_eval/datasets/scitldr/scitldr.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dataset for TLDR: Extreme Summarization of Scientific Documents"""
+
+
+import json
+import os
+
+import datasets
+
+
+_SOURCE = "source"
+_TARGET = "target"
+
+_CITATION = """\
+@article{cachola2020tldr,
+  title={{TLDR}: Extreme Summarization of Scientific Documents},
+  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},
+  journal={arXiv:2004.15011},
+  year={2020},
+}
+"""
+
+_DESCRIPTION = """\
+A new multi-target dataset of 5.4K TLDRs over 3.2K papers.
+SCITLDR contains both author-written and expert-derived TLDRs,
+where the latter are collected using a novel annotation protocol
+that produces high-quality summaries while minimizing annotation burden.
+"""
+
+
+_LICENSE = "Apache License 2.0"
+
+# TODO: Add link to the official dataset URLs here
+# The HuggingFace dataset library don't host the datasets but only point to the original files
+# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
+_URLs = {
+    "Abstract": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/",
+    "AIC": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/",
+    "FullText": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/",
+}
+
+_TRAIN_DATA = "train.jsonl"
+_TEST_DATA = "test.jsonl"
+_VALID_DATA = "dev.jsonl"
+
+
+# There are several preprocessing scripts given in the original SciTLDR GitHub repository to preprocess this data.
+class Scitldr(datasets.GeneratorBasedBuilder):
+    """Dataset for TLDR: Extreme Summarization of Scientific Documents."""
+
+    VERSION = datasets.Version("1.1.0")
+
+    # You will be able to load one or the other configurations in the following list with
+    # data = datasets.load_dataset('scitldr', 'Abstract')
+    # data = datasets.load_dataset('scitldr', 'AIC')
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(name="Abstract", description="This part contains only abstracts of the paper"),
+        datasets.BuilderConfig(
+            name="AIC",
+            description="This part contains Abstracts, Introduction and Conclusion (AIC) sections of the paper",
+        ),
+        datasets.BuilderConfig(name="FullText", description="This part contains the full text of the paper"),
+    ]
+
+    DEFAULT_CONFIG_NAME = (
+        "Abstract"  # It's not mandatory to have a default configuration. Just use one if it make sense.
+    )
+
+    def _info(self):
+        if self.config.name == "AIC":  # This is the name of the configuration selected in BUILDER_CONFIGS above
+            features = datasets.Features(
+                {
+                    "source": datasets.Sequence(datasets.Value("string")),
+                    "source_labels": datasets.Sequence(datasets.ClassLabel(num_classes=2, names=[0, 1])),
+                    "rouge_scores": datasets.Sequence(datasets.Value("float32")),
+                    "paper_id": datasets.Value("string"),
+                    "ic": datasets.Value("bool_"),
+                    "target": datasets.features.Sequence(datasets.Value("string"))
+                    # These are the features of your dataset like images, labels ...
+                }
+            )
+        else:
+            features = datasets.Features(
+                {
+                    "source": datasets.Sequence(datasets.Value("string")),
+                    "source_labels": datasets.Sequence(
+                        datasets.ClassLabel(num_classes=2, names=["non-oracle", "oracle"])
+                    ),
+                    "rouge_scores": datasets.Sequence(datasets.Value("float32")),
+                    "paper_id": datasets.Value("string"),
+                    "target": datasets.Sequence(datasets.Value("string"))
+                    # These are the features of your dataset like images, labels ...
+                }
+            )
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # This defines the different columns of the dataset and their types
+            features=features,  # Here we define them above because they are different between the two configurations
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=(_SOURCE, _TARGET),
+            # Homepage of the dataset for documentation
+            homepage="https://github.com/allenai/scitldr",
+            # License for the dataset if available
+            license=_LICENSE,
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        urls = {
+            "train": _URLs[self.config.name] + _TRAIN_DATA,
+            "valid": _URLs[self.config.name] + _VALID_DATA,
+            "test": _URLs[self.config.name] + _TEST_DATA,
+        }
+        data_dir = dl_manager.download(urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={"filepath": os.path.join(data_dir["train"])},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={"filepath": os.path.join(data_dir["test"])},
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={"filepath": os.path.join(data_dir["valid"])},
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        """Yields examples."""
+        with open(filepath, encoding="utf-8") as f:
+            for id_, row in enumerate(f):
+                data = json.loads(row)
+                if self.config.name == "AIC":
+                    yield id_, {
+                        "source": data["source"],
+                        "source_labels": data["source_labels"],
+                        "rouge_scores": data["rouge_scores"],
+                        "paper_id": data["paper_id"],
+                        "ic": True if data["ic"] else False,
+                        "target": data["target"],
+                    }
+                else:
+                    yield id_, {
+                        "source": data["source"],
+                        "source_labels": data["source_labels"],
+                        "rouge_scores": data["rouge_scores"],
+                        "paper_id": data["paper_id"],
+                        "target": data["target"],
+                    }
diff --git a/catwalk/dependencies/lm_eval/datasets/xsum/__init__.py b/catwalk/dependencies/lm_eval/datasets/xsum/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": "b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, "download_size": 25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": "e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}