-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #140 from allenai/armanc/scitldr
Adding summarization datasets, scitldr and xsum
- Loading branch information
Showing
12 changed files
with
753 additions
and
7 deletions.
There are no files selected for viewing
Empty file.
1 change: 1 addition & 0 deletions
1
catwalk/dependencies/lm_eval/datasets/scitldr/dataset_infos.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": 
"b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. 
Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, "download_size": 
25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": 
"e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}} |
169 changes: 169 additions & 0 deletions
169
catwalk/dependencies/lm_eval/datasets/scitldr/scitldr.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
# coding=utf-8 | ||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
"""Dataset for TLDR: Extreme Summarization of Scientific Documents""" | ||
|
||
|
||
import json | ||
import os | ||
|
||
import datasets | ||
|
||
|
||
# Column names used as the (input, output) pair for `supervised_keys`.
_SOURCE = "source"
_TARGET = "target"

_CITATION = """\
@article{cachola2020tldr,
  title={{TLDR}: Extreme Summarization of Scientific Documents},
  author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},
  journal={arXiv:2004.15011},
  year={2020},
}
"""

_DESCRIPTION = """\
A new multi-target dataset of 5.4K TLDRs over 3.2K papers.
SCITLDR contains both author-written and expert-derived TLDRs,
where the latter are collected using a novel annotation protocol
that produces high-quality summaries while minimizing annotation burden.
"""

_LICENSE = "Apache License 2.0"

# Base URL of the raw data directory for each builder configuration.
# The data files are not bundled with this script; they are downloaded from
# the SciTLDR GitHub repository. Each value ends with "/" so that a split
# file name can be appended directly.
_URLs = {
    "Abstract": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/",
    "AIC": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/",
    "FullText": "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/",
}

# File names of the individual splits, relative to a configuration's base URL.
_TRAIN_DATA = "train.jsonl"
_TEST_DATA = "test.jsonl"
_VALID_DATA = "dev.jsonl"
|
||
|
||
# There are several preprocessing scripts given in the original SciTLDR GitHub repository to preprocess this data. | ||
class Scitldr(datasets.GeneratorBasedBuilder):
    """Dataset builder for TLDR: Extreme Summarization of Scientific Documents.

    Three configurations are defined; they differ in which part of each paper
    is provided as the source text:

    * ``Abstract`` -- only the abstract of the paper (default)
    * ``AIC`` -- abstract, introduction and conclusion sections
    * ``FullText`` -- the full text of the paper

    Example::

        data = datasets.load_dataset('scitldr', 'Abstract')
        data = datasets.load_dataset('scitldr', 'AIC')
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="Abstract", description="This part contains only abstracts of the paper"),
        datasets.BuilderConfig(
            name="AIC",
            description="This part contains Abstracts, Introduction and Conclusion (AIC) sections of the paper",
        ),
        datasets.BuilderConfig(name="FullText", description="This part contains the full text of the paper"),
    ]

    DEFAULT_CONFIG_NAME = "Abstract"

    def _info(self):
        """Return the ``DatasetInfo`` (features and metadata) for the selected config.

        All configurations share the same columns, except that ``AIC`` adds a
        boolean ``ic`` column (placed just before ``target``) and declares its
        ``source_labels`` class names as the integers ``0``/``1`` rather than
        ``"non-oracle"``/``"oracle"``.
        """
        is_aic = self.config.name == "AIC"

        # NOTE: the integer label names for AIC are kept as-is so the schema
        # stays consistent with the dataset_infos.json shipped next to this
        # script.
        label_names = [0, 1] if is_aic else ["non-oracle", "oracle"]

        # Insertion order defines the column order, so "ic" is added before
        # "target" to keep the original layout.
        feature_spec = {
            "source": datasets.Sequence(datasets.Value("string")),
            "source_labels": datasets.Sequence(datasets.ClassLabel(num_classes=2, names=label_names)),
            "rouge_scores": datasets.Sequence(datasets.Value("float32")),
            "paper_id": datasets.Value("string"),
        }
        if is_aic:
            feature_spec["ic"] = datasets.Value("bool_")
        feature_spec["target"] = datasets.Sequence(datasets.Value("string"))

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(feature_spec),
            # (input, target) pair used when as_supervised=True in builder.as_dataset.
            supervised_keys=(_SOURCE, _TARGET),
            homepage="https://github.com/allenai/scitldr",
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the split files of the selected config and return SplitGenerators.

        Args:
            dl_manager: download manager whose ``download`` method is given a
                dict of URLs and whose result is indexed by the same keys.

        Returns:
            A list with the train, test and validation ``SplitGenerator`` objects,
            each passing the downloaded file path to ``_generate_examples``.
        """
        base_url = _URLs[self.config.name]
        downloaded = dl_manager.download(
            {
                "train": base_url + _TRAIN_DATA,
                "valid": base_url + _VALID_DATA,
                "test": base_url + _TEST_DATA,
            }
        )
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded["test"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded["valid"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs read from a JSON-lines file.

        Args:
            filepath: path of a UTF-8 encoded file holding one JSON object per line.

        Yields:
            The zero-based line index together with a dict containing the
            ``source``, ``source_labels``, ``rouge_scores``, ``paper_id`` and
            ``target`` fields of that line; for the ``AIC`` config the dict also
            contains ``ic`` coerced to ``bool``.
        """
        include_ic = self.config.name == "AIC"
        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                data = json.loads(row)
                example = {
                    "source": data["source"],
                    "source_labels": data["source_labels"],
                    "rouge_scores": data["rouge_scores"],
                    "paper_id": data["paper_id"],
                }
                if include_ic:
                    example["ic"] = bool(data["ic"])
                example["target"] = data["target"]
                yield id_, example
Empty file.
Oops, something went wrong.