Skip to content

Commit

Permalink
Merge pull request #140 from allenai/armanc/scitldr
Browse files Browse the repository at this point in the history
Adding summarization datasets, scitldr and xsum
  • Loading branch information
armancohan authored Oct 5, 2023
2 parents cf23bf0 + 176f751 commit cd70f0e
Show file tree
Hide file tree
Showing 12 changed files with 753 additions and 7 deletions.
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Abstract": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "Abstract", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 2738065, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 1073656, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 994876, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/train.jsonl": {"num_bytes": 3155015, "checksum": 
"b222771d387be585cfdf5ae957b36757138415a352e0a3e3b23f73f87c3b1119"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/dev.jsonl": {"num_bytes": 1124865, "checksum": "3191fa98ccc09521332b7a1cd63b1930be4e8df125a235ccd31e40329709525e"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-A/test.jsonl": {"num_bytes": 1204107, "checksum": "fb42dd6cd4f4a1928ae8a01a189456fbfe994a07e938bd49f68653933f6503c9"}}, "download_size": 5483987, "post_processing_size": null, "dataset_size": 4806597, "size_in_bytes": 10290584}, "AIC": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. 
Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": [0, 1], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "ic": {"dtype": "bool_", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "AIC", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 14473822, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 4822026, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 4476237, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/train.jsonl": {"num_bytes": 15569568, "checksum": "64b08af6de479671a12afd04770f66bcbc1c2c5f3098a08392b0fd7c1070d621"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/dev.jsonl": {"num_bytes": 4811551, "checksum": "ac5168c27d25181fc17bb6f1fb41d11dbe30c627bebee14457feb3bad2c839dd"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-AIC/test.jsonl": {"num_bytes": 5163989, "checksum": "7cb9230d3eb4863884762154918360d1c063aa18fc76de928801a14f4bcf4d37"}}, "download_size": 
25545108, "post_processing_size": null, "dataset_size": 23772085, "size_in_bytes": 49317193}, "FullText": {"description": "A new multi-target dataset of 5.4K TLDRs over 3.2K papers.\nSCITLDR contains both author-written and expert-derived TLDRs,\nwhere the latter are collected using a novel annotation protocol\nthat produces high-quality summaries while minimizing annotation burden.\n", "citation": "@article{cachola2020tldr,\n title={{TLDR}: Extreme Summarization of Scientific Documents},\n author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},\n journal={arXiv:2004.15011},\n year={2020},\n}\n", "homepage": "https://github.com/allenai/scitldr", "license": "Apache License 2.0", "features": {"source": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_labels": {"feature": {"num_classes": 2, "names": ["non-oracle", "oracle"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "rouge_scores": {"feature": {"dtype": "float32", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "paper_id": {"dtype": "string", "id": null, "_type": "Value"}, "target": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": {"input": "source", "output": "target"}, "builder_name": "scitldr", "config_name": "FullText", "version": "0.0.0", "splits": {"train": {"name": "train", "num_bytes": 66917363, "num_examples": 1992, "dataset_name": "scitldr"}, "test": {"name": "test", "num_bytes": 20182554, "num_examples": 618, "dataset_name": "scitldr"}, "validation": {"name": "validation", "num_bytes": 18790651, "num_examples": 619, "dataset_name": "scitldr"}}, "download_checksums": {"https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/train.jsonl": {"num_bytes": 71263949, "checksum": 
"e35461c1665cb4f7b46daba6dd5ac3cff03a61eb196e6ce9983edda44d867604"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/dev.jsonl": {"num_bytes": 19111616, "checksum": "11c3fd77a7ec447adc44ca34c0fa41a7ab6bdacdf3b8e15748e6f8b8e4f698bf"}, "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/SciTLDR-FullText/test.jsonl": {"num_bytes": 20528987, "checksum": "1584bd3f5fff5859cb8428cfbacc8d38c671f5fc6a24a8140ea5350cbd86a751"}}, "download_size": 110904552, "post_processing_size": null, "dataset_size": 105890568, "size_in_bytes": 216795120}}
169 changes: 169 additions & 0 deletions catwalk/dependencies/lm_eval/datasets/scitldr/scitldr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset for TLDR: Extreme Summarization of Scientific Documents"""


import json
import os

import datasets


# Column names that form the supervised (input, output) pair.
_SOURCE = "source"
_TARGET = "target"

# BibTeX entry for the SciTLDR paper.
_CITATION = """\
@article{cachola2020tldr,
title={{TLDR}: Extreme Summarization of Scientific Documents},
author={Isabel Cachola and Kyle Lo and Arman Cohan and Daniel S. Weld},
journal={arXiv:2004.15011},
year={2020},
}
"""

# Human-readable summary of the dataset.
_DESCRIPTION = """\
A new multi-target dataset of 5.4K TLDRs over 3.2K papers.
SCITLDR contains both author-written and expert-derived TLDRs,
where the latter are collected using a novel annotation protocol
that produces high-quality summaries while minimizing annotation burden.
"""


_LICENSE = "Apache License 2.0"

# The data files are not bundled with this script; they are fetched from the
# SciTLDR GitHub repository. Every configuration lives in its own folder under
# a common root, so the per-configuration base URLs are derived from that root.
_DATA_ROOT = "https://raw.githubusercontent.com/allenai/scitldr/master/SciTLDR-Data/"
_URLs = {
    config_name: _DATA_ROOT + "SciTLDR-" + folder_suffix + "/"
    for config_name, folder_suffix in (
        ("Abstract", "A"),
        ("AIC", "AIC"),
        ("FullText", "FullText"),
    )
}

# File names of the individual splits inside each configuration folder.
_TRAIN_DATA = "train.jsonl"
_TEST_DATA = "test.jsonl"
_VALID_DATA = "dev.jsonl"


# There are several preprocessing scripts given in the original SciTLDR GitHub repository to preprocess this data.
class Scitldr(datasets.GeneratorBasedBuilder):
    """Dataset for TLDR: Extreme Summarization of Scientific Documents.

    Three configurations are available, selected by name when loading:

    * ``"Abstract"`` -- only the abstract of each paper (default),
    * ``"AIC"`` -- abstract, introduction and conclusion sections,
    * ``"FullText"`` -- the full text of each paper.

    Example::

        data = datasets.load_dataset('scitldr', 'Abstract')
        data = datasets.load_dataset('scitldr', 'AIC')
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="Abstract", description="This part contains only abstracts of the paper"),
        datasets.BuilderConfig(
            name="AIC",
            description="This part contains Abstracts, Introduction and Conclusion (AIC) sections of the paper",
        ),
        datasets.BuilderConfig(name="FullText", description="This part contains the full text of the paper"),
    ]

    # Configuration used when the caller does not name one explicitly.
    DEFAULT_CONFIG_NAME = "Abstract"

    def _info(self):
        """Return the ``DatasetInfo`` (schema and metadata) for the selected configuration.

        All configurations share the same columns; ``"AIC"`` additionally has
        a boolean ``ic`` column and uses different ``source_labels`` names.
        """
        is_aic = self.config.name == "AIC"

        # NOTE(review): the AIC label names are integers rather than strings.
        # They are kept as-is because the accompanying dataset_infos.json
        # records the same names; confirm before changing them.
        label_names = [0, 1] if is_aic else ["non-oracle", "oracle"]

        # Column order is significant: it matches the recorded dataset info.
        columns = {
            "source": datasets.Sequence(datasets.Value("string")),
            "source_labels": datasets.Sequence(datasets.ClassLabel(num_classes=2, names=label_names)),
            "rouge_scores": datasets.Sequence(datasets.Value("float32")),
            "paper_id": datasets.Value("string"),
        }
        if is_aic:
            columns["ic"] = datasets.Value("bool_")
        columns["target"] = datasets.Sequence(datasets.Value("string"))

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(columns),
            # (input, target) pair used when as_supervised=True in builder.as_dataset.
            supervised_keys=(_SOURCE, _TARGET),
            homepage="https://github.com/allenai/scitldr",
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the three JSONL files and return one ``SplitGenerator`` per split.

        Args:
            dl_manager: download manager supplied by the ``datasets`` library;
                its ``download`` method is given a dict of URLs and the result
                is indexed with the same keys.

        Returns:
            A list with the train, test and validation split generators, in
            that order.
        """
        base_url = _URLs[self.config.name]
        downloaded = dl_manager.download(
            {
                "train": base_url + _TRAIN_DATA,
                "valid": base_url + _VALID_DATA,
                "test": base_url + _TEST_DATA,
            }
        )
        # Each split is backed by a single file, so the downloaded path is
        # passed straight through to _generate_examples.
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": downloaded["train"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": downloaded["test"]},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": downloaded["valid"]},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs read from a JSON-lines file.

        Args:
            filepath: path of the ``.jsonl`` file for one split.

        Yields:
            Tuples of the zero-based line index and a dict holding the
            columns declared in ``_info`` for the selected configuration.

        Raises:
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record lacks one of the expected fields.
        """
        include_ic = self.config.name == "AIC"
        shared_fields = ("source", "source_labels", "rouge_scores", "paper_id")

        with open(filepath, encoding="utf-8") as f:
            for id_, row in enumerate(f):
                # Tolerate blank lines (e.g. a trailing empty line) instead of
                # failing in json.loads; keys remain the original line index.
                if not row.strip():
                    continue
                data = json.loads(row)
                example = {field: data[field] for field in shared_fields}
                if include_ic:
                    # Coerce to a real bool to match the "bool_" feature type.
                    example["ic"] = bool(data["ic"])
                example["target"] = data["target"]
                yield id_, example
Empty file.
Loading

0 comments on commit cd70f0e

Please sign in to comment.