Skip to content

Commit

Permalink
[Feature] Support MMLU-CF Benchmark
Browse files — browse the repository at this point in the history
  • Loading branch information
fistyee committed Jan 8, 2025
1 parent 2329a5f commit 93c4411
Showing 1 changed file with 22 additions and 28 deletions.
50 changes: 22 additions & 28 deletions opencompass/datasets/mmlu_cf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,34 +10,28 @@ def load(path: str, name: str):
"""
Loading HuggingFace datasets
"""
try:
# Use HuggingFace's load_dataset method to load the dataset
hf_dataset = load_dataset("microsoft/MMLU-CF")
columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"]
hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep})
splits = ['dev', 'val']
# Use HuggingFace's load_dataset method to load the dataset
hf_dataset = load_dataset("microsoft/MMLU-CF")
columns_to_keep = ["Question", "A", "B", "C", "D", "Answer"]
hf_dataset = hf_dataset.map(lambda x: {key: x[key] for key in columns_to_keep})
splits = ['dev', 'val']

for split in splits:
sub_set = f'{name}_{split}'

# Rename fields here if they don't match the expected names
hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: {
"input": example["Question"],
"A": example["A"],
"B": example["B"],
"C": example["C"],
"D": example["D"],
"target": example["Answer"]
})

# Create a DatasetDict and return it
dataset = DatasetDict({
"dev": hf_dataset[f'{name}_{splits[0]}'],
"test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test'
for split in splits:
sub_set = f'{name}_{split}'

# Rename fields here if they don't match the expected names
hf_dataset[sub_set] = hf_dataset[sub_set].map(lambda example: {
"input": example["Question"],
"A": example["A"],
"B": example["B"],
"C": example["C"],
"D": example["D"],
"target": example["Answer"]
})
return dataset

except Exception as e:
print(f"Failed to load the dataset from HuggingFace: {e}")
print("Please check if the dataset exists and if the network connection is stable.")
raise
# Create a DatasetDict and return it
dataset = DatasetDict({
"dev": hf_dataset[f'{name}_{splits[0]}'],
"test": hf_dataset[f'{name}_{splits[1]}'] # Use 'val' as 'test'
})
return dataset

0 comments on commit 93c4411

Please sign in to comment.