[UPD] Adapted data functionality to align with issue #3 requirements.

Updated framework to handle metadata files with fields like prompts, targets, etc. Users can now prepare their datasets following the specified format.
Authored and committed by Gerard Sant Muniesa, Jan 6, 2025
1 parent 7f0c0ab commit efd635a
Showing 24 changed files with 429 additions and 243 deletions.
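For context, the updated data: sections below point at TSV metadata files whose columns match the preparation scripts added in this commit (source_text, source_prompt, generation_prompt, output_text for the text-based tasks). A minimal, illustrative sketch of building such a file with pandas — the row values and prompt tokens are placeholders, not prescribed by the commit:

import pandas as pd

# Column names taken from the preparation scripts in this commit; row values are illustrative.
metadata = pd.DataFrame({
    "source_text": ["שלום עולם"],
    "source_prompt": ["__he__"],       # placeholder prompt token
    "generation_prompt": ["__en__"],   # placeholder prompt token
    "output_text": ["Hello world"],
})
metadata.to_csv("train_metadata_file.tsv", sep="\t", index=False)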
@@ -59,12 +59,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_split_name: "train"
dev_split_name: "dev"
test_split_name: "devtest"
src_lang: "he"
tgt_lang: "en"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
font_path: "/examples/multimodal_translation/Image2text_translation/other/Arial.ttf"
as_numpy: false
shuffle: True
@@ -0,0 +1,80 @@
import pandas as pd
import os
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Build a TSV metadata file from parallel source/target text files.")
parser.add_argument("sources_file", type=str, help="Path to the file containing source texts.")
parser.add_argument("targets_file", type=str, help="Path to the file containing target texts.")
parser.add_argument("source_prompts", type=str, help="Path to the file containing source prompt texts. If the prompt is fixed, it can be also written here.")
parser.add_argument("generation_prompts", type=str, help="Path to the file containing generation prompt texts. If the prompt is fixed, it can be also written here.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()

def create_dataframe(sources_file, targets_file, source_prompts, generation_prompts):

    # Read the source and target files
    with open(sources_file, 'r', encoding='utf-8') as source_file:
        sources = source_file.read().splitlines()

    with open(targets_file, 'r', encoding='utf-8') as target_file:
        targets = target_file.read().splitlines()

    # Ensure the two files have the same number of lines
    if len(sources) != len(targets):
        raise ValueError("Source and target files must have the same number of lines.")

    if os.path.exists(source_prompts):
        with open(source_prompts, 'r', encoding='utf-8') as prompt_file:
            source_prompt = prompt_file.read().splitlines()
    else:
        source_prompt = [source_prompts] * len(sources)

    if os.path.exists(generation_prompts):
        with open(generation_prompts, 'r', encoding='utf-8') as prompt_file:
            generation_prompts = prompt_file.read().splitlines()
    else:
        generation_prompts = [generation_prompts] * len(sources)

    # Create the DataFrame
    dataframe = pd.DataFrame({
        'source_text': sources,
        'source_prompt': source_prompt,
        'generation_prompt': generation_prompts,
        'output_text': targets
    })

    return dataframe

try:
    data = create_dataframe(
        args.sources_file,
        args.targets_file,
        args.source_prompts,
        args.generation_prompts
    )
except Exception as e:
    # Abort instead of continuing with an undefined `data`
    raise SystemExit(f"Error: {e}")


# Select the desired columns for the new dataset
output_columns = [
    'source_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")




# TODO: finish the script that preprocesses the Hebrew data, then finish checking the dataset classes and the experiment setups in the examples folder
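For reference, a hypothetical invocation of this script (the script name and data file names are illustrative, not taken from the commit):

python prepare_text2text_metadata.py train.he train.en <source_prompt_or_file> <generation_prompt_or_file> train_metadata.tsv

Each prompt argument may be either a path to a file with one prompt per line or a fixed prompt string that is repeated for every row, as handled by the os.path.exists checks above.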
@@ -51,9 +51,7 @@ def main(config_path):
height=dataset_config.preprocess.height,
normalize_image=dataset_config.preprocess.do_normalize,
mean=dataset_config.preprocess.dataset_mean,
std=dataset_config.preprocess.dataset_std,
target_lang_on_source=True,
task_prefixes=[],
std=dataset_config.preprocess.dataset_std,
)

# Save processor and set PROCESSOR_PATH environment variable
@@ -55,7 +55,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
filter_empty_samples: True
shuffle: True
src_lang_tokenizer_path: "/examples/multimodal_translation/pose2text_translation/other/new_languages_how2sign.txt"
@@ -0,0 +1,81 @@
import pandas as pd
import os
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a CSV file.")
parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
parser.add_argument("output_file", type=str, help="Path to the output CSV file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_input_pose(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_start(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_end(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_input_clip(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return "__slt__ __asl__ __en__"

def construct_input_text(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_generation_prompts(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if column does not exist

# Read the input CSV file
data = pd.read_csv(args.input_file, delimiter="\t")

# Create new columns using the placeholder functions
data['input_pose'] = data.apply(construct_input_pose, axis=1)
data['source_start'] = data.apply(construct_source_start, axis=1)
data['source_end'] = data.apply(construct_source_end, axis=1)
data['input_clip'] = data.apply(construct_input_clip, axis=1)
data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['input_text'] = data.apply(construct_input_text, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompts, axis=1)
data['output_text'] = data.apply(construct_output_text, axis=1)

# Example of mapping original columns to new ones
map_column_to_new_field('VIDEO_NAME', 'input_pose', data)
map_column_to_new_field('START', 'source_start', data)
map_column_to_new_field('END', 'source_end', data)
map_column_to_new_field('SENTENCE', 'output_text', data)
map_column_to_new_field('SENTENCE_NAME', 'input_clip', data)


# Select the desired columns for the new dataset
output_columns = [
    'input_pose',
    'source_start',
    'source_end',
    'input_clip',
    'input_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
@@ -25,17 +25,28 @@ def main(config_path):
dataset.as_dataset().save_to_disk(data_path)

m2m_tokenizer = AutoTokenizer.from_pretrained(dataset_config.text_tokenizer_path)
tokenizer = add_new_special_tokens_from_vocab_file(
    tokenizer=copy.deepcopy(m2m_tokenizer),
    vocab_file=dataset_config.src_lang_tokenizer_path,
    output_dir=config.training.output_dir + "/" + config.model.name,
)

tokenizer = m2m_tokenizer

vocab_files = []
if dataset_config.src_lang_tokenizer_path:
    vocab_files.append(dataset_config.src_lang_tokenizer_path)
if dataset_config.new_task_tokens_dictionary_path:
    vocab_files.append(dataset_config.new_task_tokens_dictionary_path)

for i, vocab_file in enumerate(vocab_files):
    output_dir = None
    if i == len(vocab_files) - 1:
        output_dir = f"{config.training.output_dir}/{config.model.name}"
    tokenizer = add_new_special_tokens_from_vocab_file(
        tokenizer=copy.deepcopy(tokenizer),
        vocab_file=vocab_file,
        output_dir=output_dir,
    )

input_processor = Pose2TextTranslationProcessor(
    tokenizer=tokenizer,
    reduce_holistic_poses=True,
    target_lang_on_source=True,
    task_prefixes=[],
)

# Save processor and set PROCESSOR_PATH environment variable
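The new setup code chains one add_new_special_tokens_from_vocab_file call per vocab file (source-language tokens, then task tokens), passing output_dir only on the last iteration, presumably so that only the fully extended tokenizer is persisted. The helper's implementation is not part of this diff; as an assumption, a minimal sketch of what such a helper could look like with the standard transformers tokenizer API (the function name and vocab-file format are hypothetical):

from typing import Optional
from transformers import PreTrainedTokenizerBase

def add_special_tokens_from_vocab_file_sketch(
    tokenizer: PreTrainedTokenizerBase,
    vocab_file: str,
    output_dir: Optional[str] = None,
) -> PreTrainedTokenizerBase:
    # Assumed vocab-file format: one special token per line, e.g. "__asl__" or "__slt__".
    with open(vocab_file, "r", encoding="utf-8") as f:
        new_tokens = [line.strip() for line in f if line.strip()]
    # Register the tokens as additional special tokens; tokens already in the vocabulary keep their ids.
    tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
    if output_dir is not None:
        tokenizer.save_pretrained(output_dir)  # persist only when an output_dir is given
    return tokenizer

Note that the caller in the diff deep-copies the tokenizer before each call, so the pretrained M2M tokenizer object itself is left untouched.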
13 changes: 0 additions & 13 deletions examples/multimodal_translation/run_translation.py
@@ -153,12 +153,6 @@ class ProcessorArguments:
processor_name_or_path: Optional[str] = field(
    default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
task_prefixes: List[str] = field(
    default_factory=list,
    metadata={
        "help": "A list of prefixes to prepend to input text for each task."
    }
)
target_lang_on_source: bool = field(
    default=False,
    metadata={
@@ -299,12 +293,6 @@ class DataTrainingArguments:
)
},
)
insert_langtok_on_target: bool = field(
    default=True,
    metadata={
        "help": "Whether to insert tgt_lang_token to the labels in the loss computation or not."
    },
)

def __post_init__(self):
    if self.dataset_name is None and self.dataset_dir is None and self.train_file is None and self.validation_file is None:
@@ -654,7 +642,6 @@ def preprocess_function(examples):
model=model,
pad_to_multiple_of=8 if training_args.fp16 else None,
label_pad_token_id=label_pad_token_id,
insert_langtok_on_target=data_args.insert_langtok_on_target
)
elif data_args.pad_to_max_length:
    data_collator = default_data_collator
@@ -23,7 +23,7 @@ model:
freeze_lang_embeddings: False
freeze_backbone: True
freeze_lm_head: True
lang_embeddings_vocab_size: 79 # N_new_languages + N_special_tokens. N_special_tokens is normally 4: <bos>, <eos>, <unk>, <pad>
lang_embeddings_vocab_size: 128182 # N_new_languages + N_special_tokens. N_special_tokens is normally 4: <bos>, <eos>, <unk>, <pad>
feature_extractor_cfg: # Specify args to be modified in the feature extractor architecture
feature_extractor_arguments: "<config-arguments>"

@@ -59,10 +59,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_split_name: "<train_file>"
dev_split_name: "<dev_file>"
test_split_name: "<test_file>"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
filter_empty_samples: True
shuffle: True
src_lang_tokenizer_path: "/examples/multimodal_translation/signwritting2text_translation/other/new_languages_sign_bank_plus.txt"
@@ -0,0 +1,55 @@
import pandas as pd
import os
import argparse

from multimodalhugs.custom_datasets import properly_format_signbank_plus

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a CSV file.")
parser.add_argument("metadata_file", type=str, help="Path to the file containing the split metadata.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_source_sequence(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return row['src_lang']  # Replace with the actual implementation or keep empty

def construct_generation_prompt(row):
    return row['tgt_lang']  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if column does not exist

data = properly_format_signbank_plus(args.metadata_file, False)

data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompt, axis=1)

# Example of mapping original columns to new ones
map_column_to_new_field('source', 'source_sequence', data)
map_column_to_new_field('target', 'output_text', data)

# Select the desired columns for the new dataset
output_columns = [
    'source_sequence',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
@@ -36,11 +36,24 @@ def main(config_path):
)

m2m_tokenizer = AutoTokenizer.from_pretrained(dataset_config.text_tokenizer_path)
tokenizer = add_new_special_tokens_from_vocab_file(
    tokenizer=copy.deepcopy(m2m_tokenizer),
    vocab_file=dataset_config.src_lang_tokenizer_path,
    output_dir=config.training.output_dir + "/" + config.model.name,
)

tokenizer = m2m_tokenizer

vocab_files = []
if dataset_config.src_lang_tokenizer_path:
    vocab_files.append(dataset_config.src_lang_tokenizer_path)
if dataset_config.new_task_tokens_dictionary_path:
    vocab_files.append(dataset_config.new_task_tokens_dictionary_path)

for i, vocab_file in enumerate(vocab_files):
    output_dir = None
    if i == len(vocab_files) - 1:
        output_dir = f"{config.training.output_dir}/{config.model.name}"
    tokenizer = add_new_special_tokens_from_vocab_file(
        tokenizer=copy.deepcopy(tokenizer),
        vocab_file=vocab_file,
        output_dir=output_dir,
    )

input_processor = SignwritingProcessor(
    width=dataset_config.preprocess.width,
@@ -51,8 +64,6 @@ def main(config_path):
    dataset_std=dataset_config.preprocess.dataset_std,
    frame_preprocessor=frame_preprocessor,
    tokenizer=tokenizer,
    target_lang_on_source=True,
    task_prefixes=[],
)

# Save processor and set PROCESSOR_PATH environment variable