[UPD] Adapted data functionality to align with issue #3 requirements.

Updated framework to handle metadata files with fields like prompts, targets, etc. Users can now prepare their datasets following the specified format.
Authored and committed by Gerard Sant Muniesa, Jan 6, 2025
1 parent 7f0c0ab commit efd635a
Showing 24 changed files with 429 additions and 243 deletions.
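For context, the updated data: sections below point at TSV metadata files whose columns match the preparation scripts added in this commit (source_text, source_prompt, generation_prompt, output_text for the text-based tasks). A minimal, illustrative sketch of building such a file with pandas — the row values and prompt tokens are placeholders, not prescribed by the commit:

import pandas as pd

# Column names taken from the preparation scripts in this commit; row values are illustrative.
metadata = pd.DataFrame({
    "source_text": ["שלום עולם"],
    "source_prompt": ["__he__"],       # placeholder prompt token
    "generation_prompt": ["__en__"],   # placeholder prompt token
    "output_text": ["Hello world"],
})
metadata.to_csv("train_metadata_file.tsv", sep="\t", index=False)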
@@ -59,12 +59,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_split_name: "train"
dev_split_name: "dev"
test_split_name: "devtest"
src_lang: "he"
tgt_lang: "en"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
font_path: "/examples/multimodal_translation/Image2text_translation/other/Arial.ttf"
as_numpy: false
shuffle: True
@@ -0,0 +1,80 @@
import pandas as pd
import os
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Build a TSV metadata file from parallel source/target text files.")
parser.add_argument("sources_file", type=str, help="Path to the file containing source texts.")
parser.add_argument("targets_file", type=str, help="Path to the file containing target texts.")
parser.add_argument("source_prompts", type=str, help="Path to the file containing source prompt texts. If the prompt is fixed, it can be also written here.")
parser.add_argument("generation_prompts", type=str, help="Path to the file containing generation prompt texts. If the prompt is fixed, it can be also written here.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()

def create_dataframe(sources_file, targets_file, source_prompts, generation_prompts):

    # Read the source and target files
    with open(sources_file, 'r', encoding='utf-8') as source_file:
        sources = source_file.read().splitlines()

    with open(targets_file, 'r', encoding='utf-8') as target_file:
        targets = target_file.read().splitlines()

    # Ensure the two files have the same number of lines
    if len(sources) != len(targets):
        raise ValueError("Source and target files must have the same number of lines.")

    if os.path.exists(source_prompts):
        with open(source_prompts, 'r', encoding='utf-8') as prompt_file:
            source_prompt = prompt_file.read().splitlines()
    else:
        source_prompt = [source_prompts] * len(sources)

    if os.path.exists(generation_prompts):
        with open(generation_prompts, 'r', encoding='utf-8') as prompt_file:
            generation_prompts = prompt_file.read().splitlines()
    else:
        generation_prompts = [generation_prompts] * len(sources)

    # Create the DataFrame
    dataframe = pd.DataFrame({
        'source_text': sources,
        'source_prompt': source_prompt,
        'generation_prompt': generation_prompts,
        'output_text': targets
    })

    return dataframe

try:
    data = create_dataframe(
        args.sources_file,
        args.targets_file,
        args.source_prompts,
        args.generation_prompts
    )
except Exception as e:
    # Abort instead of continuing with an undefined `data`
    raise SystemExit(f"Error: {e}")


# Select the desired columns for the new dataset
output_columns = [
    'source_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")




# TODO: finish the script that preprocesses the Hebrew data, then finish checking the dataset classes and the experiment setups in the examples folder
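For reference, a hypothetical invocation of this script (the script name and data file names are illustrative, not taken from the commit):

python prepare_text2text_metadata.py train.he train.en <source_prompt_or_file> <generation_prompt_or_file> train_metadata.tsv

Each prompt argument may be either a path to a file with one prompt per line or a fixed prompt string that is repeated for every row, as handled by the os.path.exists checks above.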
@@ -51,9 +51,7 @@ def main(config_path):
height=dataset_config.preprocess.height,
normalize_image=dataset_config.preprocess.do_normalize,
mean=dataset_config.preprocess.dataset_mean,
std=dataset_config.preprocess.dataset_std,
target_lang_on_source=True,
task_prefixes=[],
std=dataset_config.preprocess.dataset_std,
)

# Save processor and set PROCESSOR_PATH environment variable
@@ -55,7 +55,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
filter_empty_samples: True
shuffle: True
src_lang_tokenizer_path: "/examples/multimodal_translation/pose2text_translation/other/new_languages_how2sign.txt"
@@ -0,0 +1,81 @@
import pandas as pd
import os
import argparse

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a CSV file.")
parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
parser.add_argument("output_file", type=str, help="Path to the output CSV file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_input_pose(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_start(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_end(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_input_clip(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return "__slt__ __asl__ __en__"

def construct_input_text(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_generation_prompts(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if column does not exist

# Read the input CSV file
data = pd.read_csv(args.input_file, delimiter="\t")

# Create new columns using the placeholder functions
data['input_pose'] = data.apply(construct_input_pose, axis=1)
data['source_start'] = data.apply(construct_source_start, axis=1)
data['source_end'] = data.apply(construct_source_end, axis=1)
data['input_clip'] = data.apply(construct_input_clip, axis=1)
data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['input_text'] = data.apply(construct_input_text, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompts, axis=1)
data['output_text'] = data.apply(construct_output_text, axis=1)

# Example of mapping original columns to new ones
map_column_to_new_field('VIDEO_NAME', 'input_pose', data)
map_column_to_new_field('START', 'source_start', data)
map_column_to_new_field('END', 'source_end', data)
map_column_to_new_field('SENTENCE', 'output_text', data)
map_column_to_new_field('SENTENCE_NAME', 'input_clip', data)


# Select the desired columns for the new dataset
output_columns = [
    'input_pose',
    'source_start',
    'source_end',
    'input_clip',
    'input_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
@@ -25,17 +25,28 @@ def main(config_path):
dataset.as_dataset().save_to_disk(data_path)

m2m_tokenizer = AutoTokenizer.from_pretrained(dataset_config.text_tokenizer_path)
tokenizer = add_new_special_tokens_from_vocab_file(
    tokenizer=copy.deepcopy(m2m_tokenizer),
    vocab_file=dataset_config.src_lang_tokenizer_path,
    output_dir=config.training.output_dir + "/" + config.model.name,
)

tokenizer = m2m_tokenizer

vocab_files = []
if dataset_config.src_lang_tokenizer_path:
    vocab_files.append(dataset_config.src_lang_tokenizer_path)
if dataset_config.new_task_tokens_dictionary_path:
    vocab_files.append(dataset_config.new_task_tokens_dictionary_path)

for i, vocab_file in enumerate(vocab_files):
    output_dir = None
    if i == len(vocab_files) - 1:
        output_dir = f"{config.training.output_dir}/{config.model.name}"
    tokenizer = add_new_special_tokens_from_vocab_file(
        tokenizer=copy.deepcopy(tokenizer),
        vocab_file=vocab_file,
        output_dir=output_dir,
    )

input_processor = Pose2TextTranslationProcessor(
    tokenizer=tokenizer,
    reduce_holistic_poses=True,
    target_lang_on_source=True,
    task_prefixes=[],
)

# Save processor and set PROCESSOR_PATH environment variable
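The new setup code chains one add_new_special_tokens_from_vocab_file call per vocab file (source-language tokens, then task tokens), passing output_dir only on the last iteration, presumably so that only the fully extended tokenizer is persisted. The helper's implementation is not part of this diff; as an assumption, a minimal sketch of what such a helper could look like with the standard transformers tokenizer API (the function name and vocab-file format are hypothetical):

from typing import Optional
from transformers import PreTrainedTokenizerBase

def add_special_tokens_from_vocab_file_sketch(
    tokenizer: PreTrainedTokenizerBase,
    vocab_file: str,
    output_dir: Optional[str] = None,
) -> PreTrainedTokenizerBase:
    # Assumed vocab-file format: one special token per line, e.g. "__asl__" or "__slt__".
    with open(vocab_file, "r", encoding="utf-8") as f:
        new_tokens = [line.strip() for line in f if line.strip()]
    # Register the tokens as additional special tokens; tokens already in the vocabulary keep their ids.
    tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
    if output_dir is not None:
        tokenizer.save_pretrained(output_dir)  # persist only when an output_dir is given
    return tokenizer

Note that the caller in the diff deep-copies the tokenizer before each call, so the pretrained M2M tokenizer object itself is left untouched.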
13 changes: 0 additions & 13 deletions examples/multimodal_translation/run_translation.py
@@ -153,12 +153,6 @@ class ProcessorArguments:
processor_name_or_path: Optional[str] = field(
    default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
task_prefixes: List[str] = field(
    default_factory=list,
    metadata={
        "help": "A list of prefixes to prepend to input text for each task."
    }
)
target_lang_on_source: bool = field(
    default=False,
    metadata={
@@ -299,12 +293,6 @@ class DataTrainingArguments:
)
},
)
insert_langtok_on_target: bool = field(
    default=True,
    metadata={
        "help": "Whether to insert tgt_lang_token to the labels in the loss computation or not."
    },
)

def __post_init__(self):
    if self.dataset_name is None and self.dataset_dir is None and self.train_file is None and self.validation_file is None:
@@ -654,7 +642,6 @@ def preprocess_function(examples):
model=model,
pad_to_multiple_of=8 if training_args.fp16 else None,
label_pad_token_id=label_pad_token_id,
insert_langtok_on_target=data_args.insert_langtok_on_target
)
elif data_args.pad_to_max_length:
    data_collator = default_data_collator
@@ -23,7 +23,7 @@ model:
freeze_lang_embeddings: False
freeze_backbone: True
freeze_lm_head: True
lang_embeddings_vocab_size: 79 # N_new_languages + N_special_tokens. N_special_tokens is normally 4: <bos>, <eos>, <unk>, <pad>
lang_embeddings_vocab_size: 128182 # N_new_languages + N_special_tokens. N_special_tokens is normally 4: <bos>, <eos>, <unk>, <pad>
feature_extractor_cfg: # Specify args to be modified in the feature extractor architecture
feature_extractor_arguments: "<config-arguments>"

@@ -59,10 +59,9 @@ training:
fp16: True

data:
data_dir: "/path/to/data/directory"
train_split_name: "<train_file>"
dev_split_name: "<dev_file>"
test_split_name: "<test_file>"
train_metadata_dir: "/path/to/train/metadata_file.tsv"
validation_metadata_dir: "/path/to/validation/metadata_file.tsv"
test_metadata_dir: "/path/to/test/metadata_file.tsv"
filter_empty_samples: True
shuffle: True
src_lang_tokenizer_path: "/examples/multimodal_translation/signwritting2text_translation/other/new_languages_sign_bank_plus.txt"
@@ -0,0 +1,55 @@
import pandas as pd
import os
import argparse

from multimodalhugs.custom_datasets import properly_format_signbank_plus

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a CSV file.")
parser.add_argument("metadata_file", type=str, help="Path to the file containing the split metadata.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_source_sequence(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return row['src_lang']  # Replace with the actual implementation or keep empty

def construct_generation_prompt(row):
    return row['tgt_lang']  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if column does not exist

data = properly_format_signbank_plus(args.metadata_file, False)

data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompt, axis=1)

# Example of mapping original columns to new ones
map_column_to_new_field('source', 'source_sequence', data)
map_column_to_new_field('target', 'output_text', data)

# Select the desired columns for the new dataset
output_columns = [
    'source_sequence',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
@@ -36,11 +36,24 @@ def main(config_path):
)

m2m_tokenizer = AutoTokenizer.from_pretrained(dataset_config.text_tokenizer_path)
tokenizer = add_new_special_tokens_from_vocab_file(
    tokenizer=copy.deepcopy(m2m_tokenizer),
    vocab_file=dataset_config.src_lang_tokenizer_path,
    output_dir=config.training.output_dir + "/" + config.model.name,
)

tokenizer = m2m_tokenizer

vocab_files = []
if dataset_config.src_lang_tokenizer_path:
    vocab_files.append(dataset_config.src_lang_tokenizer_path)
if dataset_config.new_task_tokens_dictionary_path:
    vocab_files.append(dataset_config.new_task_tokens_dictionary_path)

for i, vocab_file in enumerate(vocab_files):
    output_dir = None
    if i == len(vocab_files) - 1:
        output_dir = f"{config.training.output_dir}/{config.model.name}"
    tokenizer = add_new_special_tokens_from_vocab_file(
        tokenizer=copy.deepcopy(tokenizer),
        vocab_file=vocab_file,
        output_dir=output_dir,
    )

input_processor = SignwritingProcessor(
    width=dataset_config.preprocess.width,
@@ -51,8 +64,6 @@ def main(config_path):
    dataset_std=dataset_config.preprocess.dataset_std,
    frame_preprocessor=frame_preprocessor,
    tokenizer=tokenizer,
    target_lang_on_source=True,
    task_prefixes=[],
)

# Save processor and set PROCESSOR_PATH environment variable