[UPD] Adapted data functionality to align with issue #3 requirements. Updated framework to handle metadata files with fields like prompts, targets, etc. Users can now prepare their datasets following the specified format.
Gerard Sant Muniesa authored and committed on Jan 6, 2025
1 parent 7f0c0ab, commit efd635a
Showing 24 changed files with 429 additions and 243 deletions.
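The preprocessing scripts in this commit all converge on the same basic layout: a delimited metadata file with one example per row and task-specific columns. As a minimal sketch (column names taken from the image2text script below; the prompt tokens and text values are purely illustrative), the resulting metadata looks like this:

# Illustrative only: a two-row metadata frame with the columns the
# image2text preprocessing script below writes out. Values are made up.
import pandas as pd

metadata = pd.DataFrame({
    "source_text": ["shalom", "boker tov"],        # source-language text
    "source_prompt": ["__hebrew__", "__hebrew__"], # per-example or fixed prompt
    "generation_prompt": ["__en__", "__en__"],     # prompt given at generation time
    "output_text": ["hello", "good morning"],      # target/reference text
})
metadata.to_csv("metadata.tsv", sep="\t", index=False)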
...translation/Image2text_translation/example_scripts/hebrew_dataset_preprocessing_script.py (80 changes: 80 additions & 0 deletions)
@@ -0,0 +1,80 @@
import argparse
import os
import sys

import pandas as pd

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Build a metadata file from parallel source/target text files.")
parser.add_argument("sources_file", type=str, help="Path to the file containing source texts.")
parser.add_argument("targets_file", type=str, help="Path to the file containing target texts.")
parser.add_argument("source_prompts", type=str, help="Path to the file containing source prompt texts. If the prompt is fixed, the prompt string itself can be passed instead.")
parser.add_argument("generation_prompts", type=str, help="Path to the file containing generation prompt texts. If the prompt is fixed, the prompt string itself can be passed instead.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()


def create_dataframe(sources_file, targets_file, source_prompts, generation_prompts):
    # Read the source and target files
    with open(sources_file, 'r', encoding='utf-8') as source_file:
        sources = source_file.read().splitlines()

    with open(targets_file, 'r', encoding='utf-8') as target_file:
        targets = target_file.read().splitlines()

    # Ensure the two files have the same number of lines
    if len(sources) != len(targets):
        raise ValueError("Source and target files must have the same number of lines.")

    # Each prompt argument is either a path to a file with one prompt per line,
    # or a fixed prompt string that is repeated for every example.
    if os.path.exists(source_prompts):
        with open(source_prompts, 'r', encoding='utf-8') as prompts_file:
            source_prompt_list = prompts_file.read().splitlines()
    else:
        source_prompt_list = [source_prompts] * len(sources)

    if os.path.exists(generation_prompts):
        with open(generation_prompts, 'r', encoding='utf-8') as prompts_file:
            generation_prompt_list = prompts_file.read().splitlines()
    else:
        generation_prompt_list = [generation_prompts] * len(sources)

    # Create the DataFrame
    dataframe = pd.DataFrame({
        'source_text': sources,
        'source_prompt': source_prompt_list,
        'generation_prompt': generation_prompt_list,
        'output_text': targets
    })

    return dataframe


try:
    data = create_dataframe(
        args.sources_file,
        args.targets_file,
        args.source_prompts,
        args.generation_prompts
    )
except Exception as e:
    # Abort here instead of continuing with an undefined `data`
    sys.exit(f"Error: {e}")

# Select the desired columns for the new dataset
output_columns = [
    'source_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")

# TODO: finish the script that preprocesses the Hebrew data, then finish
# checking the dataset classes and the experiment setups in the examples folder
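As a quick smoke test (file names and prompt tokens below are hypothetical, and the script is assumed to sit in the working directory), the prompt arguments can be exercised in literal-string mode, where a string that is not an existing path gets repeated for every row:

# Hypothetical smoke test; file names and prompt tokens are illustrative.
import pathlib
import subprocess

pathlib.Path("sources.he").write_text("shalom\n", encoding="utf-8")
pathlib.Path("targets.en").write_text("hello\n", encoding="utf-8")

# "__hebrew__" and "__en__" are not existing paths, so the script treats
# them as fixed prompts and repeats them for every example.
subprocess.run(
    ["python", "hebrew_dataset_preprocessing_script.py",
     "sources.he", "targets.en", "__hebrew__", "__en__", "metadata.tsv"],
    check=True,
)
print(pathlib.Path("metadata.tsv").read_text(encoding="utf-8"))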
...ranslation/pose2text_translation/example_scripts/how2sign_dataset_preprocessing_script.py (81 changes: 81 additions & 0 deletions)
@@ -0,0 +1,81 @@
import argparse

import pandas as pd

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a How2Sign metadata TSV file.")
parser.add_argument("input_file", type=str, help="Path to the input TSV file.")
parser.add_argument("output_file", type=str, help="Path to the output file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_input_pose(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_start(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_end(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_input_clip(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return "__slt__ __asl__ __en__"

def construct_input_text(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_generation_prompts(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if the column does not exist

# Read the input metadata file (tab-separated)
data = pd.read_csv(args.input_file, delimiter="\t")

# Create new columns using the placeholder functions
data['input_pose'] = data.apply(construct_input_pose, axis=1)
data['source_start'] = data.apply(construct_source_start, axis=1)
data['source_end'] = data.apply(construct_source_end, axis=1)
data['input_clip'] = data.apply(construct_input_clip, axis=1)
data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['input_text'] = data.apply(construct_input_text, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompts, axis=1)
data['output_text'] = data.apply(construct_output_text, axis=1)

# Example of mapping original How2Sign columns onto the new fields
# (these overwrite the placeholder values set above)
map_column_to_new_field('VIDEO_NAME', 'input_pose', data)
map_column_to_new_field('START', 'source_start', data)
map_column_to_new_field('END', 'source_end', data)
map_column_to_new_field('SENTENCE', 'output_text', data)
map_column_to_new_field('SENTENCE_NAME', 'input_clip', data)

# Select the desired columns for the new dataset
output_columns = [
    'input_pose',
    'source_start',
    'source_end',
    'input_clip',
    'input_text',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
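To make the column mapping concrete, here is a minimal single-row sketch (synthetic values; only the How2Sign column names referenced above are assumed) of how a manifest row is renamed into the new fields:

# Synthetic single-row example of the How2Sign column mapping above.
import pandas as pd

row = pd.DataFrame([{
    "VIDEO_NAME": "video_0001",           # copied into input_pose
    "START": 0.0,                          # copied into source_start
    "END": 3.2,                            # copied into source_end
    "SENTENCE_NAME": "video_0001_clip_0",  # copied into input_clip
    "SENTENCE": "Hello, welcome.",         # copied into output_text
}])

for src, dst in [("VIDEO_NAME", "input_pose"), ("START", "source_start"),
                 ("END", "source_end"), ("SENTENCE_NAME", "input_clip"),
                 ("SENTENCE", "output_text")]:
    row[dst] = row[src]

print(row[["input_pose", "source_start", "source_end", "input_clip", "output_text"]])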
...signwriting2text_translation/example_scripts/signbankplus_dataset_preprocessing_script.py (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
import argparse

import pandas as pd

from multimodalhugs.custom_datasets import properly_format_signbank_plus

# Parse command-line arguments
parser = argparse.ArgumentParser(description="Process and transform a SignBank+ metadata file.")
parser.add_argument("metadata_file", type=str, help="Path to the file containing the split metadata.")
parser.add_argument("output_file", type=str, help="Path to the output TSV file.")
args = parser.parse_args()

# Placeholder functions for constructing new fields
def construct_source_sequence(row):
    return ""  # Replace with the actual implementation or keep empty

def construct_source_prompt(row):
    return row['src_lang']  # Replace with the actual implementation or keep empty

def construct_generation_prompt(row):
    return row['tgt_lang']  # Replace with the actual implementation or keep empty

def construct_output_text(row):
    return ""  # Replace with the actual implementation or keep empty

def map_column_to_new_field(original_column, new_column_name, data):
    if original_column in data.columns:
        data[new_column_name] = data[original_column]
    else:
        data[new_column_name] = ""  # Fill with empty if the column does not exist

data = properly_format_signbank_plus(args.metadata_file, False)

data['source_prompt'] = data.apply(construct_source_prompt, axis=1)
data['generation_prompt'] = data.apply(construct_generation_prompt, axis=1)

# Example of mapping original columns to new ones
map_column_to_new_field('source', 'source_sequence', data)
map_column_to_new_field('target', 'output_text', data)

# Select the desired columns for the new dataset
output_columns = [
    'source_sequence',
    'source_prompt',
    'generation_prompt',
    'output_text'
]

# Save the transformed dataset to a new file, determining format by extension
if args.output_file.endswith('.tsv'):
    data[output_columns].to_csv(args.output_file, sep='\t', index=False)
else:
    data[output_columns].to_csv(args.output_file, index=False)

print(f"Transformed dataset saved to {args.output_file}")
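For reference, a minimal sketch of the frame this script expects back from properly_format_signbank_plus, inferred only from the column accesses above; the helper's actual schema is not shown in this commit, so treat the names and values as assumptions:

# Assumed shape, inferred from the apply/mapping calls above; values illustrative.
import pandas as pd

data = pd.DataFrame([{
    "src_lang": "__sgnw__",             # copied into source_prompt
    "tgt_lang": "__en__",               # copied into generation_prompt
    "source": "M518x533S10000478x485",  # SignWriting (FSW) string -> source_sequence
    "target": "hello",                  # reference text -> output_text
}])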