[UPD] Refactor training setup and update build_model interface

Moved modality-specific training setup scripts from examples/ to multimodalhugs/training_setup/. Added dispatcher script (multimodalhugs_cli/training_setup.py) to route --modality to the proper setup script. Introduced a model registry (multimodalhugs/models/registry.py) for dynamic model selection. Added tokenizer utilities (multimodalhugs/utils/tokenizer_utils.py) to centralize tokenizer extension logic. Updated MultiModalEmbedderModel.build_model() usage and related configuration files. Adjusted pyproject.toml entry points accordingly.
GerrySant · Feb 3, 2025 · 06eb546 · 06eb546
1 parent c179990
commit 06eb546
Show file tree

Hide file tree

Showing 26 changed files with 465 additions and 346 deletions.
diff --git a/examples/multimodal_translation/image2text_translation/README.md b/examples/multimodal_translation/image2text_translation/README.md
@@ -49,7 +49,7 @@ Below is an example of how your metadata file should be structured. Each row rep
 
 **Goal**: Initialize tokenizers, create or load the model, and save paths for easy retrieval.
 
-- **Script**: `image2text_training_setup.py`
+- **Command Line**: `multimodalhugs-setup --modality "image2text"`
 - **Input**: A config file (e.g., `configs/example_config.yaml`) specifying:
   - Model parameters
   - Tokenizer paths
@@ -63,7 +63,7 @@ Below is an example of how your metadata file should be structured. Each row rep
 Run the setup:
 
 ```bash
-python image2text_training_setup.py --config_path /path/to/example_config.yaml
+multimodalhugs-setup --modality "image2text" --config_path </path/to/example_config.yaml>
 ```
 The script will print environment variables (`MODEL_PATH`, `PROCESSOR_PATH`, `DATA_PATH`) that you can export for downstream usage.
 

diff --git a/examples/multimodal_translation/image2text_translation/configs/example_config.yaml b/examples/multimodal_translation/image2text_translation/configs/example_config.yaml
@@ -1,4 +1,5 @@
 model:
+  type: "multimodal_embedder" 
   name: "hebrew_multimodalhugs"                           # The name or identifier of the model configuration.
   feature_extractor_type: "<feature_extractor_type>"     # Type of the feature extractor to use (e.g., "clip" for CLIP).
   pretrained_feature_extractor: "<pretrained-clip-model>" # Pretrained weights/checkpoint for the feature extractor (e.g., "openai/clip-vit-base-patch32").

diff --git a/...multimodal_translation/image2text_translation/example_scripts/hebrew_training_pipeline.sh b/...multimodal_translation/image2text_translation/example_scripts/hebrew_training_pipeline.sh
@@ -50,8 +50,7 @@ python ${REPO_PATH}/multimodalhugs/examples/multimodal_translation/image2text_tr
 # ----------------------------------------------------------
 # 3. Prepare Training Environment
 # ----------------------------------------------------------
-output=$(python ${REPO_PATH}/multimodalhugs/examples/multimodal_translation/image2text_translation/example_scripts/image2text_training_setup.py \
-    --config_path $CONFIG_PATH)
+output=$(multimodalhugs-setup --modality "image2text" --config_path $CONFIG_PATH)
 
 export MODEL_PATH=$(echo "$output" | grep 'MODEL_PATH' | cut -d '=' -f 2)
 export PROCESSOR_PATH=$(echo "$output" | grep 'PROCESSOR_PATH' | cut -d '=' -f 2)

diff --git a/...ultimodal_translation/image2text_translation/example_scripts/image2text_training_setup.py b/...ultimodal_translation/image2text_translation/example_scripts/image2text_training_setup.py
diff --git a/examples/multimodal_translation/pose2text_translation/README.md b/examples/multimodal_translation/pose2text_translation/README.md
@@ -52,7 +52,7 @@ Below is an example of how your metadata file should be structured. Each row rep
 
 **Goal**: Initialize tokenizers, create or load the model, and save paths for easy retrieval.
 
-- **Script**: pose2text_training_setup.py
+- **Command Line**: `multimodalhugs-setup --modality "pose2text"`
 - **Input**: A config file (e.g., `configs/example_config.yaml`) specifying:
   - Model parameters
   - Tokenizer paths
@@ -66,12 +66,10 @@ Below is an example of how your metadata file should be structured. Each row rep
 Run the setup:
 
 ```bash
-python pose2text_training_setup.py --config_path /path/to/pose2text_config.yaml
+multimodalhugs-setup --modality "pose2text" --config_path </path/to/example_config.yaml>
 ```
 The script will print environment variables (`MODEL_PATH`, `PROCESSOR_PATH`, `DATA_PATH`) that you can export for downstream usage.
 
-
-
 ## 3. Launching the Training Process
 **Goal**: Start the full Pose2Text training routine using Hugging Face’s Trainer.
 

diff --git a/examples/multimodal_translation/pose2text_translation/configs/example_config.yaml b/examples/multimodal_translation/pose2text_translation/configs/example_config.yaml
@@ -4,6 +4,7 @@
 
 model:
   # Model-specific settings:
+  type: "multimodal_embedder" 
   name: "how2sign_pose_2_text_model"                      # The identifier or name of your model configuration.
   vl_mapper_type: "linear"                                # Type of visual-language mapper (e.g., "linear" or "adapter").
   vl_mapper_layer_norm_before: true                       # Whether to apply Layer Normalization before the VL mapper.

diff --git a/...ultimodal_translation/pose2text_translation/example_scripts/how2sign_training_pipeline.sh b/...ultimodal_translation/pose2text_translation/example_scripts/how2sign_training_pipeline.sh
@@ -42,9 +42,8 @@ python ${REPO_PATH}/examples/multimodal_translation/pose2text_translation/exampl
 # ----------------------------------------------------------
 # 3. Prepare Training Environment
 # ----------------------------------------------------------
-# This Python script sets up environment variables for the model, processor, etc.
-output=$(python ${REPO_PATH}/examples/multimodal_translation/pose2text_translation/example_scripts/pose2text_training_setup.py \
-    --config_path $CONFIG_PATH)
+# This command line sets up environment variables for the model, processor, etc.
+output=$(multimodalhugs-setup --modality "pose2text" --config_path $CONFIG_PATH)
 
 # Extract environment variables from the Python script’s output
 export MODEL_PATH=$(echo "$output" | grep 'MODEL_PATH' | cut -d '=' -f 2)

diff --git a/.../multimodal_translation/pose2text_translation/example_scripts/pose2sign_training_setup.py b/.../multimodal_translation/pose2text_translation/example_scripts/pose2sign_training_setup.py
diff --git a/examples/multimodal_translation/signwriting2text_translation/README.md b/examples/multimodal_translation/signwriting2text_translation/README.md
@@ -52,7 +52,7 @@ Below is an example of how your metadata file should be structured. Each row rep
 
 **Goal**: Initialize tokenizers, preprocessors, and models, and save their paths for further usage.
 
-- **Script**: [`signwriting2text_training_setup.py`](./example_scripts/signwriting2text_training_setup.py)
+- **Command Line**: `multimodalhugs-setup --modality "signwriting2text"`
 - **Input**: A configuration file (e.g., `configs/example_config.yaml`) specifying:
   - Model parameters
   - Tokenizer paths
@@ -66,7 +66,7 @@ Below is an example of how your metadata file should be structured. Each row rep
 Run the setup script:
 
 ```bash
-python signwriting2text_training_setup.py --config_path /path/to/signwriting_config.yaml
+multimodalhugs-setup --modality "signwriting2text" --config_path </path/to/signwriting_config.yaml>
 ```
 
 The script outputs environment variables (`MODEL_PATH`, `PROCESSOR_PATH`, `DATA_PATH`) for downstream usage.

diff --git a/examples/multimodal_translation/signwriting2text_translation/configs/example_config.yaml b/examples/multimodal_translation/signwriting2text_translation/configs/example_config.yaml
@@ -1,4 +1,5 @@
 model:
+  type: "multimodal_embedder" 
   name: "multimodal_embedder"                          # The model name or identifier.
   feature_extractor_type: "<feature_extractor_type>"   # Type of feature extractor (e.g., "clip" for CLIP).
   pretrained_feature_extractor: "<pretrained-clip-model>"  # Pretrained weights for the feature extractor (e.g., "openai/clip-vit-base-patch32").

diff --git a/...ranslation/signwriting2text_translation/example_scripts/signbankplus_training_pipeline.sh b/...ranslation/signwriting2text_translation/example_scripts/signbankplus_training_pipeline.sh
@@ -42,9 +42,8 @@ python ${REPO_PATH}/examples/multimodal_translation/signwriting2text_translation
 # ----------------------------------------------------------
 # 3. Prepare Training Environment
 # ----------------------------------------------------------
-# This Python script sets up environment variables for the model, processor, etc.
-output=$(python ${REPO_PATH}/examples/multimodal_translation/signwriting2text_translation/example_scripts/signwriting2text_training_setup.py \
-    --config_path $CONFIG_PATH)
+# This command line sets up environment variables for the model, processor, etc.
+output=$(multimodalhugs-setup --modality "signwriting2text" --config_path $CONFIG_PATH)
 
 # Extract environment variables from the Python script’s output
 export MODEL_PATH=$(echo "$output" | grep 'MODEL_PATH' | cut -d '=' -f 2)

diff --git a/...anslation/signwriting2text_translation/example_scripts/signwriting2text_training_setup.py b/...anslation/signwriting2text_translation/example_scripts/signwriting2text_training_setup.py
diff --git a/multimodalhugs/data/__init__.py b/multimodalhugs/data/__init__.py
@@ -4,6 +4,7 @@
     MultimodalMTDataConfig, 
     SignLanguageMTDataConfig, 
     BilingualImage2textMTDataConfig,
+    Pose2TextDataConfig,
 )
 from .datasets.signwriting import SignWritingDataset
 from .datasets.pose2text import Pose2TextDataset