Fix Qwen2-Audio-Instruct (#3298)

stanford-crfm · Jan 29, 2025 · f2e8ee1
1 parent 2b1f0d7
commit f2e8ee1
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 2 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -285,6 +285,7 @@ audiolm =
     # For HuggingFace audio datasets
     soundfile~=0.12
     librosa~=0.10
+    einops~=0.7.0
 
     # For LLaMA-Omni
     openai-whisper==20240930

diff --git a/src/helm/clients/audio_language/qwen2_audiolm_client.py b/src/helm/clients/audio_language/qwen2_audiolm_client.py
@@ -98,7 +98,7 @@ def make_request(self, request: Request) -> RequestResult:
         for media_num, media_object in enumerate(request.multimodal_prompt.media_objects):
             if media_object.is_type("audio") and media_object.location:
                 assert media_object.is_local_file, "Only local audio files are supported"
-                query.append({"type": "audio", "audio_loc": media_object.location})
+                query.append({"type": "audio", "audio_url": media_object.location})
 
                 prompt_text += f"<|im_start|>user\nAudio {media_num+1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
             elif media_object.is_type(TEXT_TYPE):
@@ -131,7 +131,7 @@ def do_it() -> Dict[str, Any]:
                                     if element["type"] == "audio":
                                         audios.append(
                                             librosa.load(
-                                                element["audio_loc"],
+                                                element["audio_url"],
                                                 sr=tokenizer.feature_extractor.sampling_rate,
                                             )[0]
                                         )