# 8_ser_ui_1.py
import os
import traceback

import gradio as gr
import joblib
import librosa
import numpy as np
import soundfile as sf
from sklearn.preprocessing import StandardScaler


# Placeholder functions (replace with actual implementations)
def noise_reduction(audio):
    # Perform noise reduction (currently a pass-through placeholder)
    print("Noise reduction step")
    return audio
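
# A possible drop-in replacement (a sketch, not part of the original script):
# spectral-gating noise reduction via the optional `noisereduce` package.
# Uncomment and swap in only if noisereduce is installed.
# def noise_reduction(audio, sr=22050):
#     import noisereduce as nr
#     return nr.reduce_noise(y=audio, sr=sr)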

def feature_extraction(audio, sr):
    # Extract 13 MFCCs, then average over time to get one 13-dim vector per chunk
    print("Feature extraction step")
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    return np.mean(mfccs.T, axis=0)

def feature_scaling(features):
    # Scale features. NOTE: fitting a fresh StandardScaler on a single
    # 13-value vector standardizes that clip in isolation; the usual
    # approach is to reuse the scaler that was fit on the training set.
    print("Feature scaling step")
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features.reshape(-1, 1))
    return features_scaled.flatten()
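
# More typical sketch (assumes a scaler fit on the training set was saved
# next to the models; the scaler.pkl path below is hypothetical):
# def feature_scaling(features):
#     scaler = joblib.load("D:/MCA/4th sem/SER3/models/scaler.pkl")
#     return scaler.transform(features.reshape(1, -1)).flatten()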

def split_audio(audio, sr):
    # Split audio into chunks (placeholder: returns the whole signal as one chunk)
    print("Audio splitting step")
    return [audio]
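
# A minimal fixed-length chunking sketch (an assumption; the original
# treats the whole recording as a single chunk):
# def split_audio(audio, sr, chunk_seconds=3):
#     step = int(chunk_seconds * sr)
#     chunks = [audio[i:i + step] for i in range(0, len(audio), step)]
#     return chunks or [audio]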

def load_models():
    # Load the trained SVM classifier and its label encoder from disk
    # (Windows-specific absolute paths; adjust for your environment)
    print("Loading models")
    svm_model = joblib.load("D:/MCA/4th sem/SER3/models/svm_model.pkl")
    label_encoder = joblib.load("D:/MCA/4th sem/SER3/models/label_encoder.pkl")
    return svm_model, label_encoder


def audio_classification(svm_model, label_encoder, features_scaled):
    # Predict an integer class with the SVM, then map it back to an emotion label
    print("Predicting emotion using SVM")
    svm_pred = svm_model.predict(features_scaled.reshape(1, -1))
    svm_emotion = label_encoder.inverse_transform(svm_pred)[0]
    return svm_emotion

def predict_emotion(audio_tuple):
    print("Received audio data and sample rate")
    if audio_tuple is None:
        print("No audio file provided")
        return "No audio file provided"
    print("Audio tuple:", audio_tuple)
    if len(audio_tuple) != 2:
        print("Invalid audio tuple format")
        return "Invalid audio tuple format"
    try:
        # Save the audio data to a temporary file
        temp_folder = "temp"
        if not os.path.exists(temp_folder):
            os.makedirs(temp_folder)
        temp_audio_path = os.path.join(temp_folder, "temp_audio.wav")
        # Gradio's numpy audio type yields (sample_rate, data)
        sample_rate, audio_data = audio_tuple
        # soundfile expects (frames, channels); add a channel axis for mono input
        if audio_data.ndim == 1:
            audio_data = np.expand_dims(audio_data, axis=1)
        sf.write(temp_audio_path, audio_data, sample_rate, subtype='PCM_24')
        print("Audio saved successfully")
        # Reload via librosa (resamples to its 22050 Hz default)
        audio, sr = librosa.load(temp_audio_path)
        print("Audio loaded successfully")
    except Exception as e:
        print("Error processing audio data:")
        traceback.print_exc()  # Print the full traceback
        return f"Error processing audio data: {e}"
    audio = noise_reduction(audio)
    chunks = split_audio(audio, sr)
    svm_model, label_encoder = load_models()
    predictions = []
    for chunk in chunks:
        features = feature_extraction(chunk, sr)
        features_scaled = feature_scaling(features)
        svm_emotion = audio_classification(svm_model, label_encoder, features_scaled)
        predictions.append(svm_emotion)
    # Majority vote across chunk-level predictions
    final_prediction = max(set(predictions), key=predictions.count)
    return final_prediction

# Custom HTML for title and credits
title_with_credit = """
<div style="display: flex; justify-content: space-between; align-items: center;">
    <h1>Speech Emotion Recognition</h1>
    <div style="font-size: 12px; color: gray;">Developed by Dheepan G, 22352018, MCA 2nd year</div>
</div>
"""

# Create the Gradio interface; type="numpy" delivers (sample_rate, ndarray)
input_audio = gr.Audio(label="Upload Audio File", type="numpy")
output_text = gr.Textbox(label="Predicted Emotion")

gr.Interface(
    fn=predict_emotion,
    inputs=input_audio,
    outputs=output_text,
    title=title_with_credit,
    description="Upload an audio file to predict the emotion present in the audio."
).launch()
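
# Running this script starts Gradio's local server (by default at
# http://127.0.0.1:7860); passing share=True to launch() creates a
# temporary public link if one is needed.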