-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSpeechDetect.m
97 lines (75 loc) · 3.03 KB
/
SpeechDetect.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
% 1. IS
% 2. amp1,amp2
% 2. zrc2
clear all; clc; close all;
% Adaptive noise cancellation (LMS) of a speech recording.
% The original header calls this an "MMSE filter"; the loop below is the
% sample-by-sample LMS update w = w + mu*e*x.
%--- desired signal-------------------
[s,fs]= audioread('bluesky1.wav');
s= s(:,1);                      % use the first channel if the file is multi-channel
T= 4; % time duration to denoise (less than the duration of input signal)
T_start= 0;
N= round(T*fs);                 % number of samples to process (rounded so it is a valid index)
n1= max(round(T_start*fs),1);   % first sample index (MATLAB indices start at 1)
time=(0:N-1)/fs;                % time axis in seconds for the N processed samples
if numel(s) < n1+N-1
    error('SpeechDetect:shortInput', ...
        'Speech file has %d samples but %d are required.', numel(s), n1+N-1);
end
s=s/max(abs(s)); % Normalization to unit peak amplitude (over the whole file)
s= s(n1:(n1+N-1));
%----LTI system setting-------
% h models the path from the noise source to the primary input:
% a pure delay of `delay` seconds followed by a gain of 4.
delay= 0.01;
h= [ zeros(round(delay*fs),1); 4];   % round(): delay*fs may not be an exact integer
L= length(h);
%--- noise signals------------
% Read the noise with its own sample-rate variable so the speech rate `fs`
% (used for the time axis and the framing later on) is not overwritten.
[v,fs_v]= audioread('crowdtalking2_16k.wav');
v= v(:,1);                      % use the first channel if the file is multi-channel
if fs_v ~= fs
    % Bring the noise to the speech sample rate before mixing.
    v= resample(v, fs, fs_v);
end
if numel(v) < n1+N+L-1
    error('SpeechDetect:shortNoise', ...
        'Noise file has %d samples but %d are required.', numel(v), n1+N+L-1);
end
v= v(n1:(n1+N+L-1));
v1= conv(h,v); v1= v1((L+1):(L+N));  % noise as it appears in the primary input
v2= v((L+1):(L+N));                  % noise as picked up by the reference input
%---the primary input and reference signals-------
x= v2;          % reference input: noise only
d= s + v1;      % primary input: speech plus filtered noise
%---- adaptive filter---------------------------
M= 500; %change the filter size
mu= 0.1; %change the iterate parameter
% NOTE(review): stability of the update depends on mu relative to M and the
% power of x; reduce mu if the output diverges.
e= zeros(N,1);      % error signal = estimate of the clean speech
y= zeros(N,1);      % filter output = estimate of the noise in d
w= zeros(M,1);      % filter weights
x_vec= zeros(M,1);  % most recent M reference samples, newest first
for i=1:N
    x_vec= [ x(i); x_vec(1:M-1) ];   % shift in the newest reference sample
    y(i)= w'*x_vec;                  % current noise estimate
    e(i)= d(i)- y(i);                % subtract it from the primary input
    w= w + mu*e(i)*x_vec;            % LMS weight update
end
% snr(signal,noise) already returns decibels, so no further db() conversion
% is applied. The SNR is measured against the clean speech s:
%   before: noise is v1;  after: noise is the residual e - s (= v1 - y).
snr_in=  snr(s, v1);
snr_out= snr(s, e - s);
fprintf('%6.2f %6.2f \n', snr_in, snr_out);
% Visualization -----------------------------------------
% Figure 1: the noisy input (top) against the filtered output (bottom).
figure(1);
subplot(2,1,1);
plot(time, d);      % speech signal with noise
subplot(2,1,2);
plot(time, e);      % speech signal after the adaptive filter
% Figure 2: the filtered signal alone, with fixed axes.
figure(2);
plot(time, e, 'b');
xlabel('Time/s');
ylabel('Amplitude');
title('Speech Signal End Point Detection');
axis([0 max(time) -1 1]);
grid;
% Endpoint Detection
wlen=500; inc=100;                      % framing parameters: frame length and frame shift (samples)
IS=0.2;                                 % set IS (seconds); presumably the leading noise-only interval - TODO confirm
NIS=fix((IS*fs-wlen)/inc +1);           % number of frames that fit in the first IS seconds
fn=fix((N-wlen)/inc)+1;                 % total number of frames
frameTime=frame2time(fn, wlen, inc, fs);% time corresponding to each frame
[zcr,amp,voiceseg,vsl,SF,NF]=SpeechSegment(e,wlen,inc,NIS); % endpoint detection on the denoised signal
% Output:
% zcr: zero-crossing rate (ZCR) per frame
% amp: short-time energy (STE) per frame
% voiceseg: struct array with fields begin, end, duration for each speech segment
% vsl: total number of speech segments in the recording
% SF: array of length fn with speech frames labeled 1
% NF: array of length fn with non-speech frames labeled 1

% Draw the start/end markers on the waveform in figure 2. The figure is
% selected explicitly so the lines land on it even if the call above
% changed the current figure.
figure(2);
for k=1 : vsl
    nx1=voiceseg(k).begin; nx2=voiceseg(k).end;
    nxl=voiceseg(k).duration;
    fprintf('%4d %4d %4d %4d\n',k,nx1,nx2,nxl);
    line([frameTime(nx1) frameTime(nx1)],[-1 1],'color','r','LineStyle','-');   % segment start: solid
    line([frameTime(nx2) frameTime(nx2)],[-1 1],'color','r','LineStyle','--');  % segment end: dashed
end
figure(3); % short-time energy and zero-crossing rate of the signal, used as a basis for parameter tuning
subplot 211; plot(amp);title('Short-time Energy per frame')
subplot 212; plot(zcr);title('Short-time Zero Crossing Rate per frame');