-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathutils.py
128 lines (108 loc) · 3.36 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Useful functions for preprocessing.
"""
__date__ = "August 2019 - October 2020"
import numpy as np
import warnings
from scipy.signal import stft
from scipy.interpolate import interp2d
EPSILON = 1e-12
def get_spec(t1, t2, audio, p, fs=32000, target_freqs=None, target_times=None, \
fill_value=-1/EPSILON, max_dur=None, remove_dc_offset=True):
"""
Norm, scale, threshold, stretch, and resize a Short Time Fourier Transform.
Notes
-----
* ``fill_value`` necessary?
* Look at all references and see what can be simplified.
* Why is a flag returned?
Parameters
----------
t1 : float
Onset time.
t2 : float
Offset time.
audio : numpy.ndarray
Raw audio.
p : dict
Parameters. Must include keys: ...
fs : float
Samplerate.
target_freqs : numpy.ndarray or ``None``, optional
Interpolated frequencies.
target_times : numpy.ndarray or ``None``, optional
Intepolated times.
fill_value : float, optional
Defaults to ``-1/EPSILON``.
max_dur : float, optional
Maximum duration. Defaults to ``None``.
remove_dc_offset : bool, optional
Whether to remove any DC offset from the audio. Defaults to ``True``.
Returns
-------
spec : numpy.ndarray
Spectrogram.
flag : bool
``True``
"""
if max_dur is None:
max_dur = p['max_dur']
if t2 - t1 > max_dur + 1e-4:
message = "Found segment longer than max_dur: " + str(t2-t1) + \
"s, max_dur = " + str(max_dur) + "s"
warnings.warn(message)
s1, s2 = int(round(t1*fs)), int(round(t2*fs))
assert s1 < s2, "s1: " + str(s1) + " s2: " + str(s2) + " t1: " + str(t1) + \
" t2: " + str(t2)
# Get a spectrogram and define the interpolation object.
temp = min(len(audio),s2) - max(0,s1)
if temp < p['nperseg'] or s2 <= 0 or s1 >= len(audio):
return np.zeros((p['num_freq_bins'], p['num_time_bins'])), True
else:
temp_audio = audio[max(0,s1):min(len(audio),s2)]
if remove_dc_offset:
temp_audio = temp_audio - np.mean(temp_audio)
f, t, spec = stft(temp_audio, fs=fs, nperseg=p['nperseg'], \
noverlap=p['noverlap'])
t += max(0,t1)
spec = np.log(np.abs(spec) + EPSILON)
interp = interp2d(t, f, spec, copy=False, bounds_error=False, \
fill_value=fill_value)
# Define target frequencies.
if target_freqs is None:
if p['mel']:
target_freqs = np.linspace(_mel(p['min_freq']), \
_mel(p['max_freq']), p['num_freq_bins'])
target_freqs = _inv_mel(target_freqs)
else:
target_freqs = np.linspace(p['min_freq'], p['max_freq'], \
p['num_freq_bins'])
# Define target times.
if target_times is None:
duration = t2 - t1
if p['time_stretch']:
duration = np.sqrt(duration * max_dur) # stretched duration
shoulder = 0.5 * (max_dur - duration)
target_times = np.linspace(t1-shoulder, t2+shoulder, p['num_time_bins'])
# Then interpolate.
interp_spec = interp(target_times, target_freqs, assume_sorted=True)
spec = interp_spec
# Normalize.
spec -= p['spec_min_val']
spec /= (p['spec_max_val'] - p['spec_min_val'])
spec = np.clip(spec, 0.0, 1.0)
# Within-syllable normalize.
if p['within_syll_normalize']:
spec -= np.quantile(spec, p['normalize_quantile'])
spec[spec<0.0] = 0.0
spec /= np.max(spec) + EPSILON
return spec, True
def _mel(a):
"""https://en.wikipedia.org/wiki/Mel-frequency_cepstrum"""
return 1127 * np.log(1 + a / 700)
def _inv_mel(a):
"""https://en.wikipedia.org/wiki/Mel-frequency_cepstrum"""
return 700 * (np.exp(a / 1127) - 1)
if __name__ == '__main__':
pass
###