# segmentation.py
from .utilities import self_tempo_estimation
from librosa import core
from librosa import feature
import matplotlib.pyplot as plt
from madmom.features.downbeats import DBNDownBeatTrackingProcessor as downbeattrack
from madmom.features.downbeats import RNNDownBeatProcessor as beatrnn
import essentia.standard as std
import numpy as np
from scipy.spatial.distance import cdist
import scipy.stats as st
import sys
from pydub import AudioSegment
# Smallest representable positive float increment; used to keep log2 / division
# arguments strictly positive.
eps = np.finfo(float).eps
def hz_to_pitch(hz_spectrums, sr):
"""
Get a spectrogram in hz and return a spectrogram in pitch
:param hz_spectrums: The freq spectrum
:param sr: The sample rate of the spectrum
:return: A pitch-spectrogram
"""
pitch_spectrums = []
freq_scale = np.fft.fftfreq(hz_spectrums.shape[0])
for hz_spectrum in hz_spectrums:
pitch_spectrum = np.zeros(int(12*np.log2(int(sr/2)/440)) + 57) # sr/2 is the maximum
for freq in range(1, len(hz_spectrum)):
pitch_spectrum[int(12*np.log2(freq_scale[freq]+eps/440)) + 57] += hz_spectrum[freq]
pitch_spectrums.append(pitch_spectrum/max(pitch_spectrum))
return np.array(pitch_spectrums).transpose()
def get_beat_sync_chroma_and_spectrum(audio, sr=None, bpm=None):
    """
    Compute a beat-synchronous chromagram and a beat-synchronous
    3-band energy spectrum in one pass.

    :param audio: Path to the song, or a mono numpy array of samples
    :param sr: Sample rate, used only when ``audio`` is a numpy array
        (file paths are always decoded at 44100 Hz)
    :param bpm: Optional precalculated tempo, forwarded to
        ``self_tempo_estimation``
    :return: Tuple ``(beat_sync_chroma, beat_sync_spec)``: a (12, n_beats)
        chroma matrix and a (3, n_beats) RMS-energy matrix
    """
    if not isinstance(audio, np.ndarray):
        sr = 44100
        # fix: essentia's parameter is camelCase `sampleRate`; the lowercase
        # `samplerate` keyword is rejected at configuration time.
        y = std.MonoLoader(filename=audio, sampleRate=44100)()
    else:
        y = audio
    # Equal-loudness filtering so the band energies match perception better.
    eql_y = std.EqualLoudness()(y)
    tempo, framed_dbn = self_tempo_estimation(y, sr, tempo=bpm)
    # Close the last segment with the track end time.
    # NOTE(review): the original only does this when the beat count is a
    # multiple of 4 — presumably to complete the final bar; confirm intent.
    if framed_dbn.shape[0] % 4 == 0:
        framed_dbn = np.append(framed_dbn, np.array(len(y) / sr))
    # Low / mid / high frequency bands, in Hz.
    band1 = (0, 220)
    band2 = (220, 1760)
    band3 = (1760, sr / 2)
    band1list = []
    band2list = []
    band3list = []
    chromas = []
    for i in range(1, len(framed_dbn)):
        # RMS energy per band from the FFT of the equal-loudness signal.
        fft_eq = abs(np.fft.fft(eql_y[int(framed_dbn[i - 1] * sr):int(framed_dbn[i] * sr)]))
        freqs = np.fft.fftfreq(len(fft_eq), 1 / sr)
        band1list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band1[0], freqs < band1[1]))]**2))))
        band2list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band2[0], freqs < band2[1]))]**2))))
        band3list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band3[0], freqs < band3[1]))]**2))))
        # Mean chroma over the beat segment, from the unfiltered signal.
        stft = abs(core.stft(y[int(framed_dbn[i - 1] * sr):int(framed_dbn[i] * sr)]))
        chroma = np.mean(feature.chroma_stft(y=None, S=stft ** 2), axis=1)
        chromas.append(chroma)
    chromas = np.array(chromas).transpose()
    band1list = np.array(band1list).transpose()
    band2list = np.array(band2list).transpose()
    band3list = np.array(band3list).transpose()
    return (chromas, np.vstack([band1list, band2list, band3list]))
def get_beat_sync_spectrums(audio):
    """
    Compute a beat-synchronous 3-energy-band spectrogram.

    :param audio: Path to the song
    :return: (3, n_beats) array with the RMS energy of bands
        0-220 Hz, 220-1760 Hz and 1760-sr/2 Hz per beat segment
    """
    y, sr = core.load(audio, sr=44100)
    # fix: the original called bare `EqualLoudness()`, which is a NameError —
    # the class lives in the `std` (essentia.standard) namespace.
    eql_y = std.EqualLoudness()(y)
    tempo, framed_dbn = self_tempo_estimation(y, sr)
    # fix: np.append returns a new array; the original discarded the result,
    # silently dropping the final beat-to-end segment.
    framed_dbn = np.append(framed_dbn, len(y) / sr)
    # Low / mid / high frequency bands, in Hz.
    band1 = (0, 220)
    band2 = (220, 1760)
    band3 = (1760, sr / 2)
    band1list = []
    band2list = []
    band3list = []
    for i in range(1, len(framed_dbn)):
        fft_eq = abs(np.fft.fft(eql_y[int(framed_dbn[i - 1] * sr):int(framed_dbn[i] * sr)]))
        freqs = np.fft.fftfreq(len(fft_eq), 1 / sr)
        band1list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band1[0], freqs < band1[1]))]**2))))
        band2list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band2[0], freqs < band2[1]))]**2))))
        band3list.append(np.sqrt(np.mean(sum(fft_eq[np.where(np.logical_and(freqs > band3[0], freqs < band3[1]))]**2))))
    band1list = np.array(band1list).transpose()
    band2list = np.array(band2list).transpose()
    band3list = np.array(band3list).transpose()
    return np.vstack([band1list, band2list, band3list])
def get_beat_sync_chroma(audio):
    """
    Compute a beat-synchronous chromagram.

    :param audio: Path to the audio file
    :return: (12, n_beats) chroma matrix, one column per beat segment
    """
    y, sr = core.load(audio, sr=44100)
    tempo, framed_dbn = self_tempo_estimation(y, sr)
    # fix: np.append returns a new array; the original discarded the result,
    # silently dropping the final beat-to-end segment.
    framed_dbn = np.append(framed_dbn, len(y) / sr)
    # Average the chroma over every beat-to-beat slice of the signal.
    chromas = []
    for i in range(1, len(framed_dbn)):
        stft = abs(core.stft(y[int(framed_dbn[i - 1] * sr):int(framed_dbn[i] * sr)]))
        chroma = np.mean(feature.chroma_stft(y=None, S=stft ** 2), axis=1)
        chromas.append(chroma)
    chromas = np.array(chromas).transpose()
    return chromas
def get_dbeat_sync_chroma(audio):
    """
    Compute downbeat-synchronous chroma and semitone spectrograms, and plot
    the waveform, the chroma and the semitone matrix.

    :param audio: Path to the audio file
    :return: Tuple ``(chromas, semitones, downbeats, tempo)``: a
        (12, n_bars) chroma matrix, the per-bar pitch spectrogram from
        ``hz_to_pitch``, the downbeat times in seconds, and the tempo
        estimated by ``self_tempo_estimation``
    """
    y, sr = core.load(audio, sr=44100)
    tempo, beats = self_tempo_estimation(y, sr)
    # (fix: the original called np.append(beats, ...) here and discarded the
    # result; `beats` is also overwritten just below, so the call was dead.)
    act = beatrnn()(audio)
    beats = downbeattrack(beats_per_bar=[4, 4], fps=100)(act)
    # madmom output: column 0 is time (s), column 1 the beat position in the
    # bar; position 1 marks a downbeat.
    downbeats = beats[beats[:, 1] == 1][:][:, 0]
    # Prepend t=0 so the first segment starts at the beginning of the song.
    framed_dbn = np.concatenate([np.array([0]), downbeats])
    # Calculate chroma and semitone spectrum per downbeat segment.
    semitones = []
    chromas = []
    for i in range(1, len(framed_dbn)):
        stft = abs(core.stft(y[int(framed_dbn[i - 1] * sr):int(framed_dbn[i] * sr)]))
        chroma = np.mean(feature.chroma_stft(y=None, S=stft ** 2), axis=1)
        semitone = np.mean(hz_to_pitch(stft, sr=sr), axis=1)
        chromas.append(chroma)
        semitones.append(semitone)
    chromas = np.array(chromas).transpose()
    semitones = np.array(semitones).transpose()
    # Plot the results and return the values.
    time = np.arange(len(y)) / sr
    fig, ax = plt.subplots(3, 1)
    ax[0].plot(time, y)
    ax[0].vlines(framed_dbn, -1, 1, colors='r', linestyles='dashdot')
    ax[0].set_xlim(framed_dbn[0], framed_dbn[-1])
    plt.sca(ax[1])
    plt.pcolor(framed_dbn, np.arange(13), chromas)
    plt.yticks(np.arange(13) + 0.5, ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
    plt.ylim(0, 12)
    plt.sca(ax[2])
    plt.pcolor(semitones)
    print(tempo)
    return chromas, semitones, downbeats, tempo
def gkern(kernlen=21, nsig=3):
    """Return a ``kernlen`` x ``kernlen`` 2-D Gaussian kernel summing to 1."""
    # Bin edges spanning +/- nsig standard deviations.
    edges = np.linspace(-nsig, nsig, kernlen + 1)
    # Per-bin mass: difference of the standard normal CDF across each edge pair.
    weights = np.diff(st.norm.cdf(edges))
    # Outer product gives the separable 2-D kernel; normalise to unit sum.
    kernel = np.outer(weights, weights)
    return kernel / np.sum(kernel)
def gcheckerboard(kernelen=64, nsig=32):
    """Return a 2-D Gaussian-tapered checkerboard kernel of side ``kernelen``."""
    # 2x2 sign pattern expanded to full size via a Kronecker product.
    signs = np.array([[-1, 1], [1, -1]])
    half = int(np.ceil(kernelen / 2))
    board = np.kron(signs, np.ones([half, half]))
    # Taper the checkerboard with a Gaussian so distant cells weigh less.
    return board * gkern(kernelen, nsig)
def slidekernelthroughdiagonal(kernel, matrix):
    """
    Slide ``kernel`` along the main diagonal of ``matrix`` and return, for
    each diagonal position, the sum of the element-wise product between the
    kernel and the zero-padded square window centred there.

    :param kernel: Square 2-D kernel (e.g. a Gaussian checkerboard)
    :param matrix: Square 2-D matrix (e.g. a self-similarity matrix)
    :return: 1-D array of length ``matrix.shape[0]`` with the response at
        every diagonal index
    """
    size_kernel = kernel.shape[0]
    size_matrix = matrix.shape[0]
    # Split the window asymmetrically around the diagonal index; the extra
    # row/column of an odd-sized kernel goes after the centre.
    # (fix: the original used size_kernel//2 on both sides, which made the
    # window one row/column smaller than the kernel for odd sizes and raised
    # a broadcast error; even kernel sizes behave exactly as before.)
    half_lo = size_kernel // 2
    half_hi = size_kernel - half_lo
    result = np.zeros([size_matrix])
    for i in range(size_matrix):
        # Zero padding needed where the window hangs over the matrix edge.
        padding_b = -min(i - half_lo, 0)
        padding_a = -min(size_matrix - (i + half_hi), 0)
        lo = max(0, i - half_lo)
        hi = min(size_matrix, i + half_hi)
        window = matrix[lo:hi, lo:hi]
        padded = np.pad(window, [(padding_b, padding_a), (padding_b, padding_a)])
        result[i] = np.sum(padded * kernel)
    return result
if __name__ == '__main__':
    # Usage: python segmentation.py <audio_file>
    if len(sys.argv) == 2:
        # fix: get_beat_sync_chroma returns a single chroma matrix, so the
        # original 4-way unpack raised a ValueError; the 4-tuple
        # (chromas, semitones, downbeats, tempo) comes from
        # get_dbeat_sync_chroma.
        chromas, semitones, downbeats, tempo = get_dbeat_sync_chroma(sys.argv[1])
        plt.figure()
        # Self-similarity matrix of the per-bar semitone vectors.
        ss_semitones = cdist(semitones.transpose(), semitones.transpose(), metric='euclidean')
        plt.pcolor(ss_semitones)
        plt.show()