-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfingerprint.py
206 lines (168 loc) · 8.23 KB
/
fingerprint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import librosa
import numpy as np
from collections import defaultdict
class FingerprintIdentifier:
"""
This class implements a music fingerprint recognition algorithm,
similar to the Shazam music fingerprint recognition algorithm.
For more details, refer to https://www.ee.columbia.edu/~dpwe/papers/Wang03-shazam.pdf,
which is a website of Columbia University.
In this class, we divide the algorithm into two steps:
1) Generate music fingerprints
2) Identify music fingerprints
args:
sr: Audio sampling rate
n_fft: FFT size for STFT, the window size for Short-Time Fourier Transform (STFT),
the larger the size, the higher the frequency resolution.
hop_length: Hop size for STFT, the step size for STFT, controlling the time resolution.
peak_threshold: dB threshold, the threshold for peak detection,
the energy in the spectrum must be higher than this value to be considered a valid peak.
peak_neighborhood: 2D peak detection neighborhood size, the neighborhood size for detecting 2D peaks,
ensuring the peak is a local maximum.
fan_value_frames: Number of peak frames to pair from the anchor,
the time range (in frames) for pairing each peak in fingerprint generation.
min_count: Minimum number of matching points for the result, determining whether the audio files match.
methods:
detect_peaks_2d: 2D peak detection, detecting peaks in the 2D spectrum.
build_fingerprint: Generate music fingerprints, pairing peaks to generate fingerprints.
identify: Identify music fingerprints, determining whether two audio files match.
For detailed instructions on this class, please refer to the fingerprint_manual.md or fingerprint_manual_en.md document.
"""
def __init__(self,sr=16000,n_fft=2048,hop_length=512,peak_threshold=-30.0,peak_neighborhood=3, fan_value_frames=5,min_count=8):
self.sr = sr
self.n_fft = n_fft
self.hop_length = hop_length
self.peak_threshold = peak_threshold
self.peak_neighborhood = peak_neighborhood
self.fan_value_frames = fan_value_frames
self.min_count = min_count
def detect_peaks_2d(self, S_db):
"""
Perform 2D peak detection on a spectrogram.
Args:
S_db (ndarray): 2D spectrogram in decibels, shape (freq_len, time_len).
Returns:
List[Tuple[int, int, float]]: List of peaks as (freq, time, magnitude).
"""
freq_len, time_len = S_db.shape
peaks = []
# Traverse the spectrogram to find peakss.
for t in range(time_len):
for f in range(freq_len):
# Skip points below the peak threshold
val = S_db[f, t]
if val < self.peak_threshold:
continue
# Define local neighborhood boundaries
fmin = max(0, f - self.peak_neighborhood)
fmax = min(freq_len, f + self.peak_neighborhood + 1)
tmin = max(0, t - self.peak_neighborhood)
tmax = min(time_len, t + self.peak_neighborhood + 1)
# Extract the local neighborhood around the point
local_patch = S_db[fmin:fmax, tmin:tmax]
# Check if the point is a local maximum
if val >= np.max(local_patch):
peaks.append((f, t, val))
# Return the peaks list.
return peaks
def build_fingerprint(self, peaks):
"""
Converts a list of peaks into a fingerprint structure.
The fingerprint structure:
- hashkey: (freqA, freqB, dt), representing two frequencies and the time difference.
- value: A list of time offsets (timeA) where this hashkey occurs.
Args:
peaks (List[Tuple[int, int, float]]): List of peaks, each represented as (freq, time, magnitude).
Returns:
defaultdict: A dictionary-like structure where keys are hashkeys, and values are lists of time offsets.
"""
# Sort peaks by time for consistent pair generation
peaks_sorted = sorted(peaks, key=lambda x:x[1]) # x[1] is the time index
n_peaks = len(peaks_sorted)
# Dictionary to store the fingerprint (hashkey -> list of offsets)
hash_dict = defaultdict(list)
# Use two variables to traverse the entire peaks_sorted list
# Initialize the second pointer
j = 0
for i in range(n_peaks):
freqA, timeA, valA = peaks_sorted[i]
# Ensure the second pointer starts after the first pointer
if j < i+1:
j = i+1
# Match the current peak with subsequent peaks within the fan-out limit
while j < n_peaks:
freqB, timeB, valB = peaks_sorted[j]
dt = timeB - timeA
# If the time difference exceeds the fan-out limit, break the loop
if dt > self.fan_value_frames:
break
# Only consider valid pairs where dt > 0
if dt> 0:
# Generate a hashkey using frequencies and time difference
hashkey = (freqA, freqB, dt)
# Store the time offset (timeA) for the hashkey
hash_dict[hashkey].append(timeA)
# Move the second pointer forward
j+=1
# Return the fingerprint dictionary
return hash_dict
def identify(self, ref_audio, sample_audio):
"""
Compare two audio files to determine if they match.
Steps:
1) Generate fingerprints for the reference audio.
2) Detect peaks and compare fingerprints with the sample audio.
3) Count matches and decide based on a minimum threshold.
Args:
ref_audio (ndarray): Reference audio signal.
sample_audio (ndarray): Sample audio signal to compare.
Returns:
Tuple[bool, int]: (is_match, best_count), where:
- is_match (bool): True if match is found, False otherwise.
- best_count (int): Highest number of matching fingerprints.
"""
# A) Generate fingerprints for the reference audio
# Compute STFT and convert to dB
D_ref = librosa.stft(ref_audio, n_fft=self.n_fft, hop_length=self.hop_length, center=False)
S_ref = np.abs(D_ref)
S_db_ref = librosa.amplitude_to_db(S_ref, ref=np.max)
# Detect peaks and build fingerprints
peaks_ref = self.detect_peaks_2d(S_db_ref)
ref_fp = self.build_fingerprint(peaks_ref)
# B) Process the sample audio
# Compute STFT and convert to dB
D_samp = librosa.stft(sample_audio, n_fft=self.n_fft, hop_length=self.hop_length, center=False)
S_samp = np.abs(D_samp)
S_db_samp = librosa.amplitude_to_db(S_samp, ref=np.max)
# Detect peaks in the sample audio
peaks_samp = self.detect_peaks_2d(S_db_samp)
# Compare sample peaks against reference fingerprints
offset_map = defaultdict(int)
peaks_samp_sorted = sorted(peaks_samp, key=lambda x:x[1])
n_samp_peaks= len(peaks_samp_sorted)
# Compare peaks using a two-pointer approach
j=0
for i in range(n_samp_peaks):
freqA, timeA, valA= peaks_samp_sorted[i]
if j< i+1:
j= i+1
while j< n_samp_peaks:
freqB, timeB, valB= peaks_samp_sorted[j]
dt= timeB - timeA
if dt> self.fan_value_frames:
break
if dt> 0:
hashkey= (freqA, freqB, dt)
sample_offset= timeA
if hashkey in ref_fp:
ref_offsets = ref_fp[hashkey]
for ref_offset in ref_offsets:
offset_diff= ref_offset- sample_offset
offset_map[offset_diff]+=1
j+=1
# Determine the best match from the offset histogram
if not offset_map:
return (False,0)
best_off, best_count= max(offset_map.items(), key=lambda x:x[1])
is_match= (best_count>= self.min_count)
return (is_match, best_count)