-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaudio_embedding_extractor_mnist.py
131 lines (90 loc) · 4.77 KB
/
audio_embedding_extractor_mnist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from __future__ import division
from keras.models import Model, Sequential
from keras.layers import Input, Dense, multiply, Reshape, Flatten, Dropout, LeakyReLU, BatchNormalization, Conv2D, MaxPooling2D
import numpy as np
from utils import load_mnist
from audio_preprocessing_layer import spectrogram, logSpectrogram, logMelSpectrogram
from encoding_layer_ori import Encoding_layer
from keras.datasets import mnist, cifar10
from keras.utils import to_categorical
import tensorflow as tf
from keras.optimizers import Adam
class Extractor(object):
    """Train a CNN classifier on encoded MNIST digits and export its embeddings.

    Each image is passed through ``Encoding_layer`` and ``logMelSpectrogram``
    (both imported from project modules) before reaching a VGG-style stack of
    convolutions.  After training, the activations of the ``embeddings``
    dense layer are written to ``.npy`` files for the train and test splits.

    NOTE(review): the encoding layer is named 'vOICe' and the attributes
    below talk about audio, so the two custom layers presumably turn the
    image into a sound and then into a log-mel spectrogram -- confirm against
    ``encoding_layer_ori`` / ``audio_preprocessing_layer``.
    """

    def __init__(self):
        """Set the hyper-parameters and build the (uncompiled) classifier."""
        self.audio_emb_dim = 128  # width of the 'embeddings' layer
        self.batch_size = 100  # also baked into the model's input batch_shape
        self.img_rows = 28
        self.img_cols = 28
        self.channels = 1
        self.classes = 10

        # The attributes from here to ``img_shape`` are not read anywhere in
        # this class; they are kept because external code may rely on them.
        self.imagine_step = 50
        self.audio_sr = 44100  # audio sampling rate
        self.audio_duration = 1.05  # encoded audio duration
        # 2 * int(x / 2) rounds the sample count down to an even number.
        self.audio_len = 2 * int(0.5 * self.audio_sr * self.audio_duration)
        self.img_shape = (self.img_rows, self.img_cols, self.channels)

        # Positional Keras Adam arguments: learning rate, then beta_1.
        self.optimizer = Adam(0.0002, 0.5)
        self.audio_class_model = self.build_audio_C()

    def build_audio_C(self):
        """Build the image -> encoding -> spectrogram -> CNN classifier.

        Returns:
            A Keras ``Model`` mapping a batch of images of shape
            ``(batch_size, img_rows, img_cols, channels)`` to softmax scores
            over ``self.classes`` classes.  The model is not compiled here.
        """
        # The batch size is fixed in the input shape, so fit/evaluate/predict
        # must all be called with ``self.batch_size``.
        # NOTE(review): presumably Encoding_layer needs a static batch size,
        # and sample counts must be a multiple of it -- confirm.
        img_input = Input(
            batch_shape=(self.batch_size, self.img_rows, self.img_cols, self.channels),
            name='img_input')
        x = Encoding_layer(name='vOICe')(img_input)
        # The layer keeps its historical name 'logSpectrogram' even though it
        # is a logMelSpectrogram; renaming it would change the layer lookup key.
        spectro = logMelSpectrogram(name='logSpectrogram')(x)

        # (filters, conv layer names, pooling layer name) for blocks 1-4.
        # ReLU is applied inside each Conv2D.
        conv_blocks = (
            (64, ('conv1',), 'pool1'),
            (128, ('conv2',), 'pool2'),
            (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
            (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
        )
        x = spectro
        for filters, conv_names, pool_name in conv_blocks:
            for conv_name in conv_names:
                x = Conv2D(filters, (3, 3), strides=(1, 1), activation='relu',
                           padding='same', name=conv_name)(x)
            x = MaxPooling2D((2, 2), strides=(2, 2), padding='same',
                             name=pool_name)(x)

        x = Flatten(name='flatten_')(x)
        x = Dense(4096, activation='relu', name='fc1')(x)
        # train() looks this layer up by name to extract the embeddings.
        embeddings = Dense(self.audio_emb_dim, activation='relu', name='embeddings')(x)
        predicts = Dense(self.classes, activation='softmax', name='prediction')(embeddings)

        audio_model = Model(inputs=img_input, outputs=predicts)
        return audio_model

    @staticmethod
    def _save_array(path, array):
        """Save ``array`` to ``path`` with ``np.save``, creating the parent directory."""
        # Imported locally so this helper does not depend on a file-level import.
        import os

        directory = os.path.dirname(path)
        # np.save does not create missing directories; without this a fresh
        # checkout would fail only after the whole training run has finished.
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        np.save(path, array)

    def train(self, epochs=10):
        """Train the classifier on MNIST and dump the embeddings to disk.

        Loads MNIST through ``keras.datasets.mnist.load_data()``, scales the
        pixels to [0, 1], fits and evaluates the classifier, then writes the
        'embeddings' layer activations for both splits to
        ``audio_embeddings/audio_embedding_mnist_{train,test}_<epochs>.npy``.
        The trained model itself is not written to disk.

        Args:
            epochs: number of training epochs; also used in the file names.
        """
        # load the dataset
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        # Add the trailing channel axis and rescale uint8 pixels to [0, 1].
        x_train = np.expand_dims(x_train, 3) / 255.0
        x_test = np.expand_dims(x_test, 3) / 255.0

        # Pin the one-hot width to the softmax width instead of inferring it
        # from the largest label present in each split.
        y_train = to_categorical(y_train, self.classes)
        y_test = to_categorical(y_test, self.classes)

        print('training the audio net...')
        self.audio_class_model.compile(loss='categorical_crossentropy',
                                       optimizer=self.optimizer,
                                       metrics=['accuracy'])
        self.audio_class_model.fit(x_train, y_train,
                                   batch_size=self.batch_size, epochs=epochs)
        result = self.audio_class_model.evaluate(x_test, y_test,
                                                 batch_size=self.batch_size)
        print(result)

        # NOTE: persisting the model is currently disabled.  To re-enable it:
        #   self.audio_class_model.save('model/audio_embedding_net_mnist.h5')
        # The message below therefore no longer claims that a file was written.
        print('AudioNet model is trained (saving to disk is disabled).')

        print('Extracting audio embeddings...')
        # Reuse the trained graph up to the 'embeddings' layer (shared weights).
        audio_emb_extractor = Model(
            inputs=self.audio_class_model.input,
            outputs=self.audio_class_model.get_layer(name="embeddings").output)

        for split, images in (('train', x_train), ('test', x_test)):
            audio_embeddings = audio_emb_extractor.predict(
                images, batch_size=self.batch_size, verbose=True)
            name = 'audio_embeddings/audio_embedding_mnist_%s_%d.npy' % (split, epochs)
            self._save_array(name, audio_embeddings)
        print('Finished')
if __name__ == '__main__':
    # Script entry point: build the network, then train for 15 epochs and
    # export the embeddings.
    # print is used in its call form (as in the rest of the file); with a
    # single argument it prints the same text on Python 2 and is also valid
    # syntax on Python 3.
    print('building network ...')
    extractor = Extractor()
    extractor.train(epochs=15)
# C_Blind